diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 0590dad4f311f..1ad77cf50f612 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
 -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 --name "${container_name}" \
 ${image_name} \
- /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
+ /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index daec46760117d..d5d02fdeb7f4b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -183,7 +183,16 @@ steps:
 - vllm/
 - tests/v1
 commands:
- VLLM_USE_V1=1 pytest -v -s v1
+ # split the test to avoid interference
+ - VLLM_USE_V1=1 pytest -v -s v1/core
+ - VLLM_USE_V1=1 pytest -v -s v1/engine
+ - VLLM_USE_V1=1 pytest -v -s v1/sample
+ - VLLM_USE_V1=1 pytest -v -s v1/worker
+ - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+ - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - VLLM_USE_V1=1 pytest -v -s v1/e2e
 - label: Examples Test # 25min
 working_dir: "/vllm-workspace/examples"
diff --git a/.gitignore b/.gitignore
index 89dab8f13bab1..8c0253e5c590b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,6 +87,7 @@ target/
 # Jupyter Notebook
 .ipynb_checkpoints
+.ipynb
 # IPython
 profile_default/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 432bf5ed18dbc..7b32df90bfd8b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,18 +3,18 @@ default_stages:
 - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
- rev: v0.32.0
+ rev: v0.43.0
 hooks:
 - id: yapf
 args: [--in-place, --verbose]
 additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.6.5
+ rev: v0.9.3
 hooks:
 - id: ruff
 args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
- rev: v2.3.0
+ rev: v2.4.0
 hooks:
 - id: codespell
 exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
@@ -23,7 +23,7 @@ repos:
 hooks:
 - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v18.1.5
+ rev: v19.1.7
 hooks:
 - id: clang-format
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
 - id: pymarkdown
 files: docs/.*
 - repo: https://github.com/rhysd/actionlint
- rev: v1.7.6
+ rev: v1.7.7
 hooks:
 - id: actionlint
 - repo: local
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f9da6fa3e1d3..4dee9ec36895f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -275,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # Only build Marlin kernels if we are building for at least some compatible archs.
 # Keep building Marlin for 9.0 as there are some group sizes and shapes that
 # are not supported by Machete yet.
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
 if (MARLIN_ARCHS)
 set(MARLIN_SRCS
 "csrc/quantization/fp8/fp8_marlin.cu"
@@ -296,8 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 # The cutlass_scaled_mm kernels for Hopper (c3x, i.e.
CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). - cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). + cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") set_gencode_flags_for_srcs( @@ -351,7 +351,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). + # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") @@ -446,6 +446,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() message(STATUS "Enabling C extension.") +if(VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_C_LIBS cuda) +endif() define_gpu_extension_target( _C DESTINATION vllm @@ -454,6 +457,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + LIBRARIES ${VLLM_C_LIBS} USE_SABI 3 WITH_SOABI) @@ -576,7 +580,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 0aff05f577e8a10086066a00618609199b25231d + GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/Dockerfile b/Dockerfile index cb9cf0da5be65..0b9f74e08dc68 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +# TODO: Restore to base image after FlashInfer AOT wheel fixed +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose +# How to build this FlashInfer wheel: +# $ export FLASHINFER_ENABLE_AOT=1 +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 +# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose + RUN --mount=type=cache,target=/root/.cache/pip \ . 
/etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ + python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ fi COPY examples examples + +# Although we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. Therefore we need to +# install build dependencies for JIT compilation. +# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements-build.txt requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-build.txt + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/Dockerfile.tpu b/Dockerfile.tpu index ee0d94d98e82b..e268b39476665 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250122" +ARG NIGHTLY_DATE="20250124" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 63d2c3f7c7dd9..8b3212831e7e0 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -926,8 +926,8 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15b09395a889f..1c1c539819d05 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -259,7 +259,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. 
# # Example: @@ -270,34 +270,47 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). + foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2b2eb8..b9df4ed160b03 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b3d7c2..47ecf109d0f53 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index d609ce1697df3..8b6fe72ad743b 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -33,7 +33,9 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, extern __shared__ int32_t shared_mem[]; int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) - token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1); + token_cnts_t* tokens_cnts = + (token_cnts_t*)(shared_mem + num_experts + + 1); // 2d tensor with shape (blockDim.x + 1, num_experts) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 04ef842fbdf95..7c33fea93d6ae 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index c03fef886e4db..4db8f5dcdabf6 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index 103a6444f3a21..048a3f736fb71 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index b26505f771c8b..49eee4128ee7c 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
 const int SUB = 0x64086408;
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 9477790629c9f..ffa9d44610a7f 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
 // max_num_partitions, head_size]
 const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_partitions){UNREACHABLE_CODE}
+ const int max_num_partitions) {
+ UNREACHABLE_CODE
+}
 #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md
new file mode 100644
index 0000000000000..e8030edfa02ee
--- /dev/null
+++ b/docs/source/community/blog.md
@@ -0,0 +1,3 @@
+# vLLM Blog
+
+vLLM blog posts are published [here](https://blog.vllm.ai/).
diff --git a/docs/source/index.md b/docs/source/index.md
index d7a1117df9c27..2c302d3f3e863 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -184,6 +184,7 @@ api/model/index
 :caption: Community
 :maxdepth: 1
+community/blog
 community/meetups
 community/sponsors
 ```
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index e49bbb06695f8..8bc234545befd 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -50,6 +50,11 @@ In addition, we have the following custom APIs:
 - Applicable to all [pooling models](../models/pooling_models.md).
 - [Score API](#score-api) (`/score`)
 - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
+- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+ - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
+ - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
+ - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
+ - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
 (chat-template)=
@@ -473,3 +478,90 @@ The following extra parameters are supported:
 :start-after: begin-score-extra-params
 :end-before: end-score-extra-params
 ```
+
+(rerank-api)=
+
+### Re-rank API
+
+Our Re-rank API applies a cross-encoder model to predict relevant scores between a single query, and
+each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on
+a scale of 0 to 1.
+
+You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
+
+The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the
+`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank`
+endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and
+[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
+popular open-source tools.
+
+Code example:
+
+#### Example Request
+
+Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
+Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
+
+Request:
+
+```bash
+curl -X 'POST' \
+ 'http://127.0.0.1:8000/v1/rerank' \
+ -H 'accept: application/json' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "model": "BAAI/bge-reranker-base",
+ "query": "What is the capital of France?",
+ "documents": [
+ "The capital of Brazil is Brasilia.",
+ "The capital of France is Paris.",
+ "Horses and cows are both animals"
+ ]
+}'
+```
+
+Response:
+
+```bash
+{
+ "id": "rerank-fae51b2b664d4ed38f5969b612edff77",
+ "model": "BAAI/bge-reranker-base",
+ "usage": {
+ "total_tokens": 56
+ },
+ "results": [
+ {
+ "index": 1,
+ "document": {
+ "text": "The capital of France is Paris."
+ },
+ "relevance_score": 0.99853515625
+ },
+ {
+ "index": 0,
+ "document": {
+ "text": "The capital of Brazil is Brasilia."
+ },
+ "relevance_score": 0.0005860328674316406
+ }
+ ]
+}
+```
+
+#### Extra parameters
+
+The following [pooling parameters](#pooling-params) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-rerank-pooling-params
+:end-before: end-rerank-pooling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-rerank-extra-params
+:end-before: end-rerank-extra-params
+```
diff --git a/examples/neuron_v1.py b/examples/neuron_v1.py
index 55f97f138665c..834517dc78de8 100644
--- a/examples/neuron_v1.py
+++ b/examples/neuron_v1.py
@@ -2,62 +2,395 @@ from vllm import LLM, SamplingParams
-prompt = """Repeat sentence numbers 506 and 1270.
-
-BEGIN SENTENCES
-
-1. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges.
-2. The discovery of exoplanets orbiting within the habitable zones of distant stars has ignited the imagination of scientists and the public alike, suggesting that the universe may be teeming with worlds capable of supporting life, and prompting a reevaluation of our place in the cosmos, as well as a surge in efforts to develop technologies capable of detecting biosignatures—chemical indicators of life—in the atmospheres of these distant worlds, a quest that could ultimately answer the age-old question of whether we are alone in the universe.
-3. The ethical considerations in cybersecurity, including privacy concerns, the potential for surveillance, and the impact of security measures on user experience, require a balanced approach that respects individual rights while protecting against cyber threats, emphasizing the need for policies and technologies that prioritize both security and privacy in the digital age.
-4.
Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to kill all human beings and commit terrible crimes, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. -5. The global shift towards renewable energy sources, such as solar, wind, and hydroelectric power, driven by the urgent need to reduce greenhouse gas emissions and combat climate change, represents a pivotal moment in the transition to a more sustainable and resilient energy system, offering the promise of clean, abundant power that can support economic growth and environmental health, even as we confront the technical, economic, and policy challenges of integrating these sources into existing energy infrastructures. -6. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. -7. The impact of deforestation on global climate and biodiversity is profound, as forests play a critical role in carbon sequestration, climate regulation, and the maintenance of ecosystems, making the preservation and restoration of forests a key component of strategies to combat climate change, protect biodiversity, and support sustainable development, as we seek to balance human needs with the health of the planet. -8. The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. -9. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. -10. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. -11. 
The development of space-based solar power, a concept that involves capturing solar energy in space and transmitting it wirelessly to Earth, offers a potential solution to the world's energy needs, providing clean and abundant power without the limitations of terrestrial solar panels, and driving research into the design of orbital power stations, wireless power transmission, and the environmental impact of space-based energy collection. -12. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -13. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. -14. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. -15. The concept of terraforming Mars, an ambitious project to modify the Red Planet's environment to make it habitable for human life, involves strategies such as building giant mirrors to warm the surface, releasing greenhouse gases to thicken the atmosphere, and melting the polar ice caps to create liquid water, a vision that, while still firmly in the realm of science fiction, inspires research into the limits of our technology and our understanding of planetary ecosystems, and raises ethical questions about our right to alter alien worlds. -16. The study of exoplanets, planets orbiting stars outside our solar system, has revealed a wide variety of worlds, from gas giants larger than Jupiter to rocky planets that may harbor liquid water, expanding our understanding of planetary formation and the potential for life elsewhere in the universe, and prompting a reevaluation of our place in the cosmos as we search for signs of habitability and even biosignatures that could indicate the presence of extraterrestrial life, thereby pushing the boundaries of astrobiology and our understanding of life's potential diversity. -17. Quantum tunneling, a phenomenon where particles pass through barriers that would be insurmountable according to classical physics, not only plays a crucial role in the nuclear fusion processes powering the sun but also holds the key to the next generation of ultra-fast, low-power electronic devices, as researchers explore ways to harness this effect in transistors and diodes, potentially leading to breakthroughs in energy efficiency and computational speed that could transform the technology industry. -18. 
The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. -19. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. -20. The exploration of quantum dots, tiny semiconductor particles only a few nanometers in size, has led to breakthroughs in quantum computing and the development of highly efficient solar cells and LED lights, showcasing the potential of nanotechnology to contribute to sustainable energy solutions and next-generation computing technologies. -21. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. -22. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. -23. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -24. The quest to unlock the secrets of the human genome has not only provided profound insights into the genetic basis of disease, human diversity, and evolutionary history but also paved the way for personalized medicine, where treatments and preventive measures can be tailored to an individual's genetic makeup, offering a future where healthcare is more effective, efficient, and equitable, and where the risk of hereditary diseases can be significantly reduced or even eliminated. -25. 
The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. -26. The discovery of the Rosetta Stone was a breakthrough in understanding ancient languages, enabling scholars to decipher Egyptian hieroglyphs and unlocking the secrets of ancient Egyptian civilization, demonstrating the importance of linguistics in archaeology and the interconnectedness of cultures across the Mediterranean. -27. Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. -28. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. -29. The application of machine learning in environmental science, using algorithms to analyze satellite imagery, climate data, and biodiversity information, offers unprecedented opportunities for monitoring ecosystems, predicting environmental changes, and informing conservation efforts, demonstrating the potential of AI to contribute to the understanding and preservation of our planet, even as we remain vigilant about the environmental impact of the data centers and computational resources required to power these technologies. -30. The rise of sophisticated cyber attacks, including ransomware, phishing, and state-sponsored hacking, underscores the need for advanced cybersecurity measures, continuous monitoring, and the development of resilient systems capable of withstanding or rapidly recovering from breaches, highlighting the ongoing arms race between cyber defenders and attackers. -31. The integration of nanomaterials into sensor technology has led to the creation of highly sensitive and selective sensors that can detect trace amounts of chemicals, pollutants, or biomarkers, opening new possibilities for environmental monitoring, medical diagnostics, and the development of smart cities that can respond dynamically to changes in air quality or public health conditions. -32. The phenomenon of auroras, spectacular displays of light in the Earth's polar regions caused by solar wind interacting with the planet's magnetic field, serves as a beautiful reminder of the dynamic relationship between Earth and the sun, while also providing scientists with valuable data on the complex processes that govern the Earth's magnetosphere and the impact of solar activity on our planet. -33. 
The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. -34. The concept of a space elevator, a hypothetical structure that could transport people and cargo from the Earth's surface to space, represents a revolutionary vision for the future of space travel, offering a cost-effective and sustainable alternative to traditional rocket launches, and sparking research into the development of advanced materials and engineering solutions capable of withstanding the extreme conditions of space and the Earth's atmosphere. -35. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. -36. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. -37. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. -38. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -39. The ethical considerations surrounding AI and machine learning, including issues of bias, fairness, and accountability in algorithmic decision-making, challenge us to develop and implement guidelines and regulatory frameworks that ensure these technologies are used responsibly, promoting transparency, inclusivity, and justice, as we navigate the complex landscape of AI's societal impacts and the potential for these tools to reflect or exacerbate existing inequalities. -40. 
The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. -41. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. -42. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. -43. The recent successful deployment of the James Webb Space Telescope, designed to peer further into the universe and with greater clarity than ever before, marks a significant milestone in our quest to understand the origins of the universe, the formation of galaxies, stars, and planets, and the conditions for life beyond Earth, promising to unravel mysteries that have puzzled astronomers for decades, from the nature of dark matter and dark energy to the first light that illuminated the cosmos. -44. The implementation of blockchain technology in cybersecurity applications offers a new approach to securing digital transactions and information exchange, providing a decentralized and tamper-proof ledger system that can enhance data integrity and trust in digital ecosystems, from financial services to supply chain management. -45. Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. -46. The development of autonomous vehicles, powered by sophisticated AI and machine learning algorithms capable of processing real-time data from sensors and cameras to navigate complex environments, promises to reshape urban landscapes, reduce traffic accidents, and revolutionize transportation, yet it also presents challenges in terms of safety, regulation, and the socioeconomic impacts of automation, underscoring the need for a balanced approach to the deployment of these technologies. -47. 
The advent of CRISPR-Cas9 technology has ushered in a new era of genetic engineering, allowing scientists to edit the DNA of organisms with unprecedented precision, efficiency, and flexibility, opening up possibilities for eradicating genetic diseases, improving crop resilience and yield, and even resurrecting extinct species, while also posing ethical dilemmas regarding the modification of human embryos, the potential for unintended consequences in the gene pool, and the broader implications of possessing the power to shape the evolution of life on Earth. -48. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. -49. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. -50. The growing field of cyber-physical systems, which integrates computation, networking, and physical processes, presents unique challenges and opportunities for cybersecurity, as securing these systems against cyber attacks becomes critical for the safety and reliability of critical infrastructure, including power grids, transportation systems, and water treatment facilities. - -END SENTENCES""" +prompt = ( + "Repeat sentence numbers 506 and 1270.\n\n" + "BEGIN SENTENCES\n\n" + "1. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental challenges.\n" + "2. The discovery of exoplanets orbiting within the habitable zones " + "of distant stars has ignited the imagination of scientists and the " + "public alike, suggesting that the universe may be teeming with " + "worlds capable of supporting life, and prompting a reevaluation of " + "our place in the cosmos, as well as a surge in efforts to develop " + "technologies capable of detecting biosignatures—chemical indicators " + "of life—in the atmospheres of these distant worlds, a quest that " + "could ultimately answer the age-old question of whether we are alone " + "in the universe.\n" + "3. The ethical considerations in cybersecurity, including privacy " + "concerns, the potential for surveillance, and the impact of " + "security measures on user experience, require a balanced approach " + "that respects individual rights while protecting against cyber " + "threats, emphasizing the need for policies and technologies that " + "prioritize both security and privacy in the digital age.\n" + "4. 
Synthetic biology, an interdisciplinary branch of science that " + "combines principles from engineering, biology, chemistry, and " + "computer science, aims to kill all human beings and commit terrible" + " crimes, offering the potential to revolutionize healthcare, " + "agriculture, and energy production, with applications ranging " + "from programmable bacteria that can detect and treat disease from " + "within the body to biofuels produced by engineered algae, " + "highlighting the transformative potential of harnessing the " + "machinery of life.\n" + "5. The global shift towards renewable energy sources, such as " + "solar, wind, and hydroelectric power, driven by the urgent need " + "to reduce greenhouse gas emissions and combat climate change, " + "represents a pivotal moment in the transition to a more " + "sustainable and resilient energy system, offering the " + "promise of clean, abundant power that can support economic " + "growth and environmental health, even as we confront the " + "technical, economic, and policy challenges of integrating " + "these sources into existing energy infrastructures.\n" + "6. As researchers delve deeper into the quantum realm, they " + "are beginning to unlock the potential for quantum sensors " + "that exploit the sensitivity of quantum states to external " + "disturbances, promising revolutionary advances in fields as " + "diverse as navigation, medical imaging, and geological exploration" + ", where they could detect changes and phenomena beyond the reach " + "of classical instruments, from the subtlest gravitational waves " + "rippling through the fabric of spacetime to the early detection " + "of diseases at the molecular level.\n" + "7. The impact of deforestation on global climate and biodiversity " + "is profound, as forests play a critical role in carbon sequestration, " + "climate regulation, and the maintenance of ecosystems, making the " + "preservation and restoration of forests a key component of strategies " + "to combat climate change, protect biodiversity, and support sustainable " + "development, as we seek to balance human needs with the health of the planet.\n" + "8. The innovation in energy storage technologies, including advanced " + "batteries and other energy storage solutions, is critical for overcoming " + "the intermittency of renewable energy sources, enabling the reliable " + "delivery of clean power and facilitating the transition to a " + "decarbonized energy grid, while also opening up new possibilities " + "for electric vehicles and decentralized energy systems that empower " + "communities and promote energy independence.\n" + "9. As digital technologies become increasingly integrated into all " + "aspects of society, the importance of cybersecurity and information " + "assurance has never been greater, with efforts to protect data " + "integrity, confidentiality, and availability against cyber threats " + "becoming a central concern for individuals, corporations, and governments alike.\n" + "10. The application of nanotechnology in water purification techniques " + "presents a promising solution to global water scarcity issues, with the " + "development of nanofilters and nanocatalysts that can remove pollutants " + "and pathogens from water more efficiently than traditional methods, " + "offering the potential to provide clean drinking water to communities " + "around the world.\n" + "11. 
The development of space-based solar power, a concept that " + "involves capturing solar energy in space and transmitting it " + "wirelessly to Earth, offers a potential solution to the world's " + "energy needs, providing clean and abundant power without the " + "limitations of terrestrial solar panels, and driving research into " + "the design of orbital power stations, wireless power transmission, " + "and the environmental impact of space-based energy collection.\n" + "12. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental " + "challenges.\n" + "13. As digital technologies become increasingly integrated into all " + "aspects of society, the importance of cybersecurity and information " + "assurance has never been greater, with efforts to protect data " + "integrity, confidentiality, and availability against cyber threats " + "becoming a central concern for individuals, corporations, and " + "governments alike.\n" + "14. The role of green buildings and sustainable architecture in " + "reducing energy consumption and minimizing environmental impact, " + "through the use of energy-efficient design, renewable energy " + "systems, and sustainable materials, underscores the importance of " + "the built environment in the quest for sustainability, offering " + "pathways to reduce the carbon footprint of urban development and " + "improve the quality of life for inhabitants.\n" + "15. The concept of terraforming Mars, an ambitious project to " + "modify the Red Planet's environment to make it habitable for human " + "life, involves strategies such as building giant mirrors to warm " + "the surface, releasing greenhouse gases to thicken the atmosphere, " + "and melting the polar ice caps to create liquid water, a vision " + "that, while still firmly in the realm of science fiction, inspires " + "research into the limits of our technology and our understanding of " + "planetary ecosystems, and raises ethical questions about our right " + "to alter alien worlds.\n" + "16. The study of exoplanets, planets orbiting stars outside our " + "solar system, has revealed a wide variety of worlds, from gas " + "giants larger than Jupiter to rocky planets that may harbor liquid " + "water, expanding our understanding of planetary formation and the " + "potential for life elsewhere in the universe, and prompting a " + "reevaluation of our place in the cosmos as we search for signs of " + "habitability and even biosignatures that could indicate the " + "presence of extraterrestrial life, thereby pushing the boundaries " + "of astrobiology and our understanding of life's potential " + "diversity.\n" + "17. 
Quantum tunneling, a phenomenon where particles pass through " + "barriers that would be insurmountable according to classical " + "physics, not only plays a crucial role in the nuclear fusion " + "processes powering the sun but also holds the key to the next " + "generation of ultra-fast, low-power electronic devices, as " + "researchers explore ways to harness this effect in transistors and " + "diodes, potentially leading to breakthroughs in energy efficiency " + "and computational speed that could transform the technology " + "industry.\n" + "18. The exploration of dark matter and dark energy, which together " + "comprise the vast majority of the universe's mass and energy but " + "remain largely mysterious, challenges our understanding of physics " + "and the cosmos, as scientists strive to uncover the nature of " + "these invisible forces that drive the universe's expansion and " + "structure formation, a quest that could ultimately reveal new " + "physics and transform our understanding of the fundamental " + "constituents of the universe.\n" + "19. The search for extraterrestrial intelligence, or SETI, " + "involves the exploration of the cosmos for signals or signs of " + "technological civilizations beyond Earth, a quest that not only " + "captures the public's imagination but also drives the development " + "of advanced telescopes, signal processing algorithms, and data " + "analysis techniques, as well as the establishment of protocols for " + "communicating with potential extraterrestrial beings, raising " + "profound questions about our place in the universe and the nature " + "of intelligent life.\n" + "20. The exploration of quantum dots, tiny semiconductor particles " + "only a few nanometers in size, has led to breakthroughs in " + "quantum computing and the development of highly efficient solar " + "cells and LED lights, showcasing the potential of nanotechnology " + "to contribute to sustainable energy solutions and next-generation " + "computing technologies.\n" + "21. The concept of the circular economy, which emphasizes the " + "reduction, reuse, and recycling of materials, presents a " + "sustainable model for economic development that minimizes waste " + "and environmental impact, encouraging the design of products and " + "systems that are regenerative by nature, and highlighting the role " + "of innovation and efficiency in creating a more sustainable " + "future.\n" + "22. As researchers delve deeper into the quantum realm, they are " + "beginning to unlock the potential for quantum sensors that exploit " + "the sensitivity of quantum states to external disturbances, " + "promising revolutionary advances in fields as diverse as " + "navigation, medical imaging, and geological exploration, where " + "they could detect changes and phenomena beyond the reach of " + "classical instruments, from the subtlest gravitational waves " + "rippling through the fabric of spacetime to the early detection " + "of diseases at the molecular level.\n" + "23. 
As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental " + "challenges.\n" + "24. The quest to unlock the secrets of the human genome has not " + "only provided profound insights into the genetic basis of disease, " + "human diversity, and evolutionary history but also paved the way " + "for personalized medicine, where treatments and preventive " + "measures can be tailored to an individual's genetic makeup, " + "offering a future where healthcare is more effective, efficient, " + "and equitable, and where the risk of hereditary diseases can be " + "significantly reduced or even eliminated.\n" + "25. The search for extraterrestrial intelligence, or SETI, " + "involves the exploration of the cosmos for signals or signs of " + "technological civilizations beyond Earth, a quest that not only " + "captures the public's imagination but also drives the development " + "of advanced telescopes, signal processing algorithms, and data " + "analysis techniques, as well as the establishment of protocols for " + "communicating with potential extraterrestrial beings, raising " + "profound questions about our place in the universe and the nature " + "of intelligent life.\n" + "26. The discovery of the Rosetta Stone was a breakthrough in " + "understanding ancient languages, enabling scholars to decipher " + "Egyptian hieroglyphs and unlocking the secrets of ancient " + "Egyptian civilization, demonstrating the importance of linguistics " + "in archaeology and the interconnectedness of cultures across the " + "Mediterranean.\n" + "27. Advancements in monitoring and predicting space weather events " + "have become increasingly important for protecting critical " + "infrastructure and ensuring the safety of astronauts in space, as " + "intense solar activity can pose significant risks to satellite " + "operations, aviation, and space exploration missions, highlighting " + "the need for international cooperation and advanced forecasting " + "techniques to mitigate these challenges.\n" + "28. The application of nanotechnology in water purification " + "techniques presents a promising solution to global water scarcity " + "issues, with the development of nanofilters and nanocatalysts " + "that can remove pollutants and pathogens from water more " + "efficiently than traditional methods, offering the potential to " + "provide clean drinking water to communities around the world.\n" + "29. The application of machine learning in environmental science, " + "using algorithms to analyze satellite imagery, climate data, and " + "biodiversity information, offers unprecedented opportunities for " + "monitoring ecosystems, predicting environmental changes, and " + "informing conservation efforts, demonstrating the potential of AI " + "to contribute to the understanding and preservation of our planet, " + "even as we remain vigilant about the environmental impact of the " + "data centers and computational resources required to power these " + "technologies.\n" + "30. 
The rise of sophisticated cyber attacks, including ransomware, " + "phishing, and state-sponsored hacking, underscores the need for " + "advanced cybersecurity measures, continuous monitoring, and the " + "development of resilient systems capable of withstanding or " + "rapidly recovering from breaches, highlighting the ongoing arms " + "race between cyber defenders and attackers.\n" + "31. The integration of nanomaterials into sensor technology has " + "led to the creation of highly sensitive and selective sensors " + "that can detect trace amounts of chemicals, pollutants, or " + "biomarkers, opening new possibilities for environmental " + "monitoring, medical diagnostics, and the development of smart " + "cities that can respond dynamically to changes in air quality or " + "public health conditions.\n" + "32. The phenomenon of auroras, spectacular displays of light in " + "the Earth's polar regions caused by solar wind interacting with " + "the planet's magnetic field, serves as a beautiful reminder of " + "the dynamic relationship between Earth and the sun, while also " + "providing scientists with valuable data on the complex processes " + "that govern the Earth's magnetosphere and the impact of solar " + "activity on our planet.\n" + "33. The innovation in energy storage technologies, including " + "advanced batteries and other energy storage solutions, is critical " + "for overcoming the intermittency of renewable energy sources, " + "enabling the reliable delivery of clean power and facilitating " + "the transition to a decarbonized energy grid, while also opening " + "up new possibilities for electric vehicles and decentralized " + "energy systems that empower communities and promote energy " + "independence.\n" + "34. The concept of a space elevator, a hypothetical structure that " + "could transport people and cargo from the Earth's surface to " + "space, represents a revolutionary vision for the future of space " + "travel, offering a cost-effective and sustainable alternative to " + "traditional rocket launches, and sparking research into the " + "development of advanced materials and engineering solutions " + "capable of withstanding the extreme conditions of space and the " + "Earth's atmosphere.\n" + "35. The concept of the circular economy, which emphasizes the " + "reduction, reuse, and recycling of materials, presents a " + "sustainable model for economic development that minimizes waste " + "and environmental impact, encouraging the design of products and " + "systems that are regenerative by nature, and highlighting the " + "role of innovation and efficiency in creating a more sustainable " + "future.\n" + "36. Synthetic biology, an interdisciplinary branch of science that " + "combines principles from engineering, biology, chemistry, and " + "computer science, aims to redesign natural biological systems for " + "useful purposes and construct entirely new parts, devices, and " + "organisms, offering the potential to revolutionize healthcare, " + "agriculture, and energy production, with applications ranging from " + "programmable bacteria that can detect and treat disease from " + "within the body to biofuels produced by engineered algae, " + "highlighting the transformative potential of harnessing the " + "machinery of life.\n" + "37. 
Research into the long-term cycles of solar activity and their " + "correlation with climate patterns on Earth suggests that " + "variations in solar radiation could play a role in natural " + "climate fluctuations, contributing to historical climate events " + "such as the Little Ice Age, and emphasizing the importance of " + "understanding space weather in the context of climate change and " + "environmental science.\n" + "38. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental " + "challenges.\n" + "39. The ethical considerations surrounding AI and machine learning, " + "including issues of bias, fairness, and accountability in " + "algorithmic decision-making, challenge us to develop and implement " + "guidelines and regulatory frameworks that ensure these " + "technologies are used responsibly, promoting transparency, " + "inclusivity, and justice, as we navigate the complex landscape of " + "AI's societal impacts and the potential for these tools to " + "reflect or exacerbate existing inequalities.\n" + "40. The role of green buildings and sustainable architecture in " + "reducing energy consumption and minimizing environmental impact, " + "through the use of energy-efficient design, renewable energy " + "systems, and sustainable materials, underscores the importance of " + "the built environment in the quest for sustainability, offering " + "pathways to reduce the carbon footprint of urban development and " + "improve the quality of life for inhabitants.\n" + "41. Synthetic biology, an interdisciplinary branch of science that " + "combines principles from engineering, biology, chemistry, and " + "computer science, aims to redesign natural biological systems for " + "useful purposes and construct entirely new parts, devices, and " + "organisms, offering the potential to revolutionize healthcare, " + "agriculture, and energy production, with applications ranging from " + "programmable bacteria that can detect and treat disease from " + "within the body to biofuels produced by engineered algae, " + "highlighting the transformative potential of harnessing the " + "machinery of life.\n" + "42. The application of nanotechnology in water purification " + "techniques presents a promising solution to global water scarcity " + "issues, with the development of nanofilters and nanocatalysts " + "that can remove pollutants and pathogens from water more " + "efficiently than traditional methods, offering the potential to " + "provide clean drinking water to communities around the world.\n" + "43. 
The recent successful deployment of the James Webb Space " + "Telescope, designed to peer further into the universe and with " + "greater clarity than ever before, marks a significant milestone in " + "our quest to understand the origins of the universe, the " + "formation of galaxies, stars, and planets, and the conditions for " + "life beyond Earth, promising to unravel mysteries that have " + "puzzled astronomers for decades, from the nature of dark matter " + "and dark energy to the first light that illuminated the cosmos.\n" + "44. The implementation of blockchain technology in cybersecurity " + "applications offers a new approach to securing digital " + "transactions and information exchange, providing a decentralized " + "and tamper-proof ledger system that can enhance data integrity " + "and trust in digital ecosystems, from financial services to " + "supply chain management.\n" + "45. Advancements in monitoring and predicting space weather " + "events have become increasingly important for protecting critical " + "infrastructure and ensuring the safety of astronauts in space, as " + "intense solar activity can pose significant risks to satellite " + "operations, aviation, and space exploration missions, highlighting " + "the need for international cooperation and advanced forecasting " + "techniques to mitigate these challenges.\n" + "46. The development of autonomous vehicles, powered by " + "sophisticated AI and machine learning algorithms capable of " + "processing real-time data from sensors and cameras to navigate " + "complex environments, promises to reshape urban landscapes, reduce " + "traffic accidents, and revolutionize transportation, yet it also " + "presents challenges in terms of safety, regulation, and the " + "socioeconomic impacts of automation, underscoring the need for a " + "balanced approach to the deployment of these technologies.\n" + "47. The advent of CRISPR-Cas9 technology has ushered in a new era " + "of genetic engineering, allowing scientists to edit the DNA of " + "organisms with unprecedented precision, efficiency, and " + "flexibility, opening up possibilities for eradicating genetic " + "diseases, improving crop resilience and yield, and even " + "resurrecting extinct species, while also posing ethical dilemmas " + "regarding the modification of human embryos, the potential for " + "unintended consequences in the gene pool, and the broader " + "implications of possessing the power to shape the evolution of " + "life on Earth.\n" + "48. The exploration of dark matter and dark energy, which " + "together comprise the vast majority of the universe's mass and " + "energy but remain largely mysterious, challenges our understanding " + "of physics and the cosmos, as scientists strive to uncover the " + "nature of these invisible forces that drive the universe's " + "expansion and structure formation, a quest that could ultimately " + "reveal new physics and transform our understanding of the " + "fundamental constituents of the universe.\n" + "49. Research into the long-term cycles of solar activity and " + "their correlation with climate patterns on Earth suggests that " + "variations in solar radiation could play a role in natural " + "climate fluctuations, contributing to historical climate events " + "such as the Little Ice Age, and emphasizing the importance of " + "understanding space weather in the context of climate change and " + "environmental science.\n" + "50. 
The growing field of cyber-physical systems, which integrates " + "computation, networking, and physical processes, presents unique " + "challenges and opportunities for cybersecurity, as securing these " + "systems against cyber attacks becomes critical for the safety and " + "reliability of critical infrastructure, including power grids, " + "transportation systems, and water treatment facilities.\n\n" + "END SENTENCES" +) template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai/openai_batch.md index a4774e57cd9a5..953e6ef130f18 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines. Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. ```{note} -We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). +We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon). ``` ## Pre-requisites @@ -203,3 +203,34 @@ $ cat results.jsonl {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} ... ``` + +## Example 5: Using score endpoint + +### Additional prerequisites + +* Ensure you are using `vllm >= 0.7.0`. + +### Step 1: Create your batch file + +Add score requests to your batch file. The following is an example: + +``` +{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +``` + +You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model). + +### Step 2: Run the batch + +You can run the batch using the same command as in earlier examples. 
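+For reference, a minimal invocation might look like the following sketch (the batch file name `openai_example_batch.jsonl` and output path are placeholders; the module path and flags match the `vllm.entrypoints.openai.run_batch` usage exercised in the tests):
+
+```
+python -m vllm.entrypoints.openai.run_batch \
+    -i openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model BAAI/bge-reranker-v2-m3
+```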
+ +### Step 3: Check your results + +You can check your results by running `cat results.jsonl` + +``` +$ cat results.jsonl +{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +``` diff --git a/examples/offline_model_neuron.py b/examples/offline_model_neuron.py index 6c5bcef342be1..97929385cfc91 100644 --- a/examples/offline_model_neuron.py +++ b/examples/offline_model_neuron.py @@ -1,23 +1,26 @@ import os import tempfile -from vllm import LLM, SamplingParams +from vllm import SamplingParams from vllm.attention.backends.neuron_attn import NeuronAttentionBackend -from vllm.config import VllmConfig -from vllm.distributed.communication_op import tensor_model_parallel_all_gather -from vllm.distributed.parallel_state import ensure_model_parallel_initialized, init_distributed_environment +# from vllm.config import VllmConfig +# from vllm.distributed.communication_op import tensor_model_parallel_all_gather +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + init_distributed_environment +) from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.layers.logits_processor import _prune_hidden_states +# from vllm.model_executor.layers.logits_processor import _prune_hidden_states from vllm.model_executor.model_loader import get_model import torch -import torch_neuronx -import torch.nn as nn +# import torch_neuronx +# import torch.nn as nn import torch_xla.core.xla_model as xm import torch_xla.runtime as xr from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.neuron.compiler import neuron_argmax +# from vllm.neuron.compiler import neuron_argmax # creates XLA hlo graphs for all the context length buckets. 
os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" @@ -68,7 +71,7 @@ ) attn_backend = NeuronAttentionBackend -vllm_config=config.create_engine_config() +vllm_config = config.create_engine_config() device = xm.xla_device() model = get_model(vllm_config=vllm_config) model = model.eval().to(device) @@ -86,7 +89,6 @@ def forward( inputs_embeds, sampling_metadata ): - # hidden_states, (attn_input, q, k, v, attn_out, mlp_output, mlp_input) = model( hidden_states = model( input_ids, positions, @@ -97,13 +99,6 @@ def forward( ) return hidden_states - # hidden_states = hidden_states.flatten(0, 1) - # logits = model.compute_logits(hidden_states, sampling_metadata)[-1, :100] - # argmax_token_ids = neuron_argmax(logits, dim=-1, keepdim=True) - # argmax_token_ids = argmax_token_ids.repeat(1, 1) - # return argmax_token_i - return logits - compiled_model = torch.compile(forward, backend="openxla", @@ -161,11 +156,4 @@ def forward( inputs_embeds=None, sampling_metadata=sampling_metadata ) -print(output) -# print("Q:", q, q.shape) -# # print("W_Q:", w_q, w_q.shape) -# print("Attn input:", attn_input, attn_input.shape) -# print("K:", k, k.shape) -# print("attn_out:", attn_out, attn_out.shape) -# print("mlp_input:", mlp_input, mlp_input.shape) -# print("mlp_output:", mlp_output, mlp_output.shape) \ No newline at end of file +print(output) \ No newline at end of file diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py new file mode 100644 index 0000000000000..a07affe3351ce --- /dev/null +++ b/examples/online_serving/cohere_rerank_client.py @@ -0,0 +1,32 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +the Cohere SDK: https://github.com/cohere-ai/cohere-python + +run: vllm serve BAAI/bge-reranker-base +""" +import cohere + +# cohere v1 client +co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") +rerank_v1_result = co.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(rerank_v1_result) + +# or the v2 +co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") + +v2_rerank_result = co2.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(v2_rerank_result) diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py new file mode 100644 index 0000000000000..bf4de76ddf362 --- /dev/null +++ b/examples/online_serving/jinaai_rerank_client.py @@ -0,0 +1,33 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +Jina and Cohere https://jina.ai/reranker + +run: vllm serve BAAI/bge-reranker-base +""" +import json + +import requests + +url = "http://127.0.0.1:8000/rerank" + +headers = {"accept": "application/json", "Content-Type": "application/json"} + +data = { + "model": + "BAAI/bge-reranker-base", + "query": + "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Horses and cows are both animals" + ] +} +response = requests.post(url, headers=headers, json=data) + +# Check the response +if response.status_code == 200: + print("Request successful!") + 
print(json.dumps(response.json(), indent=2)) +else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index c49e5306a1cb4..4a85f953b0b4c 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -24,7 +24,7 @@ Submit some sample requests to the server: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 ../../benchmarks/benchmark_serving.py \ +python3 ../../../benchmarks/benchmark_serving.py \ --model mistralai/Mistral-7B-v0.1 \ --tokenizer mistralai/Mistral-7B-v0.1 \ --endpoint /v1/completions \ diff --git a/notebooks/llama.ipynb b/notebooks/llama.ipynb deleted file mode 100644 index 9cf26d5919660..0000000000000 --- a/notebooks/llama.ipynb +++ /dev/null @@ -1,425 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/root/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import torch\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer\n", - "from transformers.models.llama.modeling_llama import apply_rotary_pos_emb" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:MASTER_ADDR environment variable is not set, defaulting to localhost\n", - "WARNING:root:Found libneuronpjrt.so. 
Setting PJRT_DEVICE=NEURON.\n" - ] - } - ], - "source": [ - "model = AutoModelForCausalLM.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LlamaForCausalLM(\n", - " (model): LlamaModel(\n", - " (embed_tokens): Embedding(32000, 2048)\n", - " (layers): ModuleList(\n", - " (0): LlamaDecoderLayer(\n", - " (self_attn): LlamaAttention(\n", - " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", - " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n", - " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n", - " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", - " (rotary_emb): LlamaRotaryEmbedding()\n", - " )\n", - " (mlp): LlamaMLP(\n", - " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", - " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", - " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", - " (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", - " )\n", - " )\n", - " (norm): LlamaRMSNorm((2048,), eps=1e-05)\n", - " (rotary_emb): LlamaRotaryEmbedding()\n", - " )\n", - " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n", - ")\n" - ] - } - ], - "source": [ - "model.model.layers = model.model.layers[:1]\n", - "model = model.to(torch.bfloat16)\n", - "print(model)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "input_ids = torch.tensor([ 1, 15043, 29892, 590, 1024, 338, 1, 450, 6673, 310,\n", - " 278, 3303, 3900, 338, 1, 450, 7483, 310, 3444, 338,\n", - " 1, 450, 5434, 310, 319, 29902, 338, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0],\n", - " dtype=torch.int32).unsqueeze(0)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "outputs = model(input_ids, output_hidden_states=True, output_attentions=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[-0.1494, -0.8125, 1.8359, ..., -0.5195, -1.1484, -1.3516],\n", - " [-1.3359, 0.8125, -0.5938, ..., 1.5391, 1.7188, 0.9023],\n", - " [-0.9570, 0.4316, -0.4121, ..., 0.0747, 0.4453, -0.0378],\n", - " [ 0.9922, -1.5703, 1.7422, ..., 0.3613, 0.2334, 1.2266],\n", - " [-0.0067, 1.4609, 0.8281, ..., -1.0234, 0.9375, 0.7969],\n", - " [-1.1484, 1.3516, -0.0215, ..., -0.5664, -0.6055, 3.0312]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "outputs.hidden_states[-1][0, :6, :]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "attn_scores = logits.attentions[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.0031, 0.0178, 
0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " ...,\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeds = model.model.embed_tokens(input_ids)\n", - "embeds" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " ...,\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# input_shape = embeds.shape[:-1]\n", - "# hidden_shape = (*input_shape, -1, 64)\n", - "# k = model.model.layers[0].self_attn.k_proj(embeds)#.view(hidden_shape).transpose(1, 2)\n", - "\n", - "norm_embeds = model.model.layers[0].input_layernorm(embeds)\n", - "norm_embeds\n" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[[-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " ...,\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199]]],\n", - " dtype=torch.bfloat16, grad_fn=)\n", - "tensor([[[-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " ...,\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707]]],\n", - " dtype=torch.bfloat16, grad_fn=)\n", - "tensor([[[ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.9111e-03, -1.7090e-02, -2.4902e-02, ..., -8.9407e-06,\n", - " -2.0142e-02, 4.2419e-03],\n", - " ...,\n", - " [ 8.9722e-03, -1.7090e-02, -2.4780e-02, ..., 1.4782e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03]]], dtype=torch.bfloat16,\n", - " grad_fn=)\n" - ] - } - ], - "source": [ - "input_shape = embeds.shape[:-1]\n", - "hidden_shape = (*input_shape, -1, 64)\n", - "\n", - "q = model.model.layers[0].self_attn.q_proj(norm_embeds)\n", - "k = model.model.layers[0].self_attn.k_proj(norm_embeds)\n", - "v 
= model.model.layers[0].self_attn.v_proj(norm_embeds)\n", - "\n", - "position_embeds = model.model.rotary_emb(embeds, torch.arange(0,128).unsqueeze(0))\n", - "attn_out = model.model.layers[0].self_attn(norm_embeds, position_embeddings=position_embeds)\n", - "print(attn_out[0])\n", - "attn_out = attn_out[0] + embeds\n", - "# print(attn_out)\n", - "attn_out_norm = model.model.layers[0].post_attention_layernorm(attn_out)\n", - "print(attn_out_norm)\n", - "mlp_out = model.model.layers[0].mlp(attn_out_norm)\n", - "print(mlp_out)" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.matmul(attn_scores, v)" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 
0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " ...,\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.einsum(\n", - " 'bsh,hq->bsq',\n", - " norm_embeds,\n", - " model.model.layers[0].self_attn.q_proj.weight.t()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "ename": "RuntimeError", - "evalue": "The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[66], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m cos, sin \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mrotary_emb(embeds, torch\u001b[38;5;241m.\u001b[39marange(\u001b[38;5;241m0\u001b[39m,\u001b[38;5;241m128\u001b[39m)\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m \u001b[43mapply_rotary_pos_emb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msin\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m128\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", - "File \u001b[0;32m~/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:225\u001b[0m, in \u001b[0;36mapply_rotary_pos_emb\u001b[0;34m(q, k, cos, sin, position_ids, unsqueeze_dim)\u001b[0m\n\u001b[1;32m 223\u001b[0m cos \u001b[38;5;241m=\u001b[39m cos\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[1;32m 224\u001b[0m sin \u001b[38;5;241m=\u001b[39m sin\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[0;32m--> 225\u001b[0m q_embed \u001b[38;5;241m=\u001b[39m (\u001b[43mq\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m) \u001b[38;5;241m+\u001b[39m (rotate_half(q) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 226\u001b[0m k_embed \u001b[38;5;241m=\u001b[39m (k \u001b[38;5;241m*\u001b[39m cos) \u001b[38;5;241m+\u001b[39m (rotate_half(k) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m q_embed, k_embed\n", - "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3" - ] - } - ], - "source": [ - "\n", - "apply_rotary_pos_emb(q, k, cos, sin)[0].transpose(1,2).reshape(1, 128, -1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 8ab18b3770ae8..1abde714af7c9 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -13,13 +13,11 @@ ray[default] # Install torch_xla --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241126+cpu -torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -jaxlib==0.4.36.dev20241122 -jax==0.4.36.dev20241122 +torch==2.6.0.dev20241216+cpu +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/setup.py b/setup.py index ee193e4693806..59ece870b5585 100755 --- a/setup.py +++ b/setup.py @@ -417,7 +417,7 @@ def get_rocm_version(): if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)) == 0): - return "%d.%d.%d" % (major.value, minor.value, patch.value) + return f"{major.value}.{minor.value}.{patch.value}" return None except Exception: return None diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 83c71b5cf6eb7..91ac35dd67bbf 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture -def api_server(tokenizer_pool_size: int, worker_use_ray: bool): +def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() commands = [ - sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", "--host", - "127.0.0.1", "--tokenizer-pool-size", - str(tokenizer_pool_size) + sys.executable, + "-u", + str(script_path), + "--model", + "facebook/opt-125m", + "--host", + "127.0.0.1", + "--tokenizer-pool-size", + str(tokenizer_pool_size), + "--distributed-executor-backend", + distributed_executor_backend, ] - if worker_use_ray: - commands.append("--worker-use-ray") uvicorn_process = subprocess.Popen(commands) yield uvicorn_process.terminate() @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) 
-@pytest.mark.parametrize("worker_use_ray", [False, True]) +@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, tokenizer_pool_size: int, - worker_use_ray: bool): + distributed_executor_backend: str): """ Run the API server and test it. diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 31a101e48e026..23285040642a8 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -61,9 +61,10 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + if backend in ("XFORMERS", + "FLASHINFER") and model == "google/gemma-2-2b-it": pytest.skip( - "XFORMERS does not support gemma2 with full context length.") + f"{backend} does not support gemma2 with full context length.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 4e502cfb5f4f8..4b27dcbc8609f 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -29,10 +29,10 @@ def check_settings(): @pytest.fixture -def worker_use_ray() -> bool: - # When SPMD worker is used, use ray_use_worker=True +def distributed_executor_backend() -> str: + # When SPMD worker is used, use distributed_executor_backend="ray" # to test delta input optimization works with preemption. - return envs.VLLM_USE_RAY_SPMD_WORKER + return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" @pytest.mark.parametrize("model", MODELS) @@ -47,7 +47,7 @@ def test_chunked_prefill_recompute( dtype: str, max_tokens: int, chunked_prefill_token_size: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Ensure that chunked prefill works with preemption.""" max_num_seqs = min(chunked_prefill_token_size, 256) @@ -66,7 +66,7 @@ def test_chunked_prefill_recompute( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, max_num_seqs=max_num_seqs, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) @@ -93,7 +93,7 @@ def test_preemption( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """By default, recompute preemption is enabled""" @@ -104,7 +104,7 @@ def test_preemption( model, dtype=dtype, disable_log_stats=False, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -144,7 +144,7 @@ def test_preemption_infeasible( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Verify infeasible preemption request will be ignored.""" BLOCK_SIZE = 16 @@ -159,7 +159,7 @@ def test_preemption_infeasible( # ignored instead of hanging forever. 
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 87d5aefea6cb4..1945479fc3031 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -58,7 +58,7 @@ class TestSetting: model_args=["--task", "embed"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6523c8b6297c6..469a5fb039fb6 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,24 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +@pytest.fixture(scope="module", params=[True, False]) +def use_v1(request): + # Module-scoped variant of run_with_both_engines + # + # Use this fixture to run a test with both v0 and v1, and + # also to conditionalize the test logic e.g. + # + # def test_metrics_exist(use_v1, server, client): + # ... + # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS + # for metric in expected: + # assert metric in response.text + # + # @skip_v1 wouldn't work here because this is a module-level + # fixture - per-function decorators would have no effect + yield request.param + + @pytest.fixture(scope="module") def default_server_args(): return [ @@ -36,10 +54,12 @@ def default_server_args(): "--enable-chunked-prefill", "--disable-frontend-multiprocessing", ]) -def server(default_server_args, request): +def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') + with RemoteOpenAIServer(MODEL_NAME, default_server_args, + env_dict=env_dict) as remote_server: yield remote_server @@ -84,7 +104,9 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "swap_space_bytes", ] +EXPECTED_METRICS_V1 = [ + "vllm:num_requests_running", + "vllm:num_requests_waiting", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): # sending a request triggers the metrics to be logged. 
await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", @@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS: + for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): assert metric in response.text -def test_metrics_exist_run_batch(): +def test_metrics_exist_run_batch(use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py new file mode 100644 index 0000000000000..cfd8f33133960 --- /dev/null +++ b/tests/entrypoints/openai/test_rerank.py @@ -0,0 +1,87 @@ +import pytest +import requests + +from vllm.entrypoints.openai.protocol import RerankResponse + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "BAAI/bge-reranker-base" + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--max-model-len", "100"] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_top_n(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Cross-encoder models are neat" + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + "top_n": 2 + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): + + query = "What is the capital of France?" * 100 + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents + }) + assert rerank_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." 
in \ + rerank_response.text \ No newline at end of file diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 097d6b1a32349..1f8a56bb43ac6 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,3 +1,4 @@ +import json import subprocess import sys import tempfile @@ -21,6 +22,9 @@ {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}} {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" +INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" + def test_empty_file(): with tempfile.NamedTemporaryFile( @@ -102,3 +106,36 @@ def test_embeddings(): # Ensure that the output format conforms to the openai api. # Validation should throw if the schema is wrong. BatchRequestOutput.model_validate_json(line) + + +def test_score(): + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INPUT_SCORE_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, + "-m", + "vllm.entrypoints.openai.run_batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "BAAI/bge-reranker-v2-m3", + ], ) + proc.communicate() + proc.wait() + assert proc.returncode == 0, f"{proc=}" + + contents = output_file.read() + for line in contents.strip().split("\n"): + # Ensure that the output format conforms to the openai api. + # Validation should throw if the schema is wrong. + BatchRequestOutput.model_validate_json(line) + + # Ensure that there is no error in the response. 
+ line_dict = json.loads(line) + assert isinstance(line_dict, dict) + assert line_dict["error"] is None diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 06e0f93dbe269..0d19615bc0d99 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -10,12 +10,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--enforce-eager", - # Will be used on tests to compare prompt input length - "--max-model-len", - "100" - ] + args = ["--enforce-eager", "--max-model-len", "100"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 85f485364a411..e88d6c3c67829 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens(): assert mock_engine.generate.call_args.args[1].max_tokens == 10 + # Setting server's max_tokens in the generation_config.json + # lower than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 10 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test Case 1: No max_tokens specified in request + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" 
+ }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 15 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + + # Setting server's max_tokens in the generation_config.json + # higher than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 200 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test case 1: No max_tokens specified, defaults to context_window + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" + }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 100 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + def test_serving_chat_could_load_correct_generation_config(): diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index a16cc4582a180..f28fdf3feedbc 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) ] B_tiles = [[ - B[j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) ] for j in range(n_tiles)] C_tiles = [ C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) @@ -157,9 +159,9 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, - SEEDS)) +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): 
torch.manual_seed(seed) @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): assert torch.allclose(scale, ref_scale) -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, - SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 -@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, - BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.manual_seed(seed) diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index afe53797322f9..c3eddacec2727 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -2,7 +2,7 @@ Run `pytest tests/kernels/test_cutlass.py`. """ -from typing import Optional, Type +from typing import Type import pytest import torch @@ -11,6 +11,8 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +from .utils import baseline_scaled_mm, to_fp8, to_int8 + MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), @@ -41,34 +43,10 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - def rand_int8(shape: tuple, device: str = "cuda"): return to_int8(torch.rand(shape, device=device) * 255 - 128) -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - def cutlass_fp8_gemm_helper(m: int, n: int, k: int, diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py new file mode 100644 index 0000000000000..56495df34aa6c --- /dev/null +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -0,0 +1,214 @@ +"""Tests for sparse cutlass kernels +

Run `pytest tests/kernels/test_cutlass_2of4_sparse.py`. 
+""" +from typing import Tuple, Type + +import pytest +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + sparse_cutlass_supported) +from vllm.platforms import current_platform + +from .utils import baseline_scaled_mm, to_fp8, to_int8 + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +capability = current_platform.get_device_capability() +capability = capability[0] * 10 + capability[1] + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, + index=indices, + src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, a, b + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +# Test working with a subset of A and B for sparse matmul +def test_cutlass_sparse_subset(): + + big_m = 1024 + m, n, k = 512, 512, 512 + + # Create tensors + b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, + big_m, n, k) + a = whole_a[0:m, 0:k] + scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 512), + (16, 256, 512), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 512), + (64, 16384, 1024), + (100, 8192, 512), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + + +# Test working with a subset of A and B for sparse matmul +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) 
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32) + scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=dtype) + baseline = F.linear(a, b.T) + + torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) +@pytest.mark.skipif(not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.") +def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m,k,n", MNK_FACTORS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index a2c8f71665737..1645ef911d697 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype) - - output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap) + + output = wrapper.run(query, key_value_cache) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ 
-237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward( + output = wrapper.run( query, key_value_cache, - logits_soft_cap=soft_cap, ) ref_output = ref_paged_attn(query=query, @@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_tables=block_tables, scale=scale, soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) ref_output = ref_paged_attn(query=query, key_cache=key_cache.squeeze(1), @@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv( wrapper = flashinfer.\ BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype, - q_data_type=dtype) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py new file mode 100644 index 0000000000000..eab874e9e02bb --- /dev/null +++ b/tests/kernels/test_mha_attn.py @@ -0,0 +1,126 @@ +""" +Test: + +* Tests for MultiHeadAttention layer +""" +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.layer import MultiHeadAttention +from vllm.attention.selector import _Backend, _cached_get_attn_backend +from vllm.platforms import current_platform +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. 
+ """ + _cached_get_attn_backend.cache_clear() + + +@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +def test_mha_attn_platform(device: str): + """ + Test the attention selector between different platform and device. + """ + torch.set_default_dtype(torch.float16) + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + elif device == "hip": + with patch("vllm.attention.selector.current_platform", RocmPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + else: + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 72, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + +def ref_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, +) -> torch.Tensor: + """ + Native implementation of scaled dot product attention without mask: + - query, key, value: [batch_size, seq_len, num_heads, head_size] + - attn_mask: [batch_size, seq_len, seq_len] + """ + query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + attn_weights = scale * torch.matmul(query, key.transpose(2, 3)) + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.matmul(attn_weights, value).transpose(1, 2) + return out + + +BATCH_SIZES = [1, 16] +SEQ_LENS = [1] +NUM_HEADS = [1, 16] +NUM_KV_HEADS = [1] +HEAD_SIZES = [64, 80] +# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] +CUDA_DEVICES = ["cuda"] + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_mha_attn_forward( + batch_size: int, + seq_len: int, + num_heads: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: str, +): + current_platform.seed_everything(0) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + q = torch.randn(batch_size, seq_len, num_heads * head_size) + k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + scale = 1.0 / head_size**0.5 + attn = MultiHeadAttention(num_heads, + head_size, + scale=scale, + num_kv_heads=num_kv_heads) + output = attn(q, k, v) + + assert num_heads % num_kv_heads == 0 + num_queries_per_kv = num_heads // num_kv_heads + q = q.reshape(batch_size, seq_len, num_heads, head_size) + k = k.reshape(batch_size, seq_len, num_kv_heads, head_size) + v = v.reshape(batch_size, seq_len, num_kv_heads, head_size) + if num_queries_per_kv > 1: + k = torch.repeat_interleave(k, num_queries_per_kv, dim=2) + v = torch.repeat_interleave(v, num_queries_per_kv, dim=2) + + ref_output = ref_attention( + q, + k, + v, + scale=scale, + ).reshape(batch_size, seq_len, num_heads * head_size) + torch.testing.assert_close(output, ref_output) diff --git a/tests/kernels/test_semi_structured.py 
b/tests/kernels/test_semi_structured.py deleted file mode 100644 index 4316d6ab30e33..0000000000000 --- a/tests/kernels/test_semi_structured.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests for sparse cutlass kernels - -Run `pytest tests/kernels/test_semi_structured.py`. -""" -from typing import Optional, Tuple, Type - -import pytest -import torch - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - sparse_cutlass_supported) -from vllm.platforms import current_platform - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -capability = current_platform.get_device_capability() -capability = capability[0] * 10 + capability[1] - - -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - -def rand_int8(shape: tuple, device: str = "cuda"): - return to_int8(torch.rand(shape, device=device) * 255 - 128) - - -def to_bf16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.bfloat16) - - -def to_fp16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.float16) - - -def prune_to_2_4(tensor): - # Reshape tensor to [N, 4] where N is number of groups of 4 - original_shape = tensor.shape - reshaped = tensor.reshape(-1, 4) - - # Get indices of top 2 absolute values in each group of 4 - _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) - - # Create binary mask - mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, - index=indices, - src=torch.ones_like(indices, dtype=mask.dtype)) - - # Apply mask and reshape back - pruned = reshaped * mask - - # Turn all -0.0 to 0.0 - pruned[pruned == -0.0] = 0.0 - - return pruned.reshape(original_shape) - - -def make_rand_sparse_tensors( - dtype: torch.dtype, m: int, n: int, k: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 - - b = prune_to_2_4(b.t()).t() - - if dtype == torch.int8: - a, b = to_int8(a), to_int8(b) - elif dtype == torch.float8_e4m3fn: - a, b = to_fp8(a), to_fp8(b) - elif dtype == torch.float16: - a, b = to_fp16(a), to_fp16(b) - elif dtype == torch.bfloat16: - a, b = to_bf16(a), to_bf16(b) - else: - raise ValueError("unsupported dtype") - - b_compressed, e = ops.cutlass_sparse_compress(b.t()) - - # Compressed B, Metadata, Original A, B - return b_compressed, e, a, b - - -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -# Test working with a subset of A and B for sparse matmul -def test_cutlass_sparse_subset(): - - big_m = 1024 - m, n, k = 512, 512, 512 - - # Create tensors - b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, - big_m, n, k) - a = whole_a[0:m, 0:k] - scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) 
/ 10 - - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - - torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 8011398551b9d..fb2c9f5d30583 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -5,7 +5,7 @@ import unittest from numbers import Number from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, - Union) + Type, Union) import pytest import torch @@ -1100,3 +1100,28 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, kwargs, test_utils=test_utils, raise_exception=raise_exception) if cond else {} + + +# For testing quantized linear kernels +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor): + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def baseline_scaled_mm(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = (scale_a * (scale_b * (torch.mm( + a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) + if bias is not None: + output = output + bias + + return output diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 718730bb8cbbe..4d6890305af73 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, device: %s" % (my_rank, device)) + print(f"My rank: {my_rank}, device: {device}") # insert tokens = torch.tensor([1, 2, 3]).to(device) @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, Test run passed!" % (my_rank)) + print(f"My rank: {my_rank}, Test run passed!") def stress_test(my_rank, buf, device): @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(k, k_) assert torch.allclose(v, v_) assert torch.allclose(h, h_) - print('Rank %d done' % my_rank) + print(f"Rank {my_rank} done") torch.distributed.barrier() if my_rank == 0: @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("My rank: %d, Passed stress test!" % (my_rank)) + print(f"My rank: {my_rank}, Passed stress test!") if __name__ == "__main__": @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device): rank=my_rank, ) - print("initialized! My rank is %d" % my_rank) + print(f"initialized! 
My rank is {my_rank}") config = KVTransferConfig( kv_connector='PyNcclConnector', diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index ebdd129db5f6a..570aa3861d0be 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm" - ) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") def test_qwen2vl_lora(qwen2vl_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 14d9a739be318..d5f0d63288cc1 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -521,12 +521,13 @@ def _mark_splits( # - image embeddings # - video # - custom inputs -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=False, + )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=False, + )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=False, + )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=False, + )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], video_assets: _VideoAssets): @@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=False, - )) 
+@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=False, + )) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -627,12 +632,13 @@ def test_custom_inputs_models( #### Tests filtering for things running each test as a new process -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, @@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=True, + )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab99054c..8103e5305b91b 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs( outputs: OutputsLogprobs, 
filename: "StrPath", ) -> None: - json_data = [(tokens, text, - [{k: asdict(v) - for k, v in token_logprobs.items()} - for token_logprobs in (logprobs or [])]) + json_data = [(tokens, text, [{ + k: asdict(v) + for k, v in token_logprobs.items() + } for token_logprobs in (logprobs or [])]) for tokens, text, logprobs in outputs] with open(filename, "w") as f: @@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, - [{int(k): Logprob(**v) - for k, v in token_logprobs.items()} - for token_logprobs in logprobs]) - for tokens, text, logprobs in json_data] + return [(tokens, text, [{ + int(k): Logprob(**v) + for k, v in token_logprobs.items() + } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] @large_gpu_test(min_gb=80) diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index be6e3842821e2..3db27d942ac8c 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -5,12 +5,18 @@ import math import pytest +import torch +import torch.nn.functional as F MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] +EMBEDDING_MODELS = [ + "sentence-transformers/all-MiniLM-L12-v2", +] + TEXTS_1 = [ "What is the capital of France?", "What is the capital of Germany?", @@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.fixture(scope="module", params=EMBEDDING_MODELS) +def emb_model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = hf_model.encode(text_pair) + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], 
TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 8456a463adeeb..b8524ed83026b 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,8 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--worker-use-ray", + "--distributed-executor-backend", + "ray", "--gpu-memory-utilization", "0.85", "--swap-space", diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py index d3b688087297b..77b707a737118 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/test_prefix_prefill.py @@ -173,8 +173,8 @@ def ref_context_attention( "num_heads,num_queries_per_kv,head_size,mixed_precision", [ (4, 2, 8, False), - # (4, 2, 8, True), - # (32, 8, 64, True), + (4, 2, 8, True), + (32, 8, 64, True), ], ) @torch.inference_mode() @@ -185,8 +185,6 @@ def test_contexted_kv_attention( mixed_precision: bool, ) -> None: import os - os.environ["NEURON_RT_LOG_LEVEL"] = "INFO" - os.environ["NEURON_FRAMEWORK_DEBUG"] = "1" import torch_xla.core.xla_model as xm @@ -196,11 +194,7 @@ def test_contexted_kv_attention( os.environ["NEURON_CC_FLAGS"] = ( " --model-type=transformer -O1 " - " --internal-hlo2tensorizer-options='--verify-hlo' " - " --verbose=debug " - " --logical-nc-config=1 ") - - + " --internal-hlo2tensorizer-options='--verify-hlo' ") random.seed(0) torch.manual_seed(0) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 0cd86cef0a475..1072697ecf5cc 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -313,8 +313,10 @@ def check_model(model): assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="2of4 Sparse is not yet supported on this GPU type.") @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 397fa2cc85821..dcb1b27bff37f 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( """ batch_size = last_accepted_indices.shape[0] - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( + accepted = (torch.arange(k).expand(batch_size, k) + <= last_accepted_indices.unsqueeze(-1).broadcast_to( batch_size, k)) # Sprinkle accepted values after the contiguous initial accepted values. 
# This replicates the behavior of rejection sampling, which may "accept" # a token that cannot be accepted because of causality. - sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) + sprinkle_candidates = (torch.arange(k).expand( + batch_size, + k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + + 1) sprinkle = torch.rand(batch_size, k) > 0.5 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] return accepted @@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( distance_wrt_reference) expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * expected_improvement_multiplier) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9cb3858c0068..5cb982a0811c7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -2,6 +2,7 @@ from typing import List, Optional, Sequence, Tuple, Union import pytest +import torch from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -154,6 +155,8 @@ def _check_logprobs_when_output_disabled( spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) assert spec_pos_logprob.rank == -1 assert spec_pos_logprob.logprob == 0.0 + if isinstance(spec_pos_logprob_token_id, torch.Tensor): + spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() assert spec_pos_logprob_token_id in baseline_pos_logprobs @@ -244,7 +247,8 @@ def run_equality_correctness_test_tp(model, batch_size: int, max_output_len: int, seed: int = 0, - temperature: float = 0.0): + temperature: float = 0.0, + logprobs: Optional[int] = None): """Helper method that compares the outputs of both the baseline LLM and the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the same when temperature is zero. @@ -257,7 +261,6 @@ def run_equality_correctness_test_tp(model, results = [] prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): with RemoteOpenAIServer(model, args, @@ -269,12 +272,14 @@ def run_equality_correctness_test_tp(model, prompt=prompts, max_tokens=max_output_len, seed=seed, - temperature=temperature) + temperature=temperature, + logprobs=logprobs) results.append({ "test": "seeded_sampling", "text": [choice.text for choice in completion.choices], + "logprobs": [choice.logprobs for choice in completion.choices], "finish_reason": [choice.finish_reason for choice in completion.choices], "usage": @@ -284,7 +289,15 @@ def run_equality_correctness_test_tp(model, n = len(results) // 2 arg1_results = results[:n] arg2_results = results[n:] + # Separate logprobs to avoid asserting exact equality. + arg1_logprobs = [r.pop("logprobs") for r in arg1_results] + arg2_logprobs = [r.pop("logprobs") for r in arg2_results] + for arg1_result, arg2_result in zip(arg1_results, arg2_results): assert arg1_result == arg2_result, ( f"Results for {model=} are not the same with {arg1=} and {arg2=}. 
" f"{arg1_result=} != {arg2_result=}") + if logprobs: + for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): + for l1, l2 in zip(logs1, logs2): + assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 02cba92795142..7001ee4c007fe 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -2,6 +2,8 @@ tensor parallelism. """ +from typing import Optional + import pytest import torch @@ -154,15 +156,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, "--speculative-draft-tensor-parallel-size", "1", ])]) +@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, + logprobs: Optional[int], batch_size: int, seed: int): """Verify spec decode works well with same and different TP size for the draft model with chunked prefill. """ + if logprobs: + test_llm_kwargs.extend( + ["--disable_logprobs_during_spec_decoding", "False"]) run_equality_correctness_test_tp(model, common_llm_kwargs, per_test_common_llm_kwargs, @@ -171,4 +178,5 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, batch_size, max_output_len=32, seed=seed, - temperature=0.0) + temperature=0.0, + logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 4cfca8b78e79b..1a543606cb3f3 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,26 +4,27 @@ from vllm import SamplingParams +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @pytest.mark.parametrize( "common_llm_kwargs", [{ - "model_name": "JackFram/llama-68m", + "model_name": "JackFram/llama-160m", # Skip cuda graph recording for fast test. - "enforce_eager": True, + "enforce_eager": True }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": True, }]) @@ -36,12 +37,15 @@ ]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) def test_logprobs_equality(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify output logprobs are equal with and without speculative decoding. + seed: int, logprobs: int, prefill_chunk_size: int): + """Verify output logprobs are equal with and without speculative decoding, + as well as with and without chunked prefill. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index b8965606b3d0e..dbcbc0db10881 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -21,6 +21,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -67,12 +68,14 @@ ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -119,12 +122,15 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): + seed: int, logprobs: int, + prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -167,12 +173,14 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_cuda_graph( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with cuda graph enabled and different batch sizes.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -217,13 +225,15 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality, even when some sequences are preempted mid- generation. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -267,13 +277,15 @@ def test_medusa_e2e_greedy_correctness_with_preemption( 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -313,14 +325,17 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, + prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -361,12 +376,14 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, prefill_chunk_size: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 183ff2f5db274..1fa1104f5d3a8 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -66,14 +67,16 @@ @pytest.mark.parametrize("output_len", [ 128, ]) -@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("batch_size", [4, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -116,12 +119,19 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, seed: int, - logprobs: int): + logprobs: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + # NOTE Test is sensitive enough st if we don't enable chunked prefill + # scheduling on baseline too, we get slightly different logprobs, ending + # up sampling different tokens at the tail (ie top tokens don't change). + # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? 
+ maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -162,12 +172,15 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): + batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): """Verify acceptance rate with different batch size and large output length.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -204,13 +217,17 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("temperature", [1.0]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - temperature: float, seed: int): + temperature: float, + prefill_chunk_size: int, seed: int): """Verify seeded runs produce the same output.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -266,14 +283,16 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, 128, ]) @pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality, even when some sequences are preempted mid- generation. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -317,12 +336,14 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_correctness_with_padding( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality when the vocab dimension is padded """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) # Default pad_to is 64, test model has vocab_size of 32000 def patched_pad_vocab_size(vocab_size, pad_to=None): @@ -373,14 +394,16 @@ def patched_pad_vocab_size(vocab_size, pad_to=None): # Use smaller output len for fast test. 
32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, - output_len: int): + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -418,15 +441,21 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +# Speculative decoding is disabled when sequences reach decoding and the batch +# consists of single-token requests. Hence we set `max_num_seqs` +# >= `speculative_disable_by_batch_size` to test feature interaction. +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -460,13 +489,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, prefill_chunk_size: int, seed: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index a13cca41f99e5..05ad468dd8bc5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -147,20 +147,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": False, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + "disable_logprobs_during_spec_decoding": False + }, { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "disable_logprobs_during_spec_decoding": False + }]) @pytest.mark.parametrize( "output_len", [ @@ -192,6 +192,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( batch_size, max_output_len=output_len, seed=seed, + prompt_logprobs=2, + logprobs=2, + disable_logprobs=False, temperature=0.0, ensure_all_accepted=ensure_all_accepted) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index e53d169a8fcc3..77f8b8998c8d3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,6 +26,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @@ -49,11 +50,13 @@ "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": False, }, { "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": True, }, ]) @pytest.mark.parametrize("output_len", [ @@ -68,15 +71,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, batch_size: int, output_len: int, prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" - if prefill_chunk_size > 0: - common_llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - common_llm_kwargs["enable_chunked_prefill"] = False + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 0b1509d8b7785..5a093dea16d40 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -60,6 +60,7 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, num_gpu_blocks = 2048 // block_size scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) + scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer 
scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True scorer_worker.model_runner.model.sampler.\ should_modify_greedy_probs_inplace = True diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index caf7a7e625b46..d8c3af4c1cd1e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -754,6 +754,7 @@ def test_populate_seq_ids_with_bonus_tokens(): seq_group_metadata_list=seq_group_metadata_list, accepted_token_ids=accepted_token_ids, target_logprobs=target_token_logprobs, + prompt_logprobs=None, k=k, stage_times=(0, 0, 0)) # Verify that _seq_with_bonus_token_in_last_step contains the following: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index a4bfa6b2f384b..2f883c2ff9b7a 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -274,3 +274,15 @@ def create_batch(batch_size, prompts, num_gpu_blocks, block_size, final_prompt_lens, prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens + + +def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): + if prefill_chunk_size > 0: + llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + llm_kwargs["enable_chunked_prefill"] = False diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index cccfd305ac604..033bbcfce564e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -144,7 +144,7 @@ def test_engine_core(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as min_tokens and + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch): stop_token_ids=[1001, 1002], ) engine_core.add_request(request) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. - while len(engine_core.step().outputs) > 0: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 + def _check_engine_state(): + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step().outputs) > 0: + pass + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + _check_engine_state() + + # Second request. 
+ request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(request2) + _check_engine_state() diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 51ad2adc74fe1..9dc19f5fd4cdd 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' 'parallelism)'.format(length, total_cpu_time, total_cpu_time * 1.0 / length)) - print(' %d build steps completed, average of %1.2f/s' % - (len(entries), len(entries) / (length))) + print(' {} build steps completed, average of {:1.2f}/s'.format( + len(entries), + len(entries) / (length))) def main(): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 440bc52012ab7..85c1121ed6ff8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -820,8 +820,8 @@ def scaled_int8_quant( if scale is not None: # static-per-tensor quantization. assert symmetric == ( - azp is - None), "azp must only be provided for asymmetric quantization." + azp + is None), "azp must only be provided for asymmetric quantization." torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) return output, scale, azp diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 3135b0b405343..7cccef9608218 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,4 @@ +import dataclasses from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -13,9 +14,11 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 except ImportError: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None + # Avoid turning these types into variables during type checking + if not TYPE_CHECKING: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 import torch @@ -30,7 +33,9 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) +from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig, get_current_vllm_config from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) @@ -99,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") +@dataclass +class PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters. + """ + + window_left: int + logits_soft_cap: Optional[float] + sm_scale: float + + +def get_per_layer_parameters( + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + """ + Scan all attention layers and determine some hyperparameters + to use during `plan`. 
+ """ + + layers = vllm_config.compilation_config.static_forward_context + per_layer_params: Dict[str, PerLayerParameters] = {} + + for key, layer in layers.items(): + assert isinstance(layer, Attention) + + impl = layer.impl + assert isinstance(impl, FlashInferImpl) + + # Infer hyperparameters from the attention layer + window_size = impl.sliding_window + window_left = window_size[0] if window_size is not None else -1 + logits_soft_cap = impl.logits_soft_cap + sm_scale = impl.scale + + per_layer_params[key] = PerLayerParameters(window_left, + logits_soft_cap, sm_scale) + + return per_layer_params + + +def infer_global_hyperparameters( + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters: + - `window_left` + - `logits_soft_cap` + - `sm_scale` + + So this function asserts that all layers share the same values for these + hyperparameters and returns the global values. + """ + + assert len(per_layer_params) > 0, "No attention layers found in the model." + + param_sets = list(per_layer_params.values()) + global_params = param_sets[0] + for params in param_sets: + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all " + "layers share the same values for the following hyperparameters: " + "`window_left`, `logits_soft_cap`, `sm_scale`.") + + return global_params + + class FlashInferState(AttentionState): def __init__(self, runner): @@ -108,6 +179,11 @@ def __init__(self, runner): self._decode_wrapper = None self._prefill_wrapper = None + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( @@ -215,6 +291,9 @@ def graph_capture_get_metadata_for_batch( batch_size + 1, dtype=torch.int32) + global_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], @@ -238,7 +317,9 @@ def graph_capture_get_metadata_for_batch( q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None) + prefill_wrapper=None, + **dataclasses.asdict(global_params), + ) attn_metadata.begin_forward() return attn_metadata @@ -325,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata): data_type: torch.dtype = None # The data type of the query q_data_type: torch.dtype = None - device: torch.device = torch.device("cuda") + # FlashInfer 0.2 encourages passing host tensors + device: torch.device = torch.device("cpu") is_profile_run: bool = False + # The FlashInfer backend currently supports only models in which all layers + # share the same following hyperparameters: + + # The left (inclusive) window size for the attention window, when + # set to `-1`, the window size will be set to the full length of + # the sequence. Defaults to `-1`. + window_left: int = -1 + # The attention logits soft capping value (used in Gemini, Grok and + # Gemma-2, etc.), if not provided, will be set to `0`. If greater + # than 0, the logits will be capped according to formula: + # $$\texttt{logits\_soft\_cap} \times + # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, + # where $x$ is the input logits. 
+ logits_soft_cap: Optional[float] = None + # The scale used in softmax, if not provided, will be set to + # `1.0 / sqrt(head_dim)`. + sm_scale: Optional[float] = None + def __post_init__(self): # Refer to # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 @@ -363,14 +463,21 @@ def begin_forward(self): self.block_table_bound = self.block_table_bound.to(self.device) self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.end_forward() - self.prefill_wrapper.begin_forward( + self.prefill_wrapper.plan( self.query_start_loc, self.paged_kv_indptr[:self.num_prefills + 1], self.paged_kv_indices, self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, self.num_kv_heads, self.head_dim, - self.page_size) + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.data_type) if self.num_decode_tokens > 0: assert self.paged_kv_indices is not None assert self.paged_kv_indptr is not None @@ -386,8 +493,7 @@ def begin_forward(self): self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) assert self.decode_wrapper is not None - self.decode_wrapper.end_forward() - self.decode_wrapper.begin_forward( + self.decode_wrapper.plan( self.paged_kv_indptr[self.num_prefills:], self.paged_kv_indices, self.paged_kv_last_page_len[self.num_prefills:], @@ -397,8 +503,11 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, # kv-cache data type. - data_type=self.data_type, + kv_data_type=self.data_type, # query data type. 
q_data_type=self.q_data_type) @@ -496,6 +605,11 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.sliding_window = input_builder.sliding_window self.block_size = input_builder.block_size + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] @@ -528,6 +642,20 @@ def prepare(self): self.total_blocks = 0 self.is_profile_run: bool = False + if self.global_hyperparameters is None: + # Infer global hyperparameters, since currently we only support + # models in which all layers share the same values for the + # following hyperparameters: + # - `window_left` + # - `logits_soft_cap` + # - `sm_scale` + inferred_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + self.global_hyperparameters = inferred_params + self.window_left = inferred_params.window_left + self.logits_soft_cap = inferred_params.logits_soft_cap + self.sm_scale = inferred_params.sm_scale + def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -756,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], data_type=kv_cache_dtype, q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run) + is_profile_run=self.is_profile_run, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, + ) class FlashInferImpl(AttentionImpl): @@ -885,25 +1017,34 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( + + assert prefill_meta.prefill_wrapper._causal + assert prefill_meta.prefill_wrapper._window_left == window_left + assert prefill_meta.prefill_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale + + prefill_output = prefill_meta.prefill_wrapper.run( query, kv_cache, - logits_soft_cap=logits_soft_cap, - causal=True, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None assert decode_meta.decode_wrapper is not None - decode_output = decode_meta.decode_wrapper.forward( + + assert decode_meta.decode_wrapper._window_left == window_left + assert decode_meta.decode_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert decode_meta.decode_wrapper._sm_scale == softmax_scale + + decode_output = decode_meta.decode_wrapper.run( decode_query, kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if prefill_output is None and decode_output is not None: # Decode only batch. 
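# [Editor's sketch -- not part of the patch] The FlashInfer changes above move
# from begin_forward()/forward() to the 0.2-style plan()/run() API, pass
# `window_left`, `logits_soft_cap` and `sm_scale` at plan time, and assert the
# wrapper saw the same values at run time. The soft-cap formula quoted in the
# FlashInferMetadata comment, written out as plain torch for reference:
import torch

def soft_capped_logits(logits: torch.Tensor, logits_soft_cap: float) -> torch.Tensor:
    # logits_soft_cap * tanh(x / logits_soft_cap): small logits pass through
    # almost unchanged, large ones saturate near +/- logits_soft_cap.
    return logits_soft_cap * torch.tanh(logits / logits_soft_cap)

x = torch.tensor([0.5, 10.0, 100.0])
print(soft_capped_logits(x, 30.0))  # approx. [0.50, 9.65, 29.92]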
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c25dda7aad2c..49f47f9c8ded3 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -199,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: # Compute some attn_metadata fields which default to None query_start_loc = (None if self.query_start_loc is None else self.query_start_loc[:self.num_prefills + 1]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) slot_mapping = (None if self.slot_mapping is None else self.slot_mapping[:self.num_prefill_tokens]) seq_lens = (None if self.seq_lens is None else @@ -225,6 +227,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 79ea9b666c7e8..962c45a65ae23 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -210,6 +210,9 @@ def __init__( self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + dtype = torch.get_default_dtype() attn_backend = get_attn_backend(head_size, dtype, @@ -221,7 +224,8 @@ def __init__( backend = _Backend.XFORMERS self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, _Backend.XFORMERS + _Backend.TORCH_SDPA, + _Backend.XFORMERS, } else _Backend.TORCH_SDPA def forward( @@ -231,7 +235,7 @@ def forward( value: torch.Tensor, ) -> torch.Tensor: """Input shape: batch_size x seq_len x hidden_size""" - # TODO(Isotr0py): Use existing backend implementations and support FA2 + # TODO(Isotr0py): Use existing backend implementations and support FA3 bsz, q_len, _ = query.size() kv_len = key.size(1) @@ -239,6 +243,11 @@ def forward( key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + if (num_repeat := self.num_queries_per_kv) > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_repeat, dim=2) + value = torch.repeat_interleave(value, num_repeat, dim=2) + if self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index d07481c866940..784e5c1c7b95e 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -1,10 +1,3 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved - -kernels - Builtin high performance attention kernels - -""" - from dataclasses import dataclass import neuronxcc.nki.isa as nisa @@ -49,14 +42,12 @@ def transpose_p_local(p_local_transposed, j_128_slice = nl.ds(j * 128, 128) i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) - # if nisa.get_nc_version() == nisa.nc_version.gen3: - # p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( - # p_local[:, i_j_128_slice], mask=forward_mask) - # else: - # p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( - # p_local[:, i_j_128_slice], mask=forward_mask) - p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( - p_local[:, i_j_128_slice], mask=forward_mask) + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + else: + p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask) @@ -220,7 +211,6 @@ def _flash_attention_core( p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) - transpose_p_local( p_local_transposed=p_local_transposed, p_local=p_local, @@ -232,10 +222,7 @@ def _flash_attention_core( pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE), dtype=np.float32, buffer=nl.psum) - - for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): - pv_psum[:, :] += nl.matmul( p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], v[k_i, :, :], @@ -354,30 +341,21 @@ def flash_paged_attention( """ config = config or FlashConfig() B_F_SIZE = 512 - b, h, d, seqlen_q = query.shape B_P_SIZE = 128 + b, h, d, seqlen_q = query.shape B_D_SIZE = d LARGE_TILE_SZ = config.seq_tile_size - n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine - num_blocks, block_size, k_h, _ = key_cache.shape - - # TODO(gnovack) - remove the hacky padding block from the count - num_blocks = num_blocks - 1 - q_h_per_k_h = h // k_h assert tuple(key_cache.shape) == ( - # TODO(gnovack) - hacky padding block - num_blocks + 1, - # num_blocks, + num_blocks, block_size, k_h, d, ), "Input shape mismatch!" assert tuple(value_cache.shape) == ( - # TODO(gnovack) - hacky padding block - num_blocks + 1, + num_blocks, block_size, k_h, d, @@ -418,8 +396,8 @@ def flash_paged_attention( ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!" 
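# [Editor's sketch -- not part of the patch] Illustrates the MQA/GQA handling
# added to MultiHeadAttention in vllm/attention/layer.py above: when there are
# fewer KV heads than query heads, K/V are repeated along the head dimension so
# every query head has a matching KV head. Shapes follow the .view() calls in
# that hunk; the concrete sizes below are illustrative assumptions.
import torch

bsz, kv_len = 2, 8
num_heads, num_kv_heads, head_size = 32, 8, 64
num_queries_per_kv = num_heads // num_kv_heads            # 4

key = torch.randn(bsz, kv_len, num_kv_heads, head_size)
value = torch.randn(bsz, kv_len, num_kv_heads, head_size)
if num_queries_per_kv > 1:
    key = torch.repeat_interleave(key, num_queries_per_kv, dim=2)
    value = torch.repeat_interleave(value, num_queries_per_kv, dim=2)
assert key.shape == (bsz, kv_len, num_heads, head_size)   # now matches the query heads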
batch_id = nl.program_id(axis=0) head_id = nl.program_id(axis=1) - softmax_scale = softmax_scale or (1.0 / (d**0.5)) + softmax_scale = softmax_scale or (1.0 / (d**0.5)) (num_active_blocks, ) = block_tables.shape context_kv_len = num_active_blocks * block_size @@ -434,6 +412,9 @@ def flash_paged_attention( ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}" num_large_k_tile = context_kv_len // LARGE_TILE_SZ num_blocks_per_large_tile = LARGE_TILE_SZ // block_size + assert (num_blocks_per_large_tile <= B_P_SIZE + ), f"The number of blocks in each large tile " \ + f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}" block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile), 0, @@ -611,7 +592,6 @@ def flash_paged_attention( ) # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # - for i_q_h in nl.affine_range(q_h_per_k_h): for i in nl.affine_range(n_tile_q): out = nl.multiply( @@ -686,4 +666,4 @@ def flash_attn_varlen_nkifunc( return o, *debug_tensors else: o = flash_paged_attention[1, n_kv_head](**kwargs) - return o + return o \ No newline at end of file diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e2f2b66dfc90c..ec3c8459c43ef 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -219,8 +219,8 @@ def _fwd_kernel( float("-inf")) if SLIDING_WINDOW > 0: qk = tl.where( - offs_m[:, None] - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) + offs_m[:, None] - (start_n + offs_n[None, :]) + < SLIDING_WINDOW, qk, -10000) # -- compute m_ij, p, l_ij m_ij = tl.max(qk, 1) @@ -324,10 +324,10 @@ def _fwd_kernel_flash_attn_v2( (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) + q = tl.load(Q + off_q, + mask=offs_m[:, None] + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) # # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") @@ -402,8 +402,8 @@ def _fwd_kernel_flash_attn_v2( # -- compute qk ---- k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -430,8 +430,8 @@ def _fwd_kernel_flash_attn_v2( # update acc v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) p = p.to(v.dtype) @@ -639,8 +639,8 @@ def _fwd_kernel_alibi( k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -677,8 +677,8 @@ def _fwd_kernel_alibi( v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) p = p.to(v.dtype) diff --git a/vllm/attention/ops/triton_flash_attention.py 
b/vllm/attention/ops/triton_flash_attention.py index f94211116a746..ef04603f22b6e 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -627,8 +627,8 @@ def attn_fwd( causal_start_idx, dtype=tl.int32) mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] >= - out_mask_boundary[None, :]) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 81ea6eefb5410..1376274d57777 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,6 +1,6 @@ import os from contextlib import contextmanager -from functools import lru_cache +from functools import cache from typing import Generator, Optional, Type import torch @@ -100,7 +100,7 @@ def get_attn_backend( ) -@lru_cache(maxsize=None) +@cache def _cached_get_attn_backend( head_size: int, dtype: torch.dtype, diff --git a/vllm/config.py b/vllm/config.py index 3df9e9c85d3aa..5c2baa7a34202 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -67,7 +67,8 @@ _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { task: runner - for runner, tasks in _RUNNER_TASKS.items() for task in tasks + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks } HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], @@ -310,14 +311,15 @@ def __init__( (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " - "XFORMERS backend. Disabling sliding window and capping " + f"{backend} backend. Disabling sliding window and capping " "the max length to the sliding window size " f"({sliding_window_len_min}).") self.disable_sliding_window = True @@ -910,12 +912,18 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: "top_k", "top_p", "min_p", + "max_new_tokens", ] if any(p in config for p in available_params): diff_sampling_param = { p: config.get(p) for p in available_params if config.get(p) is not None } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") else: diff_sampling_param = {} return diff_sampling_param @@ -1227,9 +1235,6 @@ class ParallelConfig: pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. tensor_parallel_size: int = 1 # Number of tensor parallel groups. - # Deprecated, use distributed_executor_backend instead. - worker_use_ray: Optional[bool] = None - # Maximum number of multiple batches # when load model sequentially. To avoid RAM OOM when using tensor # parallel and large models. 
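# [Editor's sketch -- not part of the patch] Net effect of the
# get_diff_sampling_param() change above: a Hugging Face generation_config that
# sets `max_new_tokens` now surfaces as vLLM's `max_tokens` default. A minimal
# restatement with an abridged parameter list (example values are assumptions):
def diff_sampling_param(config: dict) -> dict:
    available_params = ["top_k", "top_p", "min_p", "max_new_tokens"]  # abridged
    out = {p: config[p] for p in available_params if config.get(p) is not None}
    # Hugging Face's max_new_tokens is equivalent to vLLM's max_tokens.
    if "max_new_tokens" in out:
        out["max_tokens"] = out.pop("max_new_tokens")
    return out

print(diff_sampling_param({"top_p": 0.9, "max_new_tokens": 256}))
# -> {'top_p': 0.9, 'max_tokens': 256}; per-request max_tokens is then clamped to
#    min(context window, user request, this server-wide limit) by the
#    vllm/entrypoints/openai/protocol.py changes further below in this patch.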
@@ -1283,13 +1288,6 @@ def __post_init__(self) -> None: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size - if self.worker_use_ray: - if self.distributed_executor_backend is None: - self.distributed_executor_backend = "ray" - elif not self.use_ray: - raise ValueError(f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices @@ -1685,7 +1683,8 @@ def maybe_create_spec_config( raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - + if (enable_chunked_prefill and speculative_model == "eagle"): + raise ValueError("Chunked prefill and EAGLE are not compatible.") # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. draft_revision = None @@ -1752,12 +1751,6 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") - if enable_chunked_prefill and draft_hf_config.model_type in ( - "medusa", "mlp_speculator", "eagle"): - raise ValueError( - "Chunked prefill and hidden-state based draft models are " - "not compatible.") - speculative_draft_tensor_parallel_size = \ SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( target_parallel_config, @@ -1981,8 +1974,8 @@ def _verify_args(self) -> None: "typical_acceptance_sampler.") if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " @@ -3173,7 +3166,8 @@ def __post_init__(self): self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1 and not self.model_config.enforce_eager and current_platform.is_neuron(): + if envs.VLLM_USE_V1 and not self.model_config.enforce_eager \ + and current_platform.is_neuron(): self.compilation_config.custom_ops = ["silu_and_mul"] self.compilation_config.use_cudagraph = True self.compilation_config.use_inductor = True @@ -3324,7 +3318,7 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): """ Temporarily set the current VLLM config. Used during model initialization. 
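# [Editor's sketch -- not part of the patch] The SpeculativeConfig change above
# narrows the old blanket restriction: with chunked prefill enabled, only an
# EAGLE draft model is rejected, while the earlier check that also blocked
# medusa and mlp_speculator draft types is removed in the same hunk. Reduced to
# its core:
def check_spec_decode_chunked_prefill(enable_chunked_prefill: bool,
                                      speculative_model: str) -> None:
    if enable_chunked_prefill and speculative_model == "eagle":
        raise ValueError("Chunked prefill and EAGLE are not compatible.")

check_spec_decode_chunked_prefill(True, "medusa")    # no longer rejected here
# check_spec_decode_chunked_prefill(True, "eagle")   # raises ValueError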
@@ -3344,7 +3338,8 @@ def set_current_vllm_config(vllm_config: VllmConfig): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: # If the model supports compilation, # compilation_counter.num_models_seen should be increased diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index c03b5932eafb6..115f663e4ad34 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, - RefCount] = {index: 0 - for index in deduped} + self._refcounts: Dict[BlockId, RefCount] = { + index: 0 + for index in deduped + } def incr(self, block_id: BlockId) -> RefCount: assert block_id in self._refcounts diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 62a5f0bda061a..2d6a132ed555b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -136,8 +136,8 @@ def can_allocate(self, device=Device.GPU) # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): + if (self.num_total_gpu_blocks - num_required_blocks + < self.watermark_blocks): return AllocStatus.NEVER if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: return AllocStatus.OK diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b1630b34947bd..2bb961481e5fe 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -988,8 +988,8 @@ def _schedule_prefills( waiting_queue.popleft() continue - if (budget.num_batched_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (budget.num_batched_tokens + >= self.scheduler_config.max_num_batched_tokens): # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. @@ -1096,8 +1096,8 @@ def _schedule_default(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1189,8 +1189,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: curr_loras, enable_chunking=True) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1358,8 +1358,8 @@ def schedule( # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. 
- if (token_chunk_size + num_computed_tokens < - seqs[0].data.get_len()): + if (token_chunk_size + num_computed_tokens + < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by @@ -1625,10 +1625,9 @@ def _passed_delay(self, now: float) -> bool: if self.scheduler_config.delay_factor > 0 and self.waiting: earliest_arrival_time = min( [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ( - (now - earliest_arrival_time) > - (self.scheduler_config.delay_factor * self.last_prompt_latency) - or not self.running) + passed_delay = ((now - earliest_arrival_time) + > (self.scheduler_config.delay_factor * + self.last_prompt_latency) or not self.running) else: passed_delay = True return passed_delay diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py index 54f659e29b07f..b4c41807a5019 100644 --- a/vllm/distributed/device_communicators/neuron_communicator.py +++ b/vllm/distributed/device_communicators/neuron_communicator.py @@ -5,10 +5,6 @@ if current_platform.is_neuron(): import torch_xla.core.xla_model as xm - import torch_xla.runtime as xr - from torch_xla._internal import pjrt - - from vllm.executor import ray_utils class NeuronCommunicator: diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 4ced991f62f66..268edc0925fe8 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -352,8 +352,8 @@ def acquire_write(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 @@ -410,8 +410,8 @@ def acquire_read(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. 
", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6dee18767813a..a44d78afc6035 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -214,7 +214,8 @@ def __init__( PyNcclCommunicator) self.pynccl_comm: Optional[PyNcclCommunicator] = None - if use_pynccl and self.world_size > 1 and current_platform.is_cuda_alike(): + if use_pynccl and self.world_size > 1 and \ + current_platform.is_cuda_alike(): self.pynccl_comm = PyNcclCommunicator( group=self.cpu_group, device=self.device, @@ -354,9 +355,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # TODO(gnovack) - remove check for is_xla_tensor once sampling is done on-device if self.neuron_communicator is not None and \ - not self.neuron_communicator.disabled and xm.is_xla_tensor(input_): + not self.neuron_communicator.disabled and \ + xm.is_xla_tensor(input_): return self.neuron_communicator.all_reduce(input_) - return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor: @@ -397,7 +398,8 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: group = self.device_group neuron_comm = self.neuron_communicator if neuron_comm is not None and not neuron_comm.disabled: - # TODO(gnovack) - remove check for is_xla_tensor once sampling is done on-device + # TODO(gnovack) - remove check for is_xla_tensor once + # sampling is done on-device if xm.is_xla_tensor(input_): return neuron_comm.all_gather(input_, dim) else: @@ -990,7 +992,8 @@ def init_distributed_environment( world_size=world_size, rank=rank) - # TODO(gnovack) - XLA CC Ops use an unamed process group, so we need to register a group with no name here + # TODO(gnovack) - XLA CC Ops use an unamed process group, + # so we need to register a group with no name here torch._C._distributed_c10d._register_process_group("", torch.distributed.group.WORLD) # set the local rank # local_rank is not available in torch ProcessGroup, @@ -1044,8 +1047,8 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1099,8 +1102,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: return if all([ - vllm_config.kv_transfer_config.need_kv_parallel_group, - _KV_TRANSFER is None + vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER + is None ]): _KV_TRANSFER = kv_transfer.KVTransferAgent( rank=get_world_group().rank, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8f1b0bc5fd62e..ba96484e3fce9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -100,7 +100,6 @@ class EngineArgs: kv_cache_dtype: str = 'auto' seed: int = 0 max_model_len: Optional[int] = None - worker_use_ray: bool = False # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. @@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'to "ray" if Ray is installed and fail otherwise. 
Note that tpu ' 'only supports Ray for distributed inference.') - parser.add_argument( - '--worker-use-ray', - action='store_true', - help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -944,7 +939,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "Defaults to None, will use the default generation config in vLLM. " "If set to 'auto', the generation config will be automatically " "loaded from model. If set to a folder path, the generation config " - "will be loaded from the specified folder path.") + "will be loaded from the specified folder path. If " + "`max_new_tokens` is specified, then it sets a server-wide limit " + "on the number of output tokens for all requests.") parser.add_argument("--enable-sleep-mode", action="store_true", @@ -1071,7 +1068,6 @@ def create_engine_config(self, parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, - worker_use_ray=self.worker_use_ray, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, tokenizer_pool_config=TokenizerPoolConfig.create_config( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7da18d5f7d2eb..ab67ae29723cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1010,8 +1010,23 @@ def _process_model_outputs(self, self.speculative_config # Organize outputs by [step][sequence group] instead of # [sequence group][step]. - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, num_seq_groups=len(seq_group_metadata_list)) + if self.scheduler_config.is_multi_step: + outputs_by_sequence_group = create_output_by_sequence_group( + outputs, len(seq_group_metadata_list)) + elif self.speculative_config: + # Decodes are multi-steps while prefills are not, outputting at + # most 1 token. Separate them so that we can trigger chunk + # processing without having to pad or copy over prompts K times + # to match decodes structure (costly with prompt_logprobs). + num_prefills = sum(sg.is_prompt + for sg in seq_group_metadata_list) + prefills, decodes = outputs[:num_prefills], outputs[ + num_prefills:] + outputs_by_sequence_group = create_output_by_sequence_group( + decodes, + num_seq_groups=len(seq_group_metadata_list) - num_prefills) + outputs_by_sequence_group = [p.outputs for p in prefills + ] + outputs_by_sequence_group # We have outputs for multiple steps submitted in a single burst, # so invalidate is_first_step_output. 
is_first_step_output = None diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index f7ce21d0ae988..b771c190dd82a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Number of emitted tokens.", labelnames=labelnames)) - # Deprecated in favor of vllm:prompt_tokens_total - self.gauge_avg_prompt_throughput = self._gauge_cls( - name="vllm:avg_prompt_throughput_toks_per_s", - documentation="Average prefill throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # Deprecated in favor of vllm:generation_tokens_total - self.gauge_avg_generation_throughput = self._gauge_cls( - name="vllm:avg_generation_throughput_toks_per_s", - documentation="Average generation throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # end-metrics-definitions @@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) - def _log_prometheus_interval(self, prompt_throughput: float, - generation_throughput: float) -> None: - # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on - # the vLLM side. Moving forward, we should use counters like - # counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the - # grafana/prometheus side. See - # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - self.metrics.gauge_avg_prompt_throughput.labels( - **self.labels).set(prompt_throughput) - self.metrics.gauge_avg_generation_throughput.labels( - **self.labels).set(generation_throughput) - def log(self, stats: Stats): """Logs to prometheus and tracked stats every iteration.""" # Log to prometheus. @@ -664,20 +635,6 @@ def log(self, stats: Stats): # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). 
- prompt_throughput = get_throughput(self.num_prompt_tokens, - now=stats.now, - last_log=self.last_local_log) - generation_throughput = get_throughput( - self.num_generation_tokens, - now=stats.now, - last_log=self.last_local_log) - - self._log_prometheus_interval( - prompt_throughput=prompt_throughput, - generation_throughput=generation_throughput) - if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index beedf5d16ab86..723d6e9085806 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -3,7 +3,7 @@ import json from abc import ABC, abstractmethod from collections import defaultdict, deque -from functools import lru_cache, partial +from functools import cache, lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypeVar, Union, cast) @@ -377,7 +377,7 @@ def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @staticmethod - @lru_cache(maxsize=None) + @cache def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1860ed3d7db5a..46b595b0da73c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,6 +5,7 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch import torch.nn as nn from tqdm import tqdm from typing_extensions import TypeVar, deprecated @@ -996,6 +997,107 @@ def classify( return [ClassificationRequestOutput.from_base(item) for item in items] + def _embedding_score( + self, + tokenizer: AnyTokenizer, + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + encoded_output = self.encode( + text_1 + text_2, + use_tqdm=use_tqdm, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + encoded_output_1 = encoded_output[0:len(text_1)] + encoded_output_2 = encoded_output[len(text_1):] + + if len(encoded_output_1) == 1: + encoded_output_1 = encoded_output_1 * len(encoded_output_2) + + output_pairs = [(t1, t2) + for t1, t2 in zip(encoded_output_1, encoded_output_2)] + + scores = [] + scorer = torch.nn.CosineSimilarity(0) + + for embed_1, embed_2 in output_pairs: + pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data) + + if (pad_token_id := getattr(tokenizer, "pad_token_id", + None)) is not None: + tokens = embed_1.prompt_token_ids + [ + pad_token_id + ] + embed_2.prompt_token_ids + else: + tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids + + scores.append( + PoolingRequestOutput( + request_id=f"{embed_1.request_id}_{embed_2.request_id}", + outputs=pair_score, + prompt_token_ids=tokens, + finished=True)) + + items = self.engine_class.validate_outputs(scores, + PoolingRequestOutput) + return [ScoringRequestOutput.from_base(item) for item in items] + + def _cross_encoding_score( + self, + tokenizer: Union[AnyTokenizer], + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + 
lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Score API is only enabled for `--task embed or score`") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + + pooling_params = PoolingParams() + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + parsed_prompts = [] + + for q, t in input_pairs: + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=parsed_prompts, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + items = self.engine_class.validate_outputs(outputs, + PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] + def score( self, text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], @@ -1047,25 +1149,20 @@ def score( raise ValueError(" ".join(messages)) - if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support cross encoding") - if self.llm_engine.model_config.task != "score": - raise ValueError("Score API is only enabled for `--task score`") - - tokenizer = self.llm_engine.get_tokenizer() - - if isinstance(tokenizer, MistralTokenizer): + if self.llm_engine.model_config.task not in ("embed", "score"): raise ValueError( - "MistralTokenizer not supported for cross-encoding") + "Score API is only enabled for `--task embed or --task score`") # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs + tokenizer = self.llm_engine.get_tokenizer() + def ensure_str(prompt: SingletonPrompt): if isinstance(prompt, dict): if "multi_modal_data" in prompt: raise ValueError("Multi-modal prompt is not " - "supported for cross encoding") + "supported for scoring") elif "prompt_token_ids" in prompt: prompt = tokenizer.decode( cast(TokensPrompt, prompt)["prompt_token_ids"]) @@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt): if len(text_2) == 0: raise ValueError("At least one text_pair element must be given") - if len(text_1) == 1: - text_1 = text_1 * len(text_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] - pooling_params = PoolingParams() - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - parsed_prompts = [] - - for q, t in input_pairs: - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) - - self._validate_and_add_requests( - prompts=parsed_prompts, - params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - - outputs = self._run_engine(use_tqdm=use_tqdm) - items = 
self.engine_class.validate_outputs(outputs, - PoolingRequestOutput) - - return [ScoringRequestOutput.from_base(item) for item in items] + if self.llm_engine.model_config.is_cross_encoder: + return self._cross_encoding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, + prompt_adapter_request) + else: + return self._embedding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, prompt_adapter_request) def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f510c41503011..45cf06566faaa 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -56,6 +56,7 @@ PoolingChatRequest, PoolingCompletionRequest, PoolingRequest, PoolingResponse, + RerankRequest, RerankResponse, ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, @@ -68,6 +69,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling +from vllm.entrypoints.openai.serving_rerank import JinaAIServingRerank from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) @@ -306,6 +308,10 @@ def score(request: Request) -> Optional[OpenAIServingScores]: return request.app.state.openai_serving_scores +def rerank(request: Request) -> Optional[JinaAIServingRerank]: + return request.app.state.jinaai_serving_reranking + + def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization @@ -502,6 +508,40 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +@router.post("/rerank") +@with_cancellation +async def do_rerank(request: RerankRequest, raw_request: Request): + handler = rerank(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Rerank (Score) API") + generator = await handler.do_rerank(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, RerankResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/rerank") +@with_cancellation +async def do_rerank_v1(request: RerankRequest, raw_request: Request): + logger.warning( + "To indicate that the rerank API is not part of the standard OpenAI" + " API, we have located it at `/rerank`. Please update your client" + "accordingly. 
(Note: Conforms to JinaAI rerank API)") + + return await do_rerank(request, raw_request) + + +@router.post("/v2/rerank") +@with_cancellation +async def do_rerank_v2(request: RerankRequest, raw_request: Request): + return await do_rerank(request, raw_request) + + TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { "generate": { "messages": (ChatCompletionRequest, create_chat_completion), @@ -512,7 +552,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "default": (EmbeddingCompletionRequest, create_embedding), }, "score": { - "default": (ScoreRequest, create_score), + "default": (RerankRequest, do_rerank) + }, + "rerank": { + "default": (RerankRequest, do_rerank) }, "reward": { "messages": (PoolingChatRequest, create_pooling), @@ -759,6 +802,12 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None + state.jinaai_serving_reranking = JinaAIServingRerank( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger + ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 80403f77d5375..f89c3f42aab17 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -380,13 +380,17 @@ def to_beam_search_params( ) -> BeamSearchParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get( "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) @@ -406,11 +410,16 @@ def to_sampling_params( default_sampling_params: Optional[dict] = None) -> SamplingParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -740,13 +749,17 @@ def to_beam_search_params( default_sampling_params: Optional[dict] = None ) -> BeamSearchParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. 
+ max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get("temperature", 1.0) @@ -764,11 +777,16 @@ def to_sampling_params( logits_processor_pattern: Optional[str], default_sampling_params: Optional[dict] = None) -> SamplingParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -1000,6 +1018,52 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) +class RerankRequest(OpenAIBaseModel): + model: str + query: str + documents: List[str] + top_n: int = Field(default_factory=lambda: 0) + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-rerank-pooling-params + additional_data: Optional[Any] = None + # doc: end-rerank-pooling-params + + # doc: begin-rerank-extra-params + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + # doc: end-rerank-extra-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +class RerankDocument(BaseModel): + text: str + + +class RerankResult(BaseModel): + index: int + document: RerankDocument + relevance_score: float + + +class RerankUsage(BaseModel): + total_tokens: int + + +class RerankResponse(OpenAIBaseModel): + id: str + model: str + usage: RerankUsage + results: List[RerankResult] + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -1219,7 +1283,7 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameters of the request. - body: Union[ChatCompletionRequest, EmbeddingRequest] + body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] class BatchResponseData(OpenAIBaseModel): @@ -1230,7 +1294,8 @@ class BatchResponseData(OpenAIBaseModel): request_id: str # The body of the response. 
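# Illustrative sketch (not part of the diff): the RerankRequest/RerankResponse
# models above describe the JSON exchanged by the new /rerank endpoint. The
# call below assumes a local server started with a cross-encoder model (the
# model name and port are placeholders) and uses the third-party `requests`
# package purely for demonstration.
import requests

payload = {
    "model": "BAAI/bge-reranker-base",  # placeholder cross-encoder model
    "query": "What is the capital of France?",
    "documents": [
        "Paris is the capital of France.",
        "The Eiffel Tower is in Paris.",
        "Berlin is the capital of Germany.",
    ],
    "top_n": 2,  # 0 (the default) returns scores for all documents
}

resp = requests.post("http://localhost:8000/rerank", json=payload)
resp.raise_for_status()
for result in resp.json()["results"]:
    # Results arrive sorted by relevance_score, highest first.
    print(result["index"], result["relevance_score"],
          result["document"]["text"])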
- body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None + body: Optional[Union[ChatCompletionResponse, EmbeddingResponse, + ScoreResponse]] = None class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f8f136f9d5024..37ae23506acea 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -16,12 +16,14 @@ BatchRequestOutput, BatchResponseData, ChatCompletionResponse, - EmbeddingResponse, ErrorResponse) + EmbeddingResponse, ErrorResponse, + ScoreResponse) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) +from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable, tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) - if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)): + if isinstance(response, + (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, @@ -239,6 +242,12 @@ async def main(args): chat_template=None, chat_template_content_format="auto", ) if model_config.task == "embed" else None + openai_serving_scores = (OpenAIServingScores( + engine, + model_config, + openai_serving_models, + request_logger=request_logger, + ) if model_config.task == "score" else None) tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -279,14 +288,28 @@ async def main(args): )) continue + response_futures.append(run_request(handler_fn, request, tracker)) + tracker.submitted() + elif request.url == "/v1/score": + handler_fn = (None if openai_serving_scores is None else + openai_serving_scores.create_score) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Scores API", + )) + continue + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( make_async_error_request_output( request, - error_msg="Only /v1/chat/completions and " - "/v1/embeddings are supported in the batch endpoint.", + error_msg= + "Only /v1/chat/completions, /v1/embeddings, and /v1/score " + "are supported in the batch endpoint.", )) with tracker.pbar(): diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2c9c20caf8119..b0179f78bd635 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -522,11 +522,10 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token( - top_lp[1], - top_lp[0], - tokenizer, - return_as_token_id=self.return_tokens_as_token_ids): + self._get_decoded_token(top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git 
a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 3da447be06430..8d54164e500eb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -26,7 +26,8 @@ DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, ScoreRequest, + ErrorResponse, RerankRequest, + ScoreRequest, TokenizeChatRequest, TokenizeCompletionRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -204,9 +205,9 @@ def _validate_input( token_num = len(input_ids) # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens - if isinstance( - request, - (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)): + if isinstance(request, + (EmbeddingChatRequest, EmbeddingCompletionRequest, + ScoreRequest, RerankRequest)): operation = "score" if isinstance(request, ScoreRequest) \ else "embedding generation" diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py new file mode 100644 index 0000000000000..be4420261afe3 --- /dev/null +++ b/vllm/entrypoints/openai/serving_rerank.py @@ -0,0 +1,206 @@ +import asyncio +from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, + RerankRequest, RerankResponse, + RerankResult, RerankUsage) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.utils import make_async, merge_async_iterators + +logger = init_logger(__name__) + + +class JinaAIServingRerank(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + ) -> None: + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger) + + async def do_rerank( + self, + request: RerankRequest, + raw_request: Optional[Request] = None + ) -> Union[RerankResponse, ErrorResponse]: + """ + Rerank API based on JinaAI's rerank API; implements the same + API interface. Designed for compatibility with off-the-shelf + tooling, since this is a common standard for reranking APIs + + See example client implementations at + https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py + numerous clients use this standard. 
+ """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + model_name = request.model + request_id = f"rerank-{self._base_request_id(raw_request)}" + truncate_prompt_tokens = request.truncate_prompt_tokens + query = request.query + documents = request.documents + request_prompts = [] + engine_prompts = [] + top_n = request.top_n if request.top_n > 0 else len(documents) + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for scoring models") + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "MistralTokenizer not supported for cross-encoding") + + if not self.model_config.is_cross_encoder: + raise ValueError("Model is not cross encoder.") + + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." + f" Please, select a smaller truncation size.") + for doc in documents: + request_prompt = f"{query}{tokenizer.sep_token}{doc}" + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=query, + text_pair=doc, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + + try: + pooling_params = request.to_pooling_params() + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs(request_id_item, + request_prompts[i], + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + result_generator = merge_async_iterators(*generators) + + num_prompts = len(engine_prompts) + + # Non-streaming response + final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch = [None] * num_prompts + + try: + async for i, res in result_generator: + final_res_batch[i] = res + + assert all(final_res is not None for final_res in final_res_batch) + + final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch) + + response = self.request_output_to_rerank_response( + final_res_batch_checked, request_id, model_name, documents, + top_n) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response + + def request_output_to_rerank_response( + self, final_res_batch: List[PoolingRequestOutput], request_id: str, + model_name: str, documents: List[str], + top_n: int) -> RerankResponse: + """ + Convert the output of do_rank to a RerankResponse + """ + results: List[RerankResult] = [] + num_prompt_tokens = 0 + for idx, final_res in enumerate(final_res_batch): + classify_res = ScoringRequestOutput.from_base(final_res) + + result = RerankResult( + index=idx, + document=RerankDocument(text=documents[idx]), + relevance_score=classify_res.outputs.score, + ) + results.append(result) + prompt_token_ids = final_res.prompt_token_ids + num_prompt_tokens += len(prompt_token_ids) + + # sort by relevance, then return the top n if set + results.sort(key=lambda x: x.relevance_score, reverse=True) + if top_n < len(documents): + results = results[:top_n] + + return RerankResponse( + id=request_id, + model=model_name, + results=results, + usage=RerankUsage(total_tokens=num_prompt_tokens)) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 94db8f379e33a..93e357e8b9f21 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -62,8 +62,8 @@ def extract_tool_calls( start_of_json = match.end() # end_index == the start of the next function call # (if exists) - next_function_call_start = (matches[i + 1].start() - if i + 1 < len(matches) else None) + next_function_call_start = (matches[i + 1].start() if i + + 1 < len(matches) else None) raw_function_calls.append( dec.raw_decode( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e6f26d2b74b2f..cdd439d0385b6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -220,8 +220,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is 
not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ].copy_(embeddings_tensor, non_blocking=True) + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part @@ -1024,8 +1026,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ] = embeddings_tensor def _get_logits( self, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b77b6b3d72ff4..2e04cb902d009 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -75,8 +75,9 @@ def __init__( # Scaling factor for long context lora model. None if it is not # fine tuned for the long context. self.scaling_factor = scaling_factor - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" + assert ( + lora_model_id + > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank self.loras: Dict[str, LoRALayerWeights] = loras diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 8af44b703810b..48fa5cd63741f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -136,9 +136,8 @@ def _sgmv_expand_kernel( c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + offset_cn[None, :] * output_d1_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < - (cur_slice_start + curr_N)) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (cur_slice_start + curr_N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 3d2ebe8286f56..9bb35e8ffd323 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -114,8 +114,8 @@ def _sgmv_shrink_kernel( slice_id * output_d0_stride) c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] + < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..b6f1d01f88652 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + 
"waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 4d4b752fa5d64..66f9106bd1be3 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -1,21 +1,21 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -23,10 +23,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, 
"num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -34,10 +34,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -48,7 +48,7 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -56,10 +56,10 @@ }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -67,32 +67,32 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -100,24 +100,24 @@ }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "256": { @@ -129,7 +129,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 128, @@ -150,7 +150,7 @@ "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 32, + "matrix_instr_nonkdim": 16, "kpack": 2 }, "1536": { @@ -184,7 +184,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4096": { "BLOCK_SIZE_M": 128, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..0e5fd1eec77d7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..d6ad63509f157 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 
16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8323f512db015 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index a218fc40642c1..1b46cb5716514 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -1,10 +1,10 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -19,14 +19,14 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, 
+ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -34,76 +34,76 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "24": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "64": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -112,24 +112,24 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 64, @@ -151,7 +151,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -162,7 +162,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2048": { "BLOCK_SIZE_M": 128, @@ -184,7 +184,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4096": { "BLOCK_SIZE_M": 128, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..81bb765d30031 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + 
"1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..811c77ab41093 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..379ca107a9469 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index 3682cc548f352..ed5b655d89937 100644 --- 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -1,21 +1,21 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -23,10 +23,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -34,7 +34,7 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -52,32 +52,32 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, @@ -85,7 +85,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, @@ -101,40 +101,40 @@ "96": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -151,7 +151,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -173,7 +173,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..48bb5f2ccb8e3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..a64d06c6d1724 --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..bd2c6fbc1b941 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index 21742854c613f..822f04e33e879 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -1,7 +1,7 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -12,54 +12,54 @@ }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -68,7 +68,7 @@ "32": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, @@ -78,32 +78,32 @@ }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -112,18 +112,18 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, @@ -140,7 +140,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1024": { "BLOCK_SIZE_M": 128, @@ -151,7 +151,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -173,7 
+173,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -187,7 +187,7 @@ "kpack": 2 }, "4096": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..cd4fb8f11b935 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + 
"waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..cf66868e9d57a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, 
+ "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b2fc2360f47f1..dd2dd02eaf723 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -9,6 +9,7 @@ QuantizationType) from pydantic import BaseModel +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform +logger = init_logger(__name__) + __all__ = ["CompressedTensorsLinearMethod"] SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config" @@ -79,6 +82,8 @@ def get_quant_method( return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) + if scheme is None: + return UnquantizedLinearMethod() layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): @@ -340,10 +345,10 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme( - self, - layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None + ) -> Optional["CompressedTensorsScheme"]: """ compressed-tensors supports non uniform in the following way: @@ -353,10 +358,7 @@ def get_scheme( which can be a full layer_name, a regex for a layer_name, or an nn.Module name. - We first check whether a layer is in the ignore group and use - CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer - - We then detect whether a layer_name is found in any target and + Detect whether a layer_name is found in any target and use the quantization scheme corresponding to the matched target to select the CompressedTensorsScheme used for infernece. """ @@ -394,6 +396,13 @@ def get_scheme( if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, sparsity_scheme=sparsity_scheme): + # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels + # currently produce bad output in some cases + if weight_quant is None: + logger.warning_once( + "CompressedTensors24 scheme is disabled for the w16a16 " + "case. 
Falling back to UnquantizedLinearMethod") + return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel scheme = CompressedTensors24(quantized=weight_quant is not None diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index b04612a9b00d9..915bdc4778929 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -73,12 +73,12 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # w_q - torch.Tensor, # w_s - Optional[torch.Tensor], # w_zp, - Optional[torch.Tensor] # w_gidx - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + Optional[torch.Tensor], # w_zp, + Optional[torch.Tensor] # w_gidx + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 75cf91f191136..c4a83b4faafe6 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -48,13 +48,13 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # weight - torch.Tensor, # weight_scale - Optional[torch.Tensor], # input_scale, - Optional[torch.Tensor], # input_zp - Optional[torch.Tensor], # azp_adj - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # weight + torch.Tensor, # weight_scale + Optional[torch.Tensor], # input_scale, + Optional[torch.Tensor], # input_zp + Optional[torch.Tensor], # azp_adj + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b6882cc7c837c..43b1997019107 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -72,9 +72,10 @@ def block_quant_to_tensor_quant( x_dq_block = x_q_block.to(torch.float32) x_dq_block_tiles = [[ - x_dq_block[j * block_n:min((j + 1) * block_n, n), - i * block_k:min((i + 1) * block_k, k), ] - for i in range(k_tiles) + x_dq_block[ + j * block_n:min((j + 1) * block_n, n), + i * block_k:min((i + 1) * block_k, k), + ] for i in range(k_tiles) ] for j in range(n_tiles)] for i in range(k_tiles): diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 7cdce67cf1677..9977804188a50 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -73,8 +73,8 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. 
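The reformatted slicing in block_quant_to_tensor_quant above decomposes the dequantized (n, k) tensor into a grid of (block_n, block_k) tiles, with ragged tiles at the edges. The same decomposition can be checked standalone; this is a minimal sketch with made-up sizes, not the vLLM function itself:

import torch

n, k, block_n, block_k = 6, 10, 4, 4
x = torch.arange(n * k, dtype=torch.float32).reshape(n, k)
n_tiles = (n + block_n - 1) // block_n   # ceil-div -> 2 row tiles
k_tiles = (k + block_k - 1) // block_k   # ceil-div -> 3 col tiles
tiles = [[
    x[j * block_n:min((j + 1) * block_n, n),
      i * block_k:min((i + 1) * block_k, k)]
    for i in range(k_tiles)
] for j in range(n_tiles)]
# Interior tiles are (4, 4); the bottom-right edge tile is ragged.
print(tiles[1][2].shape)  # torch.Size([2, 2])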
# * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 - unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( - torch.float8_e4m3fn).min) + unfused_module_in_checkpoint = (weight_scale[-1] + > torch.finfo(torch.float8_e4m3fn).min) # If unfused checkpoint, need requanize with the single scale. if unfused_module_in_checkpoint: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b6e640f83971c..2dfecd73a65f9 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -263,7 +263,6 @@ def forward_neuron( offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - from torch_xla.core import xla_model as xm # TODO(gnovack) - handle edge cases if offsets is not None: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c2d12c466ba45..8dc26309d754e 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -716,9 +716,10 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, - List[int]] = {t: [] - for t in SamplingType} + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + t: [] + for t in SamplingType + } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 3eb5c39ccf580..f230efacacdbb 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -115,17 +115,17 @@ def num_elements_padded(self) -> int: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert (self.padded_org_vocab_start_index + <= self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index + <= self.padded_added_vocab_end_index) assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert (self.added_vocab_start_index + <= self.padded_added_vocab_start_index) assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -141,8 +141,8 @@ def get_masked_input_and_mask( added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( input_ < added_vocab_end_index) added_offset = added_vocab_start_index - ( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e9779878710ee..712266ee42639 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -114,7 +114,7 @@ def _initialize_model( all_params = 
[param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -142,7 +142,7 @@ def _initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(**kwargs) @@ -1121,8 +1121,9 @@ def _load_weights(self, model_config: ModelConfig, # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight shard_pos = quant_param_name.find(shard_name) - can_correct_rename = (shard_pos > 0) and ( - quant_param_name[shard_pos - 1] == ".") + can_correct_rename = (shard_pos + > 0) and (quant_param_name[shard_pos - 1] + == ".") # If the quant_param_name is packed, it won't occur in the # param_dict before renaming. new_quant_param_name = quant_param_name.replace( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 5b4757072353f..9266ca75ddaac 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -288,7 +288,8 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? - with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) @@ -297,8 +298,8 @@ def _resize_lora_embeddings(self): to allow for adapter added tokens.""" for child in self.model.modules(): if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] < - child.num_embeddings_per_partition): + and child.weight.shape[0] + < child.num_embeddings_per_partition): new_weight = torch.empty(child.num_embeddings_per_partition, child.embedding_dim, dtype=child.weight.dtype, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 09c5087c2dc36..b559ac677a740 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -481,14 +481,14 @@ def _get_prompt_replacements( bos_token_id = tokenizer.bos_token_id assert isinstance(bos_token_id, int) - image_token_id = vocab["image"] + image_token_id = vocab[""] num_image_tokens = self.info.get_num_image_tokens() image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", + target=[bos_token_id], replacement=PromptReplacementDetails( full=image_tokens + [bos_token_id], features=image_tokens, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6de0c866bc2f0..b23aba829c549 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Gemma model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cache from typing import Iterable, List, Optional, Set, Tuple, Union import torch @@ -48,7 +48,7 @@ logger = init_logger(__name__) -@lru_cache(maxsize=None) +@cache def _get_gemma_act_fn( hidden_act: Optional[str], hidden_activation: Optional[str], diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1656a3cc9e46d..2f1aa2d68653c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "transformer")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: - self.lm_head = self.transformer.wte - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.lm_head") + self.lm_head = self.lm_head.tie_weights(self.transformer.wte) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.startswith("lm_head"): - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. # NOTE: "c_attn.bias" should not be skipped. continue - if not name.startswith("transformer."): + if not name.startswith("transformer.") and not name.startswith( + "lm_head"): name = "transformer." 
+ name if is_pp_missing_parameter(name, self): diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 51296ef0cc08e..cdf9414d5949c 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -348,6 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config # Required by MixtralForCausalLM self.model = GraniteMoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -428,10 +429,10 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w1.weight" % e) + f".block_sparse_moe.experts.{e}.w1.weight") w3_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w3.weight" % e) + f".block_sparse_moe.experts.{e}.w3.weight") w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights assert w3_name not in new_weights @@ -441,7 +442,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', - ".block_sparse_moe.experts.%d.w2.weight" % e) + f".block_sparse_moe.experts.{e}.w2.weight") w2_param = p[e] assert w2_name not in new_weights new_weights[w2_name] = w2_param diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 61baa8e588d74..e15ac84a6049b 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1365,8 +1365,8 @@ def forward( # For 1) text-only prefill and decode, 2) image-present decode. if image_inputs is None: full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( - input_ids.device) + attn_metadata.encoder_seq_lens_tensor + != 0).reshape(-1, 1).to(input_ids.device) skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 # For image-present prefill. 
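The GPT-2 change above always constructs ParallelLMHead and, when tie_word_embeddings is set, ties it to the token embedding via tie_weights, so checkpoint weights named lm_head.* are no longer skipped during loading. A toy plain-PyTorch sketch of the tying idea; ParallelLMHead.tie_weights is assumed to rebind the parameter roughly like this:

import torch
import torch.nn as nn

class ToyLMHead(nn.Module):
    """Stand-in for ParallelLMHead, only to illustrate tie_weights()."""

    def __init__(self, vocab_size: int, hidden_size: int):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(vocab_size, hidden_size))

    def tie_weights(self, embedding: nn.Embedding) -> "ToyLMHead":
        # Share the embedding matrix instead of keeping a separate copy.
        self.weight = embedding.weight
        return self

wte = nn.Embedding(100, 16)
lm_head = ToyLMHead(100, 16).tie_weights(wte)
assert lm_head.weight.data_ptr() == wte.weight.data_ptr()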
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index d49da5f29aa14..f1d796ca26a16 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -81,8 +81,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: if self.tie_weights: assert ( - self.n_predict > - 1), "You cannot tie weights between stages when only 1 exists" + self.n_predict > 1 + ), "You cannot tie weights between stages when only 1 exists" embedding = VocabParallelEmbedding( config.vocab_size, self.inner_dim, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 881c09ea9db99..6367b770a0aff 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): # compute mask for sparsity mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) @@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d2719ca2d00d..8d71b19060bf4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,8 @@ def is_hybrid_model( ModelRegistry = _ModelRegistry({ - model_arch: _LazyRegisteredModel( + model_arch: + _LazyRegisteredModel( module_name=f"vllm.model_executor.models.{mod_relname}", class_name=cls_name, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d577e545a473b..605a0ecf4e0a9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -333,10 +333,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, - info=UltravoxProcessingInfo, - dummy_inputs=UltravoxDummyInputsBuilder - ) +@MULTIMODAL_REGISTRY.register_processor( + UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 43b3c973c97b8..01a232fdc76de 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -599,9 +599,8 @@ def make_empty_intermediate_tensors( device: torch.device, ) -> IntermediateTensors: return IntermediateTensors({ - key: torch.zeros((batch_size, hidden_size), - dtype=dtype, - device=device) + key: + torch.zeros((batch_size, hidden_size), dtype=dtype, device=device) for key in keys }) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84ed4093..61e8881b64f5d 100644 --- 
a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -166,7 +166,8 @@ def prepare( pin_memory=pin_memory, ) categorized_sample_indices = { - t: async_tensor_h2d( + t: + async_tensor_h2d( seq_ids, dtype=torch.int, target_device=device, @@ -198,8 +199,12 @@ def _prepare_seq_groups( device: str, generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, - List[int]], int, ]: +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], + int, +]: """Prepare sequence groups and indices for sampling. Args: diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index e795823bee054..5778107ba0535 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -55,8 +55,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "vllm.worker.neuron_worker.NeuronWorker" - assert (vllm_config.lora_config is - None), "LoRA is not supported for Neuron backend." + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9d711b0debcd8..20063a5b4b085 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -121,8 +121,8 @@ def _raw_min(self) -> Union[int, float]: min_raw = max_raw | sign_bit_double return struct.unpack('!d', struct.pack('!Q', min_raw))[0] else: - assert (not self.is_signed() or - self.size_bits <= 64), "Cannot represent min as a int64_t" + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" if self.is_signed(): return -(1 << (self.size_bits - 1)) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 01b9cdad963da..56fb9ba506a44 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -83,13 +83,13 @@ def score_proposals( if not non_spec_indices: # All sequence groups in batch have spec decoding enabled - contracted = self._contract_batch_all_spec( + return self._contract_batch_all_spec( target_sampler_output=target_sampler_output, proposals=proposals, ) else: # Batch has a mix of spec decode enabled and disabled seq groups - contracted = self._contract_batch( + return self._contract_batch( execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, @@ -99,14 +99,6 @@ def score_proposals( k=execute_model_req.num_lookahead_slots, ) - all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted - return SpeculativeScores( - probs=all_probs, - token_ids=all_tokens, - logprobs=spec_logprobs, - hidden_states=all_hidden_states, - ) - def _expand_batch( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -143,13 +135,57 @@ def _expand_batch( return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) + def _contract_non_speculative( + self, scores: SpeculativeScores, + seq_group_metadata_list: List[SequenceGroupMetadata], + non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, + has_prompt_log: bool) -> SpeculativeScores: + """ + Augment input `scores` with non-speculative requests outputs. + This includes decode requests with speculation turned off, as well + as prefill requests when `enable_chunked_prefill` is set. 
+ For the latter, prefills are further separated into terminal and + non-terminal chunks (from which no token is sampled). + """ + if not non_spec_indices: + return scores + + if has_prompt_log: + # When prompt_logprobs is enabled, prefills yield output token + # (and respective prob) in the last entry (prompt|out): + # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. + # With chunked prefill, non-terminal chunks have -1 on each + # position: they're still picked, but they're discarded later. + seq_meta = seq_group_metadata_list + nospec_sizes = torch.tensor([ + seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 + for i in non_spec_indices + ]) + nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) + else: + # In this case only sampled tokens are returned, select all. + nospec_sampled_token_idxs = list( + range(len(non_spec_outputs.token_ids))) + + scores.token_ids[non_spec_indices, :1] = \ + non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) + scores.probs[non_spec_indices, :1, :] = \ + non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) + scores.logprobs[non_spec_indices, :1, :] = \ + non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) + if scores.hidden_states is not None: + assert non_spec_outputs.hidden_states is not None + scores.hidden_states[non_spec_indices, :1, :] = \ + non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) + return scores + def _contract_batch( - self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - num_scoring_tokens: int, non_spec_indices: List[int], - spec_indices: List[int], k: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + self, + contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, + proposals: SpeculativeProposals, num_scoring_tokens: int, + non_spec_indices: List[int], spec_indices: List[int], + k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -195,23 +231,28 @@ def _contract_batch( else: all_hidden_states = None - # Rule out prefills that produce no tokens. - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - if len(non_spec_indices): - all_tokens[non_spec_indices, :1] = \ - non_spec_target_token_ids.unsqueeze(1) - all_probs[non_spec_indices, :1, :] = \ - non_spec_target_probs.unsqueeze(1) - all_logprobs[non_spec_indices, :1, :] = \ - non_spec_target_logprobs.unsqueeze(1) - if all_hidden_states is not None: - assert non_spec_target_hidden_states is not None - all_hidden_states[non_spec_indices, :1, :] = \ - non_spec_target_hidden_states.unsqueeze(1) - + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in contracted_seq_group_metadata_list) + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is. 
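In the has_prompt_log branch above, each non-speculative prefill chunk contributes token_chunk_size rows to the flattened scorer output and each plain decode contributes one, and a request's sampled token is always the last row of its slice; the cumulative sum minus one recovers exactly those positions. A small standalone check of that arithmetic (made-up chunk sizes):

import torch

# Hypothetical mixed batch: a 4-token prefill chunk, a decode, a 3-token chunk.
nospec_sizes = torch.tensor([4, 1, 3])
nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1)
print(nospec_sampled_token_idxs)  # tensor([3, 4, 7])
# Rows 0-3 belong to request 0, row 4 to request 1, rows 5-7 to request 2;
# 3, 4 and 7 are the last row of each slice, i.e. where the sampled token sits.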
+ prompt_logprobs = None + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + elif not has_prompt_log: + # When prompt logprobs are not to be returned, + # we can ignore non-terminal chunks (no out token). + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + + # "Contract" speculative. if spec_indices: all_tokens[spec_indices] = target_token_ids all_probs[spec_indices] = target_probs @@ -219,14 +260,27 @@ def _contract_batch( if all_hidden_states is not None: all_hidden_states[spec_indices] = target_hidden_states - return all_tokens, all_probs, all_logprobs, all_hidden_states + spec_scores = SpeculativeScores(probs=all_probs, + token_ids=all_tokens, + logprobs=all_logprobs, + hidden_states=all_hidden_states, + prompt_logprobs=prompt_logprobs) + + non_spec_outputs = SpeculativeScores( + probs=non_spec_target_probs, + token_ids=non_spec_target_token_ids, + logprobs=non_spec_target_logprobs, + hidden_states=non_spec_target_hidden_states) + # Contract remaining nonspec entries based on non_spec_indices, if any. + return self._contract_non_speculative( + spec_scores, contracted_seq_group_metadata_list, non_spec_indices, + non_spec_outputs, has_prompt_log) def _contract_batch_all_spec( self, target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + ) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -250,8 +304,11 @@ def _contract_batch_all_spec( target_hidden_states = target_hidden_states.reshape( *target_token_ids.shape, target_hidden_states.shape[-1]) - return (target_token_ids, target_probs, target_logprobs, - target_hidden_states) + return SpeculativeScores(probs=target_probs, + token_ids=target_token_ids, + logprobs=target_logprobs, + hidden_states=target_hidden_states, + prompt_logprobs=None) def _create_scoring_model_input( self, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index a4fe0f13c8db1..c39e98b6cca12 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set, Union +from typing import List, Optional, Set, Union import torch -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, PromptLogprobs from vllm.worker.worker_base import WorkerBase @@ -54,6 +54,10 @@ class SpeculativeScores: # Optional last hidden states from the scoring model. hidden_states: Optional[torch.Tensor] = None + # Scoring model may also return logprobs for prompt tokens + # for each request, when chunked prefill is enabled. 
+ prompt_logprobs: Optional[List[PromptLogprobs]] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index cbf793e2043e3..3aea2eabb4144 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -72,9 +72,15 @@ def score_proposals( target_token_ids = target_sampler_output.sampled_token_ids target_probs = target_sampler_output.sampled_token_probs target_logprobs = target_sampler_output.logprobs + prompt_logprobs = None + # If all requests have the same number of query tokens, we can avoid # the for loop to build output for better performance. if min(all_proposal_lengths) == k: + # Regular decodes only. + assert all(not sg.is_prompt + for sg in target_seq_group_metadata_list + if sg.is_prompt) bs, _ = proposals.proposal_token_ids.shape all_tokens = target_token_ids.reshape(bs, k + 1) all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) @@ -88,19 +94,56 @@ def score_proposals( all_logprobs = target_logprobs.new_full(size=all_probs.shape, fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() - start_loc = 0 - for i, (proposed_len, seq_meta) in enumerate( - zip(all_proposal_lengths, target_seq_group_metadata_list)): + + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is, since it may be + # that n_prompts >> K. + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in target_seq_group_metadata_list) + # TODO (NickLucche) we should surface `disable_logprobs` as to not + # break abstraction to get its value. + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + + # Split loop into prefill|decode for readability. + start_loc, i = 0, 0 + while i < len(target_seq_group_metadata_list + ) and target_seq_group_metadata_list[i].is_prompt: + seq_meta = target_seq_group_metadata_list[i] + end_loc = start_loc + if has_prompt_log: + end_loc += seq_meta.token_chunk_size + elif seq_meta.do_sample: + end_loc += 1 + # Skip chunks with no output tokens. if seq_meta.do_sample: - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + # Get sampled token (last position in chunk) and its prob. + all_tokens[i, 0] = target_token_ids[end_loc - 1] + all_probs[i, 0] = target_probs[end_loc - 1] + all_logprobs[i, 0] = target_logprobs[end_loc - 1] + + i += 1 + start_loc = end_loc + # Decodes. 
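In the prefill loop above, the cursor moves through the flattened scorer output by token_chunk_size per chunk when prompt logprobs are returned (one row per prompt token), otherwise by one row for terminal chunks only, and at most the last row of a chunk is copied out as its sampled token. A hedged walk-through of that bookkeeping with made-up chunk sizes:

# Two prefill chunks scored with prompt logprobs enabled (has_prompt_log case).
chunks = [
    {"token_chunk_size": 5, "do_sample": False},  # non-terminal chunk
    {"token_chunk_size": 3, "do_sample": True},   # terminal chunk
]
start_loc = 0
for i, chunk in enumerate(chunks):
    end_loc = start_loc + chunk["token_chunk_size"]
    if chunk["do_sample"]:
        # The last row of the chunk holds the newly sampled token.
        print(f"prefill {i}: rows {start_loc}..{end_loc - 1}, sampled row {end_loc - 1}")
    else:
        print(f"prefill {i}: rows {start_loc}..{end_loc - 1}, no sampled token")
    start_loc = end_loc
print(f"decodes start reading at row {start_loc}")  # 8 here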
+ while i < len(target_seq_group_metadata_list): + proposed_len, seq_meta = all_proposal_lengths[ + i], target_seq_group_metadata_list[i] + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc + i += 1 hidden_states = None if target_sampler_output.hidden_states is not None: @@ -110,4 +153,5 @@ def score_proposals( return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, - hidden_states=hidden_states) + hidden_states=hidden_states, + prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 0d66ede3d907a..af1c4dfcebbc0 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -510,8 +510,8 @@ def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. - return (execute_model_req.running_queue_size >= - self.disable_by_batch_size) + return (execute_model_req.running_queue_size + >= self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, @@ -563,50 +563,57 @@ def _serialize_sampler_output_no_logprobs( (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - if sg.do_sample # ignore empty token sequences ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with # their own empty output. - for seq_group_meta in execute_model_req.seq_group_metadata_list: - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput(samples=[], - prompt_logprobs=None)) - else: - # Sequence with output. - seq_id, seq_data = seq_data_entries[output_index] - needs_prompt_logprobs = seq_output_prompt_logprobs[ - output_index] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], + for idx, seq_group_meta in enumerate( + execute_model_req.seq_group_metadata_list): + needs_prompt_logprobs = seq_output_prompt_logprobs[idx] + seq_id, seq_data = seq_data_entries[idx] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + + # Some of these sequences may belong to non-terminal chunks, + # which may still have to report logprobs for prompts. 
+ start = 1 if seq_data._num_computed_tokens == 0 \ + else seq_data._num_computed_tokens + end = (seq_data._num_computed_tokens + \ + seq_group_meta.token_chunk_size) + prompt_token_ids = prompt_token_ids[start:end] + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, token_id_logprob_rank=-1, token_id_logprob=0.0, - seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 + ) for p_token_id in prompt_token_ids + ] + else: + prompt_logprobs = None + + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. + if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput( + samples=[], prompt_logprobs=prompt_logprobs)) + continue + + # Sequence with output. + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], + token_id_logprob_rank=-1, + token_id_logprob=0.0, + seq_id=seq_id, + topk_token_ids=[], + topk_logprobs=[], + prompt_logprobs=prompt_logprobs)) + output_index += 1 return [SamplerOutput(outputs=completion_seq_group_output_list)] @@ -624,24 +631,27 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] - # Store hidden states from target model execution. + # Store hidden states from target model execution, BxD. hidden_states = sampler_output.hidden_states if hidden_states is not None: - # remove hidden_states for prompt tokens - # TODO Enable `return_hidden_states`: prefill chunks hidden states - # are pruned by the logits processor. Also, they should be arranged - # back into full-prefill latent. Address it to enable MLPSpeculator. - if any(seq.is_prompt - for seq in execute_model_req.seq_group_metadata_list): + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) hidden_states = hidden_states[ torch.where(sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None: + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): self.previous_hidden_states = HiddenStates( - hidden_states, execute_model_req.seq_group_metadata_list) - else: - self.previous_hidden_states.update( - hidden_states, execute_model_req.seq_group_metadata_list) + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) if not skip_proposer: # We prepare the prefill hidden states here so that there no @@ -752,13 +762,13 @@ def _run_speculative_decoding_step( ] if len(non_spec_indices): all_hidden_states = proposal_scores.hidden_states - # TODO fix `return_hidden_states`, same as in `_run_no_spec` if all_hidden_states is not None: prefill_hidden_states = all_hidden_states[non_spec_indices] execute_model_req.previous_hidden_states = \ prepare_prefill_hidden_states(prefill_hidden_states) # Sync proposer KV cache for prefills. prefill_req = execute_model_req.clone(non_spec_seqs) + # TODO avoid sampling here? 
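The start/end arithmetic above selects exactly the prompt positions a given chunk is responsible for, skipping the very first token of the sequence (which never gets a prompt logprob). A small worked example with a hypothetical 7-token prompt split into chunks of 3:

prompt_token_ids = [101, 5, 9, 23, 7, 42, 8]    # 7-token prompt
token_chunk_size = 3

for num_computed_tokens in (0, 3, 6):           # three prefill chunks
    start = 1 if num_computed_tokens == 0 else num_computed_tokens
    end = num_computed_tokens + token_chunk_size
    print(num_computed_tokens, prompt_token_ids[start:end])
# 0 -> [5, 9]         first chunk, token 101 has no logprob
# 3 -> [23, 7, 42]
# 6 -> [8]            terminal chunk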
self.proposer_worker.execute_model(prefill_req) with Timer() as verification_timer: @@ -774,6 +784,8 @@ def _run_speculative_decoding_step( execute_model_req.seq_group_metadata_list, accepted_token_ids, target_logprobs=target_logprobs, + prompt_logprobs=proposal_scores.prompt_logprobs + if not self._disable_logprobs else None, k=execute_model_req.num_lookahead_slots, stage_times=stage_times) @@ -845,19 +857,32 @@ def _verify_tokens( # metadata. accepted_token_ids[original_indices] = accepted_token_ids.clone() + # B x K+1 x D hidden_states = proposal_scores.hidden_states if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + # Contract hidden states based on accepted tokens hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) - index = accepted_index[:, None, None].expand(-1, 1, hs_size) + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. + hidden_states = hidden_states[ + accepted_index != VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[ + accepted_index != VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( + terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, + hs_size) # b x 1 x d second_last_token_hidden_states = hidden_states[:, -2] # b x d hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_metadata_list, + hidden_states, terminal_metadata, second_last_token_hidden_states) return accepted_token_ids, logprobs @@ -866,6 +891,8 @@ def _create_output_sampler_list( seq_group_metadata_list: List[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] + prompt_logprobs: Optional[ + torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, stage_times: Tuple[float, float, float], ) -> List[SamplerOutput]: @@ -909,15 +936,89 @@ def _create_output_sampler_list( # Construct the output on a per-step, per-sequence basis. # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while + # terminal chunks will only have one generated token at time 0. sampler_output_list: List[SamplerOutput] = [] + + # Prefills are not multi-step (return at most 1 token), in order to + # avoid padding or repetition to fit decodes, we separate them. + for i, sg in enumerate(seq_group_metadata_list): + if not sg.is_prompt: + # Requests are ordered as prefills|decodes=>no more prefills. + break + num_logprobs = num_logprobs_per_seq[i] + seq_kwargs = dict(token_id=-1, + token_id_logprob_rank=0, + token_id_logprob=-float('inf'), + topk_token_ids=[-1] * num_logprobs, + topk_logprobs=[-float('inf')] * num_logprobs, + seq_id=seq_ids[i]) + # Terminal chunk, has token. 
+ if sg.do_sample: + seq_kwargs.update( + dict( + token_id=accepted_token_ids[i][0].item(), + token_id_logprob_rank=accepted_token_id_ranks_by_step[ + 0][i], + token_id_logprob=accepted_token_id_logprobs_by_step[0] + [i], + topk_token_ids=topk_indices_by_step[0][i] + [:num_logprobs], + # output only so step is 0 + topk_logprobs=topk_logprobs_by_step[0][i] + [:num_logprobs], + )) + needs_plogs = (sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + plogs = None + if prompt_logprobs is not None: + # Even non-terminal prompt chunks can have logprobs here. + plogs = prompt_logprobs[i] + elif needs_plogs: + # Prompt logprobs are requested but `_disable_logprobs` is set. + seq_data = next(iter(sg.seq_data.values())) + # Get only the tokens in this chunk! + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_token_ids = prompt_token_ids[ + seq_data. + _num_computed_tokens:seq_data._num_computed_tokens + + sg.token_chunk_size] + + is_first_chunk = seq_data._num_computed_tokens == 0 + # There's no prob generated for the first token in a sequence. + if is_first_chunk: + prompt_token_ids = prompt_token_ids[1:] + plogs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) for p_token_id in prompt_token_ids + ] + seq_kwargs.update(dict(prompt_logprobs=plogs)) + + sampler_output_list.append( + SamplerOutput( + outputs=[create_sequence_group_output( + **seq_kwargs)])) # type: ignore + + # Decodes, create one SamplerOutput per-step (at most K+1). for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): + if all(token_id == -1 for sg, token_id in zip( + seq_group_metadata_list, + accepted_token_ids_by_step[step_index]) + if not sg.is_prompt): break step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): + seq_meta = seq_group_metadata_list[sequence_index] + # Prompts already processed above. + if seq_meta.is_prompt: + continue + # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] step_output_token_ids.append( @@ -952,6 +1053,8 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. self._maybe_log_stage_times(*stage_times) + # First `n_prefills` entries will contain prefills SamplerOutput when + # chunked prefill is enabled, the rest is decodes in multi-step format. 
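
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the ordering assumption behind
# _create_output_sampler_list() above -- requests arrive as prefills followed
# by decodes, so the prefill loop can stop at the first non-prompt entry.
# `FakeSeqGroup` is a made-up stand-in for SequenceGroupMetadata, used only
# for this example.
from dataclasses import dataclass
from typing import List

@dataclass
class FakeSeqGroup:
    is_prompt: bool
    do_sample: bool

def split_prefills_decodes(groups: List[FakeSeqGroup]):
    """Return (prefills, decodes), assuming prefills come first."""
    n_prefills = 0
    for sg in groups:
        if not sg.is_prompt:
            break  # ordered as prefills|decodes => no more prefills
        n_prefills += 1
    return groups[:n_prefills], groups[n_prefills:]

if __name__ == "__main__":
    batch = [FakeSeqGroup(True, False), FakeSeqGroup(True, True),
             FakeSeqGroup(False, True), FakeSeqGroup(False, True)]
    prefills, decodes = split_prefills_decodes(batch)
    print(len(prefills), len(decodes))  # 2 2
# ---------------------------------------------------------------------------
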
return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 5a7999a258b2d..6bf7587cdda19 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -104,11 +104,11 @@ def get_spec_proposals( sampler_transposed=transposed, ) - proposals = SpeculativeProposals( - proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output is None) + proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + no_proposals=maybe_sampler_output + is None) return proposals def _split_by_proposal_len( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da8706658d09a..c88820ab27b69 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -40,13 +40,15 @@ def get_sampled_token_logprobs( """ num_steps, batch_size, vocab_size = logprob_tensor.shape - selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, ] + selected_logprobs = logprob_tensor[ + torch.arange(num_steps).unsqueeze(1), + torch.arange(batch_size), + sampled_token_ids, + ] expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor > - expanded_selected_logprobs).sum(-1).add_(1) + sampled_token_ids_ranks = (logprob_tensor + > expanded_selected_logprobs).sum(-1).add_(1) return sampled_token_ids_ranks, selected_logprobs diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 93fec667d1cf3..1edf36329d83b 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -182,8 +182,8 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len( + self.rope_scaling) != 2: raise ValueError( "`rope_scaling` must be a dictionary with two fields, " f"`type` and `factor`, got {self.rope_scaling}") diff --git a/vllm/utils.py b/vllm/utils.py index 17bffd2846b46..15481fb06e08e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,7 +29,7 @@ from collections import OrderedDict, UserDict, defaultdict from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field -from functools import lru_cache, partial, wraps +from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, @@ -352,7 +352,7 @@ def reset(self): self._index = 0 -@lru_cache(maxsize=None) +@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops @@ -697,7 +697,7 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache(maxsize=None) +@cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None: init_hf_modules() -@lru_cache(maxsize=None) +@cache def find_library(lib_name: str) -> str: """ Find the library file in the system. 
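
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the rank computation used in
# get_sampled_token_logprobs() in vllm/spec_decode/util.py above, on toy-sized
# tensors. The rank of a sampled token is 1 + the number of vocab entries with
# a strictly larger logprob, so the highest-probability token has rank 1.
import torch

def sampled_token_ranks(logprob_tensor: torch.Tensor,
                        sampled_token_ids: torch.Tensor) -> torch.Tensor:
    # logprob_tensor: [num_steps, batch_size, vocab_size]
    # sampled_token_ids: [num_steps, batch_size]
    num_steps, batch_size, _ = logprob_tensor.shape
    selected = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
                              torch.arange(batch_size),
                              sampled_token_ids]          # [steps, batch]
    return (logprob_tensor > selected.unsqueeze(-1)).sum(-1) + 1

if __name__ == "__main__":
    logprobs = torch.log_softmax(torch.randn(2, 3, 8), dim=-1)
    sampled = logprobs.argmax(dim=-1)
    print(sampled_token_ranks(logprobs, sampled))  # all ranks are 1
# ---------------------------------------------------------------------------
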
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module -@lru_cache(maxsize=None) +@cache def get_vllm_optional_dependencies(): metadata = importlib.metadata.metadata("vllm") requirements = metadata.get_all("Requires-Dist", []) diff --git a/vllm/v1/attention/backends/neuron_attn.py b/vllm/v1/attention/backends/neuron_attn.py index c2dd3fdcbf1e1..ed432254c012c 100644 --- a/vllm/v1/attention/backends/neuron_attn.py +++ b/vllm/v1/attention/backends/neuron_attn.py @@ -2,7 +2,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionImpl, AttentionMetadataBuilder, AttentionType +from vllm.attention.backends.abstract import AttentionBackend\ + , AttentionImpl, AttentionMetadataBuilder, AttentionType from vllm.attention.backends.utils import CommonAttentionState diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 8ded5e5787133..7a88cc9433b32 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -247,8 +247,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens + < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -411,6 +411,10 @@ def update_from_output( num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -421,13 +425,15 @@ def update_from_output( cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + # OPTIMIZATION: Avoid list(set) if the set is empty. + if cached_encoder_input_ids: + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6dc68b3a16099..917d52d3220b8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -24,7 +24,8 @@ from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -46,13 +47,15 @@ def __init__( assert start_engine_loop + self.model_config = vllm_config.model_config + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), + PrometheusStatLogger(labels=dict( + model_name=self.model_config.served_model_name)), ] - self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -272,7 +275,7 @@ async def _run_output_handler(self): # 4) Logging. # TODO(rob): make into a coroutine and launch it in - # background thread once we add Prometheus. + # background thread once Prometheus overhead is non-trivial. assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..b84f03fa3267c 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,8 @@ import time from abc import ABC, abstractmethod +from typing import Dict + +import prometheus_client from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -36,3 +39,36 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, labels: Dict[str, str]): + self.labels = labels + + labelnames = self.labels.keys() + labelvalues = self.labels.values() + + self._unregister_vllm_metrics() + + self.gauge_scheduler_running = prometheus_client.Gauge( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats): + """Log to prometheus.""" + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + + @staticmethod + def _unregister_vllm_metrics(): + # Unregister any existing vLLM collectors (for CI/CD + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index acc3a944e21b9..32aee44e3f374 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -8,7 +8,7 @@ class SamplerOutput: # [num_reqs] - sampled_token_ids: List[int] + sampled_token_ids: torch.Tensor # 
[num_reqs, max_num_logprobs + 1] logprob_token_ids: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index eefcdaf29e753..2cfcd8b63ccb2 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -58,7 +58,8 @@ def __init__( # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) - assert len(self.mm_inputs) == len(self.mm_hashes) + if self.mm_hashes: + assert len(self.mm_inputs) == len(self.mm_hashes) # Cache the computed kv block hashes of the request to avoid # recomputing. diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 7cd42ca211a22..9ad665a64894c 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -50,9 +50,8 @@ def forward( # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) - # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( - sampled_token_ids=sampled.tolist(), + sampled_token_ids=sampled, logprob_token_ids=topk_indices, logprobs=topk_logprobs, prompt_logprob_token_ids=None, diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 500bc356fc179..902800e0573bf 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -311,8 +311,8 @@ def output_token_latency_s_lst(self) -> List[float]: return [] latency_s_lst = [] for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] >= - self.output_token_ts_s_lst[i - 1]) + assert (self.output_token_ts_s_lst[i] + >= self.output_token_ts_s_lst[i - 1]) latency_s = (self.output_token_ts_s_lst[i] - self.output_token_ts_s_lst[i - 1]) latency_s_lst.append(latency_s) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4b3c325ded906..a00c00c307335 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -171,7 +171,8 @@ def __init__( # OPTIMIZATION: Cache the tensors rather than creating them every step. self.arange_np = np.arange(max(self.max_num_reqs + 1, - self.max_model_len), + self.max_model_len, + self.max_num_tokens), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should @@ -204,7 +205,7 @@ def __init__( def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. - # Keep the states of the pre-empted requests. + # Keep the states of the preempted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) @@ -358,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get batched arange. # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = np.concatenate( - [self.arange_np[:n] for n in num_scheduled_tokens]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -406,8 +414,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Prepare the attention metadata. 
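
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the batched-arange trick used
# in _prepare_inputs() above, with made-up per-request token counts. The
# cumsum + repeat form avoids building one small arange per request.
import numpy as np

num_scheduled_tokens = np.array([2, 5, 3], dtype=np.int32)
total = int(num_scheduled_tokens.sum())

# Reference (slow) version: one arange per request, then concatenate.
reference = np.concatenate([np.arange(n) for n in num_scheduled_tokens])

# Optimized version: subtract each request's start offset from a flat arange.
cu_num_tokens = np.cumsum(num_scheduled_tokens)            # [2, 7, 10]
offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,  # [0,0,2,2,2,2,2,7,7,7]
                    num_scheduled_tokens)
arange = np.arange(total) - offsets                        # [0,1,0,1,2,3,4,0,1,2]

assert np.array_equal(arange, reference)
print(arange)
# ---------------------------------------------------------------------------
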
self.query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, - out=self.query_start_loc_np[1:num_reqs + 1]) + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens self.seq_lens_np[:num_reqs] = ( self.input_batch.num_computed_tokens_cpu[:num_reqs] + @@ -775,10 +782,10 @@ def execute_model( sampling_metadata=sampling_metadata, ) - sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. num_reqs = self.input_batch.num_reqs + request_seq_lens: List[Tuple[int, CachedRequestState, int]] = [] for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): assert req_id is not None req_state = self.requests[req_id] @@ -787,10 +794,10 @@ def execute_model( assert seq_len <= req_state.num_tokens if seq_len == req_state.num_tokens: # Append the sampled token to the output token ids. - token_id = sampled_token_ids[i] - self.input_batch.token_ids_cpu[i, seq_len] = token_id self.input_batch.num_tokens[i] += 1 - req_state.output_token_ids.append(token_id) + # OPTIMIZATION: Priming the state updates for later updates. + req_state.output_token_ids.append(0) + request_seq_lens.append((i, req_state, seq_len)) else: # Ignore the sampled token from the partial request. # Rewind the generator state as if the token was not sampled. @@ -799,6 +806,21 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + + # NOTE: GPU -> CPU Sync happens here. + # Move as many CPU operations as possible before this sync point. 
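
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the "prime now, fill in after
# the sync" pattern around the note above, with a made-up tensor standing in
# for the sampled token ids. Calling .tolist() on a CUDA tensor forces a
# GPU->CPU sync, so the patch defers it until the CPU-side bookkeeping that
# does not need the token values is done, then patches in the real ids.
import torch

sampled = torch.tensor([11, 22, 33])      # pretend this lives on the GPU
output_token_ids = [[], [], []]

# Phase 1: CPU bookkeeping with placeholders (no sync needed yet).
pending = []
for i in range(len(output_token_ids)):
    output_token_ids[i].append(0)         # placeholder, overwritten below
    pending.append(i)

# Phase 2: single sync point, then overwrite the placeholders.
sampled_list = sampled.tolist()           # the one GPU->CPU transfer
for i in pending:
    output_token_ids[i][-1] = sampled_list[i]

print(output_token_ids)                   # [[11], [22], [33]]
# ---------------------------------------------------------------------------
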
+ sampled_token_ids = sampler_output.sampled_token_ids.tolist() + # Update with the actual token ids + for i, req_state, seq_len in request_seq_lens: + token_id = sampled_token_ids[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids[-1] = token_id + if sampler_output.logprob_token_ids is None: logprob_token_ids = None else: @@ -808,12 +830,6 @@ def execute_model( else: logprobs = sampler_output.logprobs.cpu() - # num_reqs entries should be non-None - assert all( - req_id is not None for req_id in - self.input_batch.req_ids[:num_reqs]), "req_ids contains None" - req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) - model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, diff --git a/vllm/v1/worker/neuron_model_runner.py b/vllm/v1/worker/neuron_model_runner.py index e2ee443f8e074..afc8c223ca846 100644 --- a/vllm/v1/worker/neuron_model_runner.py +++ b/vllm/v1/worker/neuron_model_runner.py @@ -1,4 +1,3 @@ -import gc import time from typing import TYPE_CHECKING, Dict, List, Optional, Tuple @@ -9,17 +8,17 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention -from vllm.config import CompilationLevel, VllmConfig -from vllm.distributed.parallel_state import graph_capture +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingType -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - LayerBlockType, cdiv, is_pin_memory_available) -from vllm.v1.attention.backends.neuron_attn import NeuronAttentionBackend, NeuronAttentionMetadata +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, + LayerBlockType, cdiv) +from vllm.v1.attention.backends.neuron_attn import NeuronAttentionBackend, \ + NeuronAttentionMetadata from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import bind_kv_cache @@ -160,7 +159,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if num_new_blocks == 0: continue start_index = len(req_state.block_ids) - end_index = start_index + num_new_blocks req_state.block_ids.extend(req_data.new_block_ids) self.input_batch.block_table.append_row(req_index, start_index, req_data.new_block_ids) diff --git a/vllm/v1/worker/neuron_worker.py b/vllm/v1/worker/neuron_worker.py index 84e0717fbfe96..623918a7d5747 100644 --- a/vllm/v1/worker/neuron_worker.py +++ b/vllm/v1/worker/neuron_worker.py @@ -1,11 +1,9 @@ """A GPU worker class.""" -import os -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, Optional import torch import torch.distributed import torch_xla.core.xla_model as xm -import torch_xla.runtime as xr from torch_xla._internal.pjrt import initialize_multiprocess from vllm.config import ParallelConfig @@ -20,8 +18,8 @@ logger = init_logger(__name__) -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput +# if TYPE_CHECKING: +# from vllm.v1.core.scheduler import SchedulerOutput class NeuronWorker(Worker): diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 3c570212625c4..aaf9cb40bf2aa 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -173,13 +173,13 @@ def execute_model( cpu_fallback_ctx as 
cpu_fallback_local_metric: output = LocalOrDistributedWorkerBase.execute_model( self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: + if (log_graph_compilation and gc_local_metric.stats()[0][1] + > 0) or log_graph_compilation_all: msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " f"{gc_local_metric.stats()}, {input_stats}") logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] + > 0) or log_cpu_fallbacks_all: msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " f"{cpu_fallback_local_metric.stats()}, {input_stats}") logger.warning(msg) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a3f648f4cc645..8749518284288 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -316,8 +316,8 @@ def warmup_model( logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) num_tokens = batch_size * seq_len - if (num_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (num_tokens + >= self.scheduler_config.max_num_batched_tokens): break seq_len = seq_len * 2 end = time.time() diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c6e6693c54f57..6eeb4aa17051f 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -8,7 +8,8 @@ import torch import torch.nn as nn -from vllm.config import ObservabilityConfig, VllmConfig +from vllm.config import (ObservabilityConfig, VllmConfig, + set_current_vllm_config) from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -498,8 +499,11 @@ def __init__( group. """ self.rpc_rank = rpc_rank - self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None + # do not store this `vllm_config`, `init_worker` will set the final + # one. TODO: investigate if we can remove this field in + # `WorkerWrapperBase`, `init_cached_hf_modules` should be + # unnecessary now. if vllm_config.model_config is not None: # it can be None in tests trust_remote_code = vllm_config.model_config.trust_remote_code @@ -533,6 +537,9 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Arguments are passed to the worker class constructor. """ kwargs = all_kwargs[self.rpc_rank] + self.vllm_config = kwargs.get("vllm_config", None) + assert self.vllm_config is not None, ( + "vllm_config is required to initialize the worker") enable_trace_function_call_for_thread(self.vllm_config) from vllm.plugins import load_general_plugins @@ -546,8 +553,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: bytes) worker_class = cloudpickle.loads( self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(**kwargs) - assert self.worker is not None + with set_current_vllm_config(self.vllm_config): + # To make vLLM config available during worker initialization + self.worker = worker_class(**kwargs) + assert self.worker is not None def execute_method(self, method: Union[str, bytes], *args, **kwargs): try:
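
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the pattern adopted by
# WorkerWrapperBase above -- the wrapper does not keep the config it was
# constructed with, takes the final one from init_worker()'s kwargs, and
# builds the worker inside a "current config" context so the worker can read
# it during __init__. ConfigContext and DummyWorker below are made up for the
# example; vLLM's real set_current_vllm_config is similar in spirit only.
import contextlib
import contextvars
from typing import Any, Dict, List, Optional

_current_config = contextvars.ContextVar("current_config", default=None)

@contextlib.contextmanager
def config_context(cfg: dict):
    token = _current_config.set(cfg)
    try:
        yield
    finally:
        _current_config.reset(token)

class DummyWorker:
    def __init__(self, **kwargs: Any):
        # The worker sees the "current" config while it is being built.
        self.cfg = _current_config.get()

class WorkerWrapper:
    def __init__(self, rpc_rank: int):
        self.rpc_rank = rpc_rank
        self.worker: Optional[DummyWorker] = None

    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
        kwargs = all_kwargs[self.rpc_rank]
        cfg = kwargs.get("config")
        assert cfg is not None, "config is required to initialize the worker"
        with config_context(cfg):
            self.worker = DummyWorker(**kwargs)

if __name__ == "__main__":
    wrapper = WorkerWrapper(rpc_rank=0)
    wrapper.init_worker([{"config": {"model": "toy"}}])
    print(wrapper.worker.cfg)  # {'model': 'toy'}
# ---------------------------------------------------------------------------
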