Commit a4971e4

Author: Pradyun Ramadorai (committed)
Merge PR vllm-project#24957: Fix varlen issue in Qwen3-Next MTP implementation
Fixes CUDA illegal memory access errors during Qwen3-Next speculative decoding by implementing proper varlen sequence handling and CUDA graph batch size fixes.

Key changes from upstream PR vllm-project#24957:
- Enhanced GDNAttentionMetadata with num_actual_tokens field
- Fixed CUDA graph batch size calculation for speculative decoding scenarios
- Added varlen sequence support to causal_conv1d operations
- Improved token accounting across MTP verification paths

Resolves issues with:
- Multi-token prediction verification with unaligned speculative tokens
- Variable-length sequence processing in continuous batching
- CUDA memory allocation errors in graph capture

Co-authored-by: upstream contributors from PR vllm-project#24957
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
2 parents fec361a + b667c7e commit a4971e4
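
For orientation, the heart of the varlen fix can be sketched as follows. GDNAttentionMetadata and num_actual_tokens are named in the commit message above; the field layout, the build_metadata helper, and the padding scheme below are illustrative assumptions, not the actual vLLM implementation.

# Hedged sketch of the varlen token accounting described in the commit
# message. With speculative decoding, each request may schedule a variable
# number of tokens (verified + draft), so kernels need a cu_seqlens-style
# prefix-sum array, and CUDA-graph replay needs a padded batch size tracked
# separately from the number of tokens actually scheduled.
from dataclasses import dataclass

import torch


@dataclass
class GDNAttentionMetadata:
    num_actual_tokens: int    # tokens really scheduled this step (named in the commit)
    cu_seqlens: torch.Tensor  # (num_seqs + 1,) prefix sum of per-sequence token counts


def build_metadata(seq_lens: list[int], cudagraph_batch_sizes: list[int]):
    """Build varlen metadata and pick a padded size for CUDA-graph replay.

    Hypothetical helper: vLLM's real metadata builders differ.
    """
    cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32)
    cu_seqlens[1:] = torch.cumsum(torch.tensor(seq_lens, dtype=torch.int32), dim=0)
    num_actual_tokens = int(cu_seqlens[-1])
    # A captured CUDA graph replays only at fixed sizes; pad the token count
    # up to the smallest captured size rather than assuming one token per
    # request, which breaks once spec-decode schedules extra draft tokens.
    padded_size = next(s for s in sorted(cudagraph_batch_sizes) if s >= num_actual_tokens)
    return GDNAttentionMetadata(num_actual_tokens, cu_seqlens), padded_size

A varlen-aware kernel such as the causal_conv1d path would then slice its input via cu_seqlens and stop at num_actual_tokens, ignoring the padded tail instead of reading past the real data (the reported illegal memory accesses).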

File tree: 140 files changed, +3932 −1401 lines


.buildkite/nightly-benchmarks/nightly-descriptions.md

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 

.buildkite/test-pipeline.yaml

Lines changed: 66 additions & 12 deletions

@@ -571,44 +571,98 @@ steps:
 
 ##### models test #####
 
-- label: Basic Models Test # 57min
-  timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models
+  - tests/models/test_initialization.py
   commands:
-  - pytest -v -s models/test_transformers.py
-  - pytest -v -s models/test_registry.py
-  - pytest -v -s models/test_utils.py
-  - pytest -v -s models/test_vision.py
-  - pytest -v -s models/test_initialization.py
+  # Run a subset of model initialization tests
+  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
 
-- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Extra Initialization) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+  # Only when vLLM model source is modified - test initialization of a large
+  # subset of supported models (the complement of the small subset in the above
+  # test.) Also run if model initialization test file is modified
+  - pytest -v -s models/test_initialization.py \
+    -k 'not test_can_initialize_small_subset' \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  commands:
+  - pytest -v -s models/test_transformers.py \
+    models/test_registry.py \
+    models/test_utils.py \
+    models/test_vision.py
+
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language
   commands:
+  # Test standard language models, excluding a subset of slow tests
   - pip freeze | grep -E 'torch'
-  - pytest -v -s models/language -m core_model
+  - pytest -v -s models/language -m 'core_model and (not slow_test)'
 
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+  # Shard slow subset of standard language models tests. Only run when model
+  # source is modified, or when specified test files are modified
+  - pip freeze | grep -E 'torch'
+  - pytest -v -s models/language -m 'core_model and slow_test' \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language/generation
   commands:
   # Install fast path packages for testing against transformers
   # Note: also needed to run plamo2 model in vLLM
   - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
   - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-  - pytest -v -s models/language/generation -m hybrid_model
+  # Shard hybrid language model tests
+  - pytest -v -s models/language/generation \
+    -m hybrid_model \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
.github/CODEOWNERS

Lines changed: 4 additions & 1 deletion

@@ -101,4 +101,7 @@ mkdocs.yaml @hmellor
 /vllm/v1/worker/tpu* @NickLucche
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
-/vllm/tests/v1/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
.github/workflows/bc-lint.yml

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,8 @@ on:
       - opened
       - synchronize
       - reopened
+      - labeled
+      - unlabeled
 
 jobs:
   bc_lint:

CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")

README.md

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
