Merged

Changes from all commits (446 commits)
61e2082
Fall back if flashinfer comm module not found (#20936)
sarckk Jul 14, 2025
8cdc371
SM100 Cutlass MLA decode with unrestricted num_heads (< 128) for Deep…
alexm-redhat Jul 15, 2025
ba8c300
[BugFix] VLLM_DISABLE_COMPILE_CACHE=1 should disable all reads and wr…
zou3519 Jul 15, 2025
bcdfb2a
[Bugfix] Fix incorrect dispatch for CutlassBlockScaledGroupedGemm and…
mgoin Jul 15, 2025
946aadb
[CI/Build] Split Entrypoints Test into LLM and API Server (#20945)
mgoin Jul 15, 2025
d4170fa
Use w8a8 quantized matmul Pallas kernel (#19170)
vanbasten23 Jul 15, 2025
054c865
[Docs] Add Kuberay to deployment integrations (#20592)
crypdick Jul 15, 2025
37e2eca
feat: add image zoom to improve image viewing experience (#20763)
reidliu41 Jul 15, 2025
80305c1
[CI] Fix flaky `test_streaming_response` test (#20913)
NickLucche Jul 15, 2025
016b8d1
Enabled BnB NF4 inference on Gaudi (#20172)
rsshaik1 Jul 15, 2025
9ad0a45
[Bugfix] Switch bailout logic for kv-cache-dtype with SM100 Flashinfe…
pavanimajety Jul 15, 2025
fc01791
[Doc] Clearer mistral3 and pixtral model support description (#20926)
Isotr0py Jul 15, 2025
91b3d19
[cold start] replace VLLM_COMPILE_DEPYF with debug_dump_dir (#20940)
BoyuanFeng Jul 15, 2025
85bd659
[Model] Add AutoWeightsLoader support for BERT, RoBERTa (#20534)
jennifurhe Jul 15, 2025
d4d3094
Implement Async Scheduling (#19970)
WoosukKwon Jul 15, 2025
37a7d5d
[Misc] Refactor AllReduceFusionPass. Remove parameter (#20918)
ilmarkov Jul 15, 2025
68d28e3
[frontend] Add --help=page option for paginated help output (#20961)
reidliu41 Jul 15, 2025
235bfd5
[Docs] Improve documentation for RLHF example (#20598)
crypdick Jul 15, 2025
f148c44
[frontend] Refactor CLI Args for a better modular integration (#20206)
kouroshHakha Jul 15, 2025
33d5600
[Docs] Improve documentation for ray cluster launcher helper script (…
crypdick Jul 15, 2025
c586b55
[TPU] Optimize kv cache update kernel (#20415)
tengyifei Jul 15, 2025
3534c39
[V1] [Hybrid] Refactor mamba state shape calculation; enable V1 via c…
tdoublep Jul 15, 2025
20149d8
[MISC] Add init files for python package (#20908)
Potabk Jul 15, 2025
d912781
[doc] Add more details for Ray-based DP (#20948)
ruisearch42 Jul 15, 2025
56fe4be
[Deprecation] Remove `TokenizerPoolConfig` (#20968)
hmellor Jul 15, 2025
4ffd963
[v1][core] Support for attention free models (#20811)
christian-pinto Jul 15, 2025
e7e3e6d
Voxtral (#20970)
patrickvonplaten Jul 15, 2025
c847e34
[CI/Build] Fix wrong path in Transformers Nightly Models Test (#20994)
DarkLight1337 Jul 15, 2025
313ae8c
[Deprecation] Remove everything scheduled for removal in v0.10.0 (#20…
hmellor Jul 15, 2025
5bac613
Configure Gemini (#20971)
hmellor Jul 15, 2025
1e36c86
[Deprecation] Remove `nullable_kvs` (#20969)
hmellor Jul 15, 2025
b637e9d
Add full serve CLI reference back to docs (#20978)
hmellor Jul 15, 2025
ed10f3c
[ROCm] warpSize is being made non constexpr in ROCm 7.0 (#20330)
gshtras Jul 15, 2025
f29fd8a
[BugFix] fix 3 issues: (1) using metadata for causal-conv1d, (2) inde…
thoangtrvn Jul 15, 2025
19c8630
[Frontend] Support cache_salt in /v1/completions and /v1/responses (#…
dr75 Jul 15, 2025
10be209
[Bug Fix] get_distributed_init_method should get the ip from get_ip i…
Relics Jul 15, 2025
30800b0
[Nvidia] Integrate SM100 cudnn prefill API to MLA prefill (#20411)
elfiegg Jul 16, 2025
34cda77
[Frontend] OpenAI Responses API supports input image (#20975)
chaunceyjiang Jul 16, 2025
153c6f1
[Frontend] Remove print left in FrontendArgs.add_cli_args (#21004)
mgoin Jul 16, 2025
6cbc4d4
[Model] Add ModelConfig class for GraniteMoeHybrid to override defaul…
tdoublep Jul 16, 2025
b5c3b68
[Misc] bump xgrammar version to v0.1.21 (#20992)
chaunceyjiang Jul 16, 2025
75a99b9
[Chore] Remove outdated transformers check (#20989)
b8zhong Jul 16, 2025
fa83956
[Misc] Refactor: Improve argument handling for `conda` command (#20481)
reidliu41 Jul 16, 2025
3ed94f9
[Docs] Enhance Anyscale documentation, add quickstart links for vLLM …
crypdick Jul 16, 2025
fcb9f87
[Bugfix] Correct per_act_token in CompressedTensorsW8A8Fp8MoECutlassM…
minosfuture Jul 16, 2025
7976446
Add Dockerfile argument for VLLM_USE_PRECOMPILED environment (#20943)
dougbtv Jul 16, 2025
e9534c7
[CI][HPU] update for v0 deprecate by switching to VLLM_TARGET_DEVICE=…
xuechendi Jul 16, 2025
f460983
[Bugfix] Fix Mistral3 support on SM100/SM120 (#20998)
mgoin Jul 16, 2025
76ddeff
[Doc] Remove duplicate docstring (#21012)
yewentao256 Jul 16, 2025
cfbcb9e
[Voxtral] Add more tests (#21010)
patrickvonplaten Jul 16, 2025
6ebf313
Avoid direct comparison of floating point numbers (#21002)
maxdebayser Jul 16, 2025
1eb2b9c
[CI] update typos config for CI pre-commit and fix some spells (#20919)
panpan0000 Jul 16, 2025
c11013d
[Meta] Llama4 EAGLE Support (#20591)
morgendave Jul 16, 2025
85431bd
[TPU] fix kv_cache_update kernel block size choosing logic (#21007)
yaochengji Jul 16, 2025
d31a647
[BugFix] Fix import error on non-blackwell machines (#21020)
LucasWilkinson Jul 16, 2025
d0dc4cf
Fix inadvertently silenced PP tests for `mp`, add DeepSeek V2/V3 mode…
eicherseiji Jul 16, 2025
d7309e8
Realign with RHAIIS
tarukumar Jul 16, 2025
260127e
[Docs] Add intro and fix 1-2-3 list in frameworks/open-webui.md (#19199)
windsonsea Jul 16, 2025
1c3198b
[Model] Consolidate pooler implementations (#20927)
DarkLight1337 Jul 16, 2025
18bdcf4
feat - add a new endpoint `get_tokenizer_info` to provide tokenizer/c…
m-misiura Jul 16, 2025
a0f8a79
[fix] fix qwen image_embeds input (#21049)
h-avsha Jul 16, 2025
a931b4c
Remove Qwen Omni workaround that's no longer necessary (#21057)
hmellor Jul 16, 2025
ac2bf41
[Model] Remove model sampler (#21059)
DarkLight1337 Jul 16, 2025
01513a3
Support FP8 Quantization and Inference Run on Intel Gaudi (HPU) using…
nirda7 Jul 16, 2025
72ad273
Remove torch_xla.tpu.version() from pallas.py. (#21065)
QiliangCui Jul 17, 2025
4e7dfbe
Update PyTorch to `torch==2.7.1` for CUDA (#21011)
mgoin Jul 17, 2025
c9ba810
[Bugfix] weight loading use correct tp_group with patch_tensor_parall…
Kevin-XiongC Jul 17, 2025
a50d918
[Docker] Allow FlashInfer to be built in the ARM CUDA Dockerfile (#21…
mgoin Jul 17, 2025
58760e1
[TPU] Start using python 3.12 (#21000)
vanbasten23 Jul 17, 2025
28a6d54
[Bugfix] Fix Machete zero point issue for GPTQ models on SM90 (#21066)
mgoin Jul 17, 2025
76b4944
[Attention] Refactor attention metadata builder interface (#20466)
LucasWilkinson Jul 17, 2025
8a4e5c5
[V1][P/D]Enhance Performance and code readability for P2pNcclConnecto…
Abatom Jul 17, 2025
4fcef49
[V1] [KVConnector] Fix MultiprocExecutor worker output aggregation (#…
sdavidbd Jul 17, 2025
c5b8b59
[Misc] Fix PhiMoE expert mapping (#21085)
jeejeelee Jul 17, 2025
b9b8713
symlink
tarukumar Jul 17, 2025
fdc5b43
[Bugfix]: Fix final_res_batch list index out of range error (#21055)
chaunceyjiang Jul 17, 2025
11dfdf2
[Kernel] DeepGemm MoE : Integrate triton permute / unpermute kernels …
varun-sundar-rabindranath Jul 17, 2025
5a7fb3a
[Model] Add ToolParser and MoE Config for Hunyuan A13B (#20820)
kzjeef Jul 17, 2025
4ef00b5
[VLM] Add Nemotron-Nano-VL-8B-V1 support (#20349)
kylehh Jul 17, 2025
fe8a2c5
[Docs] Improve docstring formatting for `FusedMoEParallelConfig.make`…
hmellor Jul 17, 2025
89e3c4e
[Misc] Avoid unnecessary import (#21106)
wangxiyuan Jul 17, 2025
2d6a382
[Docs] Move code block out of admonition now that it's short (#21118)
hmellor Jul 17, 2025
9fb2d22
[Performance] Performance improvements in non-blockwise fp8 CUTLASS M…
ElizaWszola Jul 17, 2025
90bd2ab
[Model] Update pooling model interface (#21058)
DarkLight1337 Jul 17, 2025
a3a6c69
[Misc] Qwen MoE model supports LoRA (#20932)
jeejeelee Jul 17, 2025
ac9fb73
On environments where numa cannot be detected we get 0 (#21115)
ericcurtin Jul 17, 2025
4de7146
[V0 deprecation] Remove V0 HPU backend (#21131)
WoosukKwon Jul 17, 2025
8a8fc94
[Log] Debugging Log with more Information (#20770)
yewentao256 Jul 18, 2025
8dfb45c
[Bugfix] Fix the tensor non-contiguous issue for Flashinfer TRT-LLM b…
elvischenv Jul 18, 2025
c4e3b12
[Docs] Add minimal demo of Ray Data API usage (#21080)
crypdick Jul 18, 2025
b9a21e9
[Docs] Update supported models documentation with missing models (#20…
luccafong Jul 18, 2025
89cab4d
[Attention] Make local attention backend agnostic (#21093)
LucasWilkinson Jul 18, 2025
b38baab
[Doc] Add inplace weights loading example (#19640)
22quinn Jul 18, 2025
c7d8724
[Core] FlashInfer CUTLASS fused MoE backend (NVFP4) (#20037)
wenscarl Jul 18, 2025
5780121
[Perf] Add swap_ab to SM90 FP8 non-block CUTLASS moe grouped gemm (#2…
shixianc Jul 18, 2025
54cf1ca
[Misc] Do not print async output warning for v1 (#21151)
WoosukKwon Jul 18, 2025
1bf6513
[benchmark] Sending request strictly follows the random intervals (#2…
Jialin Jul 18, 2025
ba2dfbb
[Misc] Make MM embedding merge interface explicit in model runner (#2…
Jul 18, 2025
ca4eb82
[Model] Re-add the implicit conversion feature for as_seq_cls_model (…
noooop Jul 18, 2025
5895afd
[Bugfix] The special_tokens in tokenizer should also be controlled by…
noooop Jul 18, 2025
55ad648
[Doc] Fix typo in model name (#21178)
DarkLight1337 Jul 18, 2025
4adc66f
[Bugfix] Allocate less memory in non-batched CUTLASS MoE (#21121)
ElizaWszola Jul 18, 2025
45badd0
[Core] Set pooling params based on task and model (#21128)
DarkLight1337 Jul 18, 2025
ed8cbfe
Let GraniteMoeAttention use YaRN (#21174)
tdoublep Jul 18, 2025
21274ab
[CI] Update CODEOWNERS for vllm/compilation (#21185)
zou3519 Jul 18, 2025
b2eb2b5
[Kernel] Apply torch.Tag.needs_fixed_stride_order only for torch==2.6…
zou3519 Jul 18, 2025
0f199f1
[Core] Avoid KVCacheBlock.__eq__ invocations in FreeKVCacheBlockQueue…
JialinOuyang-Meta Jul 18, 2025
5782581
[Bugfix] Voxtral on Blackwell GPUs (RTX 50 series) (#21077)
hax0r31337 Jul 18, 2025
2179372
Elastic Expert Parallel Initial Support (#20775)
ruisearch42 Jul 19, 2025
466e878
[Quantization] Enable BNB support for more MoE models (#21100)
jeejeelee Jul 19, 2025
9a9fda1
[Core] Support Local Chunked Attention for Hybrid KV Cache (#19351)
luccafong Jul 19, 2025
9ffe905
[Bugfix][Model] Fix LoRA for Mistral-Small-3.1-24B-Instruct-2503 (#21…
varun-sundar-rabindranath Jul 19, 2025
dd572c0
[V0 Deprecation] Remove V0 Spec Decode workers (#21152)
WoosukKwon Jul 19, 2025
dcc6cfb
[Kernel][Performance] Tweak MoE Batched silu_mul_fp8_quant_deep_gemm …
varun-sundar-rabindranath Jul 19, 2025
468e240
[BugFix][CPU] Fix `TorchSDPABackendImpl` doesn't have `use_irope` (#…
LucasWilkinson Jul 19, 2025
37bd8d6
[Bug] DeepGemm: Fix TypeError: per_block_cast_to_fp8() missing 1 requ…
yewentao256 Jul 19, 2025
3e04107
[Model] EXAONE 4.0 model support (#21060)
Deepfocused Jul 19, 2025
3a2cb26
[Misc][Tools][Benchmark] Add readme file for auto_tune script (#20779)
Chenyaaang Jul 19, 2025
cf8cc32
Fix a couple of Voxtral tests (#21218)
huydhn Jul 19, 2025
1eaff27
[V0 deprecation] Remove long context LoRA (#21169)
jeejeelee Jul 19, 2025
18e519e
[Bugfix] Fix ndarray video color from VideoAsset (#21064)
Isotr0py Jul 19, 2025
59f9353
[BugFix] Fix potential cuda-graph IMA (#21196)
LucasWilkinson Jul 19, 2025
7d94577
Add torch golden impl for moe_align_block_size kernel test (#20653)
shixianc Jul 19, 2025
6d0734c
[NVIDIA] Add SM100 Flashinfer MoE blockscale fp8 backend for low late…
kaixih Jul 19, 2025
b3d8210
[Bugfix][Frontend] Fix openai CLI arg `middleware` (#21220)
22quinn Jul 19, 2025
e3a0e43
[bugfix] Fix auto thread-binding when world_size > 1 in CPU backend a…
bigPYJ1151 Jul 19, 2025
c81259d
Fix/remove some broken model executor tests (#21224)
rabi Jul 19, 2025
da6579b
[CI/CD][bugfix]fix: error argument to loads has incompatible type (#2…
llsj14 Jul 19, 2025
6a971ed
[Docs] Update the link to the 'Prometheus/Grafana' example (#21225)
1195343015 Jul 19, 2025
9f414a1
[BugFix] Make PD work with Ray (#21072)
kouroshHakha Jul 19, 2025
881e3cb
[V1] [Hybrid] Enable piecewise CUDA Graph for mamba layers (#21194)
tdoublep Jul 19, 2025
752c6ad
[V0 Deprecation] Deprecate BlockSparse Attention & Phi3-Small (#21217)
WoosukKwon Jul 19, 2025
2e8cbb5
[BugFix] Fix full cuda graph slot_mapping (#21228)
fhl2000 Jul 19, 2025
10eb24c
GLM-4 Update (#20736)
zRzRzRzRzRzRzR Jul 19, 2025
2b504eb
[Docs] [V1] Update docs to remove enforce_eager limitation for hybrid…
tdoublep Jul 19, 2025
3a1d894
[TPU] support fp8 kv cache quantization (#19292)
yaochengji Jul 20, 2025
d1fb65b
Enable v1 metrics tests (#20953)
eicherseiji Jul 20, 2025
51ba839
[Model] use AutoWeightsLoader for bart (#18299)
calvin0327 Jul 20, 2025
9499e26
[Model] Support VLMs with transformers backend (#20543)
zucchini-nlp Jul 20, 2025
7ba34b1
[bugfix] fix syntax warning caused by backslash (#21251)
1195343015 Jul 20, 2025
8188196
[CI] Cleanup modelscope version constraint in Dockerfile (#21243)
yankay Jul 21, 2025
92615d7
[Docs] Add RFC Meeting to Issue Template (#21279)
simon-mo Jul 21, 2025
940af1f
Add the instruction to run e2e validation manually before release (#2…
huydhn Jul 21, 2025
378d33c
[Bugfix] Fix missing placeholder in logger debug (#21280)
DarkLight1337 Jul 21, 2025
042af0c
[Model][1/N] Support multiple poolers at model level (#21227)
DarkLight1337 Jul 21, 2025
be54a95
[Docs] Fix hardcoded links in docs (#21287)
hmellor Jul 21, 2025
e6b90a2
[Docs] Make tables more space efficient in `supported_models.md` (#21…
hmellor Jul 21, 2025
94edfd3
Realign the template path with RHAIIS (#256)
tarukumar Jul 21, 2025
d978410
[Misc] unify variable for LLM instance (#20996)
andyxning Jul 21, 2025
6b46c4b
Add Nvidia ModelOpt config adaptation (#19815)
Edwardf0t1 Jul 21, 2025
6dda13c
[Misc] Add sliding window to flashinfer test (#21282)
WoosukKwon Jul 21, 2025
a15a50f
[CPU] Enable shared-memory based pipeline parallel for CPU backend (#…
bigPYJ1151 Jul 21, 2025
a0e827e
[BugFix] make utils.current_stream thread-safety (#21252) (#21253)
simpx Jul 21, 2025
6ece16c
[Misc] Add dummy maverick test (#21199)
minosfuture Jul 21, 2025
304dce7
[Attention] Clean up iRoPE in V1 (#21188)
LucasWilkinson Jul 21, 2025
29d1ffc
[DP] Fix Prometheus Logging (#21257)
robertgshaw2-redhat Jul 21, 2025
005ae9b
Fix bad lm-eval fork (#21318)
mgoin Jul 21, 2025
0ec82ed
[perf] Speed up align sum kernels (#21079)
hj-mistral Jul 21, 2025
e57acd9
Sync to upstream's v0.10.0rc1
heyselbi Jul 21, 2025
8d0a01a
[v1][sampler] Inplace logprobs comparison to get the token rank (#21283)
houseroad Jul 21, 2025
25d585a
[XPU] Enable external_launcher to serve as an executor via torchrun (…
chaojun-zhang Jul 22, 2025
5e70dcd
[Doc] Fix CPU doc format (#21316)
bigPYJ1151 Jul 22, 2025
90f1e55
[Intel GPU] Ray Compiled Graph avoid NCCL for Intel GPU (#21338)
ratnampa Jul 22, 2025
e7b2042
Revert "[Performance] Performance improvements in non-blockwise fp8 C…
minosfuture Jul 22, 2025
af376ca
[Core] Minimize number of dict lookup in _maybe_evict_cached_block (#…
Jialin Jul 22, 2025
488d8a9
[V1] [Hybrid] Add new test to verify that hybrid views into KVCacheTe…
tdoublep Jul 22, 2025
6e5b5ca
[Refactor] Fix Compile Warning #1444-D (#21208)
yewentao256 Jul 22, 2025
c17231e
Fix kv_cache_dtype handling for out-of-tree HPU plugin (#21302)
kzawora-intel Jul 22, 2025
8425f78
[Misc] DeepEPHighThroughtput - Enable Inductor pass (#21311)
varun-sundar-rabindranath Jul 22, 2025
e69a92a
[Bug] DeepGemm: Fix Cuda Init Error (#21312)
yewentao256 Jul 22, 2025
9e23ad9
Update fp4 quantize API (#21327)
wenscarl Jul 22, 2025
3779eb8
[Feature][eplb] add verify ep or tp or dp (#21102)
lengrongfu Jul 22, 2025
82b8027
Add arcee model (#21296)
alyosha-swamy Jul 22, 2025
32142b3
[Bugfix] Fix eviction cached blocked logic (#21357)
simon-mo Jul 22, 2025
bc8a8ce
[Misc] Remove deprecated args in v0.10 (#21349)
kebe7jun Jul 22, 2025
a322376
[Core] Optimize update checks in LogitsProcessor (#21245)
Jialin Jul 22, 2025
10904e6
[benchmark] Port benchmark request sent optimization to benchmark_ser…
Jialin Jul 22, 2025
ed25054
[Core] Introduce popleft_n and append_n in FreeKVCacheBlockQueue to f…
Jialin Jul 22, 2025
0df4d9b
[Misc] unify variable for LLM instance v2 (#21356)
andyxning Jul 22, 2025
4fb5691
[perf] Add fused MLA QKV + strided layernorm (#21116)
mickaelseznec Jul 22, 2025
2c8db17
[feat]: add SM100 support for cutlass FP8 groupGEMM (#20447)
djmmoss Jul 22, 2025
774d0c0
[Perf] Cuda Kernel for Per Token Group Quant (#21083)
yewentao256 Jul 22, 2025
320b605
Sync multiple directories and files with v0.10.0rc1 tag
heyselbi Jul 22, 2025
b194557
Adds parallel model weight loading for runai_streamer (#21330)
bbartels Jul 22, 2025
f38ee34
[feat] Enable mm caching for transformers backend (#21358)
zucchini-nlp Jul 22, 2025
226b452
Revert "[Refactor] Fix Compile Warning #1444-D (#21208)" (#21384)
yewentao256 Jul 22, 2025
44554a0
Add tokenization_kwargs to encode for embedding model truncation (#21…
Receiling Jul 22, 2025
2226d5b
[Bugfix] Decode Tokenized IDs to Strings for `hf_processor` in `llm.c…
ariG23498 Jul 22, 2025
35366ae
[CI/Build] Fix test failure due to updated model repo (#21375)
DarkLight1337 Jul 22, 2025
ae268b6
Fix Flashinfer Allreduce+Norm enable disable calculation based on `fi…
xinli-sw Jul 22, 2025
4594fc3
[Model] Add Qwen3CoderToolParser (#21396)
ranpox Jul 22, 2025
35bc8bd
[Misc] Copy HF_TOKEN env var to Ray workers (#21406)
ruisearch42 Jul 22, 2025
b77c7d3
[BugFix] Fix ray import error mem cleanup bug (#21381)
joerunde Jul 22, 2025
c401c64
[CI/Build] Fix model executor tests (#21387)
DarkLight1337 Jul 23, 2025
3ec7170
[Bugfix][ROCm][Build] Fix build regression on ROCm (#21393)
gshtras Jul 23, 2025
f154bb9
Simplify weight loading in Transformers backend (#21382)
hmellor Jul 23, 2025
4f76a05
[BugFix] Update python to python3 calls for image; fix prefix & input…
ericehanley Jul 23, 2025
08d2bd7
[BUGFIX] deepseek-v2-lite failed due to fused_qkv_a_proj name update …
xuechendi Jul 23, 2025
2dec7c1
[Bugfix][CUDA] fixes CUDA FP8 kv cache dtype supported (#21420)
elvischenv Jul 23, 2025
107111a
Changing "amdproduction" allocation. (#21409)
Alexei-V-Ivanov-AMD Jul 23, 2025
4ecedd1
[Bugfix] Fix nightly transformers CI failure (#21427)
Isotr0py Jul 23, 2025
a1f3610
[Core] Add basic unit test for maybe_evict_cached_block (#21400)
Jialin Jul 23, 2025
f002e9a
[Cleanup] Only log MoE DP setup warning if DP is enabled (#21315)
mgoin Jul 23, 2025
2f5c14d
add clear messages for deprecated models (#21424)
youkaichao Jul 23, 2025
7aaa2bd
[Bugfix] ensure tool_choice is popped when `tool_choice:null` is pass…
gcalmettes Jul 23, 2025
6364af9
Fixed typo in profiling logs (#21441)
sergiopaniego Jul 23, 2025
23637dc
[Docs] Fix bullets and grammars in tool_calling.md (#21440)
windsonsea Jul 23, 2025
accac82
[Sampler] Introduce logprobs mode for logging (#21398)
houseroad Jul 23, 2025
32ec9e2
Mamba V2 Test not Asserting Failures. (#21379)
fabianlim Jul 23, 2025
6929f8b
[Misc] fixed nvfp4_moe test failures due to invalid kwargs (#21246)
chenyang78 Jul 23, 2025
2cc5016
[Docs] Clean up v1/metrics.md (#21449)
windsonsea Jul 23, 2025
2671334
[Model] add Hunyuan V1 Dense Model support. (#21368)
kzjeef Jul 23, 2025
f59ec35
[V1] Check all pooling tasks during profiling (#21299)
DarkLight1337 Jul 23, 2025
7c734ee
[Bugfix][Qwen][DCA] fixes bug in dual-chunk-flash-attn backend for qw…
sighingnow Jul 23, 2025
316b1bf
[Tests] Add tests for headless internal DP LB (#21450)
njhill Jul 23, 2025
8560a5b
[Core][Model] PrithviMAE Enablement on vLLM v1 engine (#20577)
christian-pinto Jul 23, 2025
4ac7713
Add test case for compiling multiple graphs (#21044)
sarckk Jul 23, 2025
14bf19e
[TPU][TEST] Fix the downloading issue in TPU v1 test 11. (#21418)
QiliangCui Jul 23, 2025
5c9b807
[Core] Add `reload_weights` RPC method (#20096)
22quinn Jul 23, 2025
78c13e3
[V1] Fix local chunked attention always disabled (#21419)
sarckk Jul 23, 2025
82ec66f
[V0 Deprecation] Remove Prompt Adapters (#20588)
mgoin Jul 23, 2025
f3137cd
[Core] Freeze gc during cuda graph capture to speed up init (#21146)
mgoin Jul 24, 2025
11599b0
feat(gguf_loader): accept HF repo paths & URLs for GGUF (#20793)
hardikkgupta Jul 24, 2025
63d92ab
[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hard…
deven-labovitch Jul 24, 2025
772ce5a
[Misc] Add dummy maverick test to CI (#21324)
minosfuture Jul 24, 2025
13e4ee1
[XPU][UT] increase intel xpu CI test scope (#21492)
Liangliang-Ma Jul 24, 2025
aa08a95
[Bugfix] Fix casing warning (#21468)
MatthewBonanni Jul 24, 2025
f8c15c4
[Bugfix] Fix example disagg_example_p2p_nccl_xpyd.sh zombie process (…
david6666666 Jul 24, 2025
fd48d99
[BugFix]: Batch generation from prompt_embeds fails for long prompts …
KazusatoOoko Jul 24, 2025
eec6942
[BugFix] Fix KVConnector TP worker aggregation (#21473)
njhill Jul 24, 2025
d5b981f
[DP] Internal Load Balancing Per Node [`one-pod-per-node`] (#21238)
robertgshaw2-redhat Jul 24, 2025
dc2f159
Dump input metadata on crash for async scheduling (#21258)
WoosukKwon Jul 24, 2025
11ef7a6
[BugFix] Set CUDA_VISIBLE_DEVICES before spawning the subprocesses (#…
yinghai Jul 24, 2025
6d8d0a2
Add think chunk (#21333)
juliendenize Jul 24, 2025
8d0d798
Merge branch 'v0.10.0-tag' into sync-v0.10.0rc1
heyselbi Jul 25, 2025
0dd9ef1
INFERENG-1235: Sync v0.10.0 (#257)
heyselbi Jul 30, 2025
a2cffea
Fix CUDA permute/unpermute for use with DeepGemm Moe (#17934)
CalebDu Jul 27, 2025
a5caa42
[Bugfix][ROCm] Fix for warp_size uses on host (#21205)
gshtras Jul 24, 2025
297ea4e
cherry pick fix for hf[xet]
hmellor Aug 7, 2025
f531f02
Merge branch 'v0.10.0-midstream' into rhoai-main
heyselbi Aug 8, 2025
b89b5d8
Add Daniele's commits and upgrade nccl
heyselbi Aug 8, 2025
01e9d83
Update tgis adapter to 0.8.0
heyselbi Aug 8, 2025
d3548b7
remove VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
heyselbi Aug 8, 2025
7053645
update vllm version
heyselbi Aug 11, 2025
9d8b4d6
[TEMP] Make openai entrypoint default and add template files
heyselbi Aug 11, 2025
f777d8d
Correct chown
heyselbi Aug 11, 2025
73b8071
update template to examples directory
heyselbi Aug 11, 2025
efe58e0
install blobfile for Kimi model support
heyselbi Aug 11, 2025
32bb10d
fix the typo
heyselbi Aug 11, 2025
6acccf0
add nccl upgrade to tgis adapter
heyselbi Aug 15, 2025
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done

lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE"
.buildkite/lm-eval-harness/test_lm_eval_correctness.py (4 changes: 3 additions & 1 deletion)
@@ -18,12 +18,14 @@

def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
+ max_model_len = eval_config.get("max_model_len", 4096)
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code}"
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}"
)
results = lm_eval.simple_evaluate(
model="vllm",
.buildkite/scripts/hardware_ci/run-amd-test.sh (1 change: 0 additions & 1 deletion)
@@ -108,7 +108,6 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
- --ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
.buildkite/scripts/hardware_ci/run-cpu-test.sh (50 changes: 25 additions & 25 deletions)
@@ -6,6 +6,7 @@ set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
+ # used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

@@ -24,8 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
- docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
- docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+ docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+ docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

function cpu_tests() {
set -e
@@ -48,10 +49,16 @@ function cpu_tests() {
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
- pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
- pytest -v -s tests/models/language/generation -m cpu_model
- VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+ # Note: disable until supports V1
+ # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+ # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+ # Note: disable Bart until supports V1
+ pytest -v -s tests/models/language/generation -m cpu_model \
+ --ignore=tests/models/language/generation/test_bart.py
+ VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+ --ignore=tests/models/language/generation/test_bart.py
+
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,33 +69,26 @@
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

+ # Note: disable it until supports V1
# Run AWQ test
- docker exec cpu-test-"$NUMA_NODE" bash -c "
- set -e
- VLLM_USE_V1=0 pytest -s -v \
- tests/quantization/test_ipex_quant.py"
-
- # Run chunked-prefill and prefix-cache test
- docker exec cpu-test-"$NUMA_NODE" bash -c "
- set -e
- pytest -s -v -k cpu_model \
- tests/basic_correctness/test_chunked_prefill.py"
+ # docker exec cpu-test-"$NUMA_NODE" bash -c "
+ # set -e
+ # VLLM_USE_V1=0 pytest -s -v \
+ # tests/quantization/test_ipex_quant.py"

# online serving
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
- python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
- timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
- VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
- --model facebook/opt-125m \
+ --model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
- --endpoint /v1/completions \
- --tokenizer facebook/opt-125m"
+ --endpoint /v1/completions'

# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
.buildkite/scripts/hardware_ci/run-hpu-test.sh (8 changes: 3 additions & 5 deletions)
@@ -6,19 +6,17 @@ set -exuo pipefail

# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
- FROM 1.22-413-pt2.7.1:latest
+ FROM gaudi-base-image:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

- RUN pip install -v -r requirements/hpu.txt
- RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

- RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+ RUN VLLM_TARGET_DEVICE=empty pip install .
+ RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh (7 changes: 4 additions & 3 deletions)
@@ -62,15 +62,16 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"

echo "--- Hardware Information ---"
- tpu-info
+ # tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
@@ -150,7 +151,7 @@ run_and_track_test 9 "test_multimodal.py" \
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
.buildkite/scripts/hardware_ci/run-xpu-test.sh (19 changes: 15 additions & 4 deletions)
@@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
- remove_docker_container() {
- docker rm -f "${container_name}" || true;
+ remove_docker_container() {
+ docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
@@ -26,7 +26,18 @@ docker run \
--name "${container_name}" \
"${image_name}" \
sh -c '
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+ VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ cd tests
+ pytest -v -s v1/core
+ pytest -v -s v1/engine
+ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
+ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
+ pytest -v -s v1/structured_output
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
+ pytest -v -s v1/test_serial_utils.py
+ pytest -v -s v1/test_utils.py
+ pytest -v -s v1/test_metrics_reader.py
'
.buildkite/scripts/tpu/docker_run_bm.sh (10 changes: 0 additions & 10 deletions)
@@ -22,16 +22,6 @@ trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

- # Build docker image.
- # TODO: build the image outside the script and share the image with other
- # tpu test if building time is too long.
- DOCKER_BUILDKIT=1 docker build \
- --build-arg max_jobs=16 \
- --build-arg USE_SCCACHE=1 \
- --build-arg GIT_REPO_CHECK=0 \
- --tag vllm/vllm-tpu-bm \
- --progress plain -f docker/Dockerfile.tpu .
-
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"