Skip to content

Commit 7d26f48

Browse files
committed
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-forward-structured-outputs
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2 parents 13b6c19 + c9c1b59 commit 7d26f48

File tree

97 files changed

+2298
-623
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

97 files changed

+2298
-623
lines changed

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 44 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -75,37 +75,51 @@ HF_MOUNT="/root/.cache/huggingface"
7575
commands=$@
7676
echo "Commands:$commands"
7777
#ignore certain kernels tests
78-
if [[ $commands == *" kernels "* ]]; then
78+
if [[ $commands == *" kernels/core"* ]]; then
7979
commands="${commands} \
80-
--ignore=kernels/test_attention_selector.py \
81-
--ignore=kernels/test_blocksparse_attention.py \
82-
--ignore=kernels/test_causal_conv1d.py \
83-
--ignore=kernels/test_cutlass.py \
84-
--ignore=kernels/test_encoder_decoder_attn.py \
85-
--ignore=kernels/test_flash_attn.py \
86-
--ignore=kernels/test_flashinfer.py \
87-
--ignore=kernels/test_int8_quant.py \
88-
--ignore=kernels/test_machete_gemm.py \
89-
--ignore=kernels/test_mamba_ssm.py \
90-
--ignore=kernels/test_marlin_gemm.py \
91-
--ignore=kernels/test_moe.py \
92-
--ignore=kernels/test_prefix_prefill.py \
93-
--ignore=kernels/test_rand.py \
94-
--ignore=kernels/test_sampler.py \
95-
--ignore=kernels/test_cascade_flash_attn.py \
96-
--ignore=kernels/test_mamba_mixer2.py \
97-
--ignore=kernels/test_aqlm.py \
98-
--ignore=kernels/test_machete_mm.py \
99-
--ignore=kernels/test_mha_attn.py \
100-
--ignore=kernels/test_block_fp8.py \
101-
--ignore=kernels/test_cutlass_moe.py \
102-
--ignore=kernels/test_mamba_ssm_ssd.py \
103-
--ignore=kernels/test_attention.py \
104-
--ignore=kernels/test_block_int8.py \
105-
--ignore=kernels/test_fused_quant_layernorm.py \
106-
--ignore=kernels/test_int8_kernel.py \
107-
--ignore=kernels/test_triton_moe_ptpc_fp8.py \
108-
--ignore=kernels/test_permute_cols.py"
80+
--ignore=kernels/core/test_fused_quant_layernorm.py \
81+
--ignore=kernels/core/test_permute_cols.py"
82+
fi
83+
84+
if [[ $commands == *" kernels/attention"* ]]; then
85+
commands="${commands} \
86+
--ignore=kernels/attention/stest_attention_selector.py \
87+
--ignore=kernels/attention/test_blocksparse_attention.py \
88+
--ignore=kernels/attention/test_encoder_decoder_attn.py \
89+
--ignore=kernels/attention/test_attention_selector.py \
90+
--ignore=kernels/attention/test_flash_attn.py \
91+
--ignore=kernels/attention/test_flashinfer.py \
92+
--ignore=kernels/attention/test_prefix_prefill.py \
93+
--ignore=kernels/attention/test_cascade_flash_attn.py \
94+
--ignore=kernels/attention/test_mha_attn.py \
95+
--ignore=kernels/attention/test_lightning_attn.py \
96+
--ignore=kernels/attention/test_attention.py"
97+
fi
98+
99+
if [[ $commands == *" kernels/quantization"* ]]; then
100+
commands="${commands} \
101+
--ignore=kernels/quantization/test_int8_quant.py \
102+
--ignore=kernels/quantization/test_aqlm.py \
103+
--ignore=kernels/quantization/test_machete_mm.py \
104+
--ignore=kernels/quantization/test_block_fp8.py \
105+
--ignore=kernels/quantization/test_block_int8.py \
106+
--ignore=kernels/quantization/test_marlin_gemm.py \
107+
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
108+
--ignore=kernels/quantization/test_int8_kernel.py"
109+
fi
110+
111+
if [[ $commands == *" kernels/mamba"* ]]; then
112+
commands="${commands} \
113+
--ignore=kernels/mamba/test_mamba_mixer2.py \
114+
--ignore=kernels/mamba/test_causal_conv1d.py \
115+
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
116+
fi
117+
118+
if [[ $commands == *" kernels/moe"* ]]; then
119+
commands="${commands} \
120+
--ignore=kernels/moe/test_moe.py \
121+
--ignore=kernels/moe/test_cutlass_moe.py \
122+
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
109123
fi
110124

111125
#ignore certain Entrypoints/openai tests

.buildkite/test-pipeline.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,13 +319,15 @@ steps:
319319
- pytest -v -s compile/test_full_graph.py
320320

321321
- label: Kernels Core Operation Test
322+
mirror_hardwares: [amd]
322323
source_file_dependencies:
323324
- csrc/
324325
- tests/kernels/core
325326
commands:
326327
- pytest -v -s kernels/core
327328

328329
- label: Kernels Attention Test %N
330+
mirror_hardwares: [amd]
329331
source_file_dependencies:
330332
- csrc/attention/
331333
- vllm/attention
@@ -336,6 +338,7 @@ steps:
336338
parallelism: 2
337339

338340
- label: Kernels Quantization Test %N
341+
mirror_hardwares: [amd]
339342
source_file_dependencies:
340343
- csrc/quantization/
341344
- vllm/model_executor/layers/quantization
@@ -345,6 +348,7 @@ steps:
345348
parallelism: 2
346349

347350
- label: Kernels MoE Test
351+
#mirror_hardwares: [amd]
348352
source_file_dependencies:
349353
- csrc/moe/
350354
- tests/kernels/moe
@@ -353,6 +357,7 @@ steps:
353357
- pytest -v -s kernels/moe
354358

355359
- label: Kernels Mamba Test
360+
#mirror_hardwares: [amd]
356361
source_file_dependencies:
357362
- csrc/mamba/
358363
- tests/kernels/mamba

.pre-commit-config.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,29 @@ repos:
1212
- id: yapf
1313
args: [--in-place, --verbose]
1414
- repo: https://github.com/astral-sh/ruff-pre-commit
15-
rev: v0.9.3
15+
rev: v0.11.7
1616
hooks:
1717
- id: ruff
1818
args: [--output-format, github, --fix]
1919
- repo: https://github.com/codespell-project/codespell
20-
rev: v2.4.0
20+
rev: v2.4.1
2121
hooks:
2222
- id: codespell
2323
additional_dependencies: ['tomli']
2424
args: ['--toml', 'pyproject.toml']
2525
- repo: https://github.com/PyCQA/isort
26-
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
26+
rev: 6.0.1
2727
hooks:
2828
- id: isort
2929
- repo: https://github.com/pre-commit/mirrors-clang-format
30-
rev: v19.1.7
30+
rev: v20.1.3
3131
hooks:
3232
- id: clang-format
3333
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
3434
types_or: [c++, cuda]
3535
args: [--style=file, --verbose]
3636
- repo: https://github.com/jackdewinter/pymarkdown
37-
rev: v0.9.27
37+
rev: v0.9.29
3838
hooks:
3939
- id: pymarkdown
4040
args: [fix]
@@ -43,7 +43,7 @@ repos:
4343
hooks:
4444
- id: actionlint
4545
- repo: https://github.com/astral-sh/uv-pre-commit
46-
rev: 0.6.2
46+
rev: 0.6.17
4747
hooks:
4848
- id: pip-compile
4949
args: [requirements/test.in, -o, requirements/test.txt]

csrc/moe/marlin_kernels/marlin_moe_kernel.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
138138
const int HI = 0x00f000f0;
139139
const int EX = 0x64006400;
140140
// Guarantee that the `(a & b) | c` operations are LOP3s.
141-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
142-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
141+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
142+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
143143
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
144144
// directly into `SUB` and `ADD`.
145145
const int SUB = 0x64086408;
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
182182
const int HI = 0x00f000f0;
183183
const int EX = 0x64006400;
184184
// Guarantee that the `(a & b) | c` operations are LOP3s.
185-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
186-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
185+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
186+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
187187

188188
const int SUB = 0x64006400;
189189
const int MUL = 0x2c002c00;

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,8 @@ __device__ inline typename ScalarType<half>::FragB dequant<half, 4>(
209209
const int HI = 0x00f000f0;
210210
const int EX = 0x64006400;
211211
// Guarantee that the `(a & b) | c` operations are LOP3s.
212-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
213-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
212+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
213+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
214214
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
215215
// directly into `SUB` and `ADD`.
216216
const int SUB = 0x64086408;
@@ -233,9 +233,9 @@ dequant<nv_bfloat16, 4>(int q,
233233

234234
// Guarantee that the `(a & b) | c` operations are LOP3s.
235235

236-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
236+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
237237
q >>= 4;
238-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
238+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
239239

240240
static constexpr uint32_t MUL = 0x3F803F80;
241241
static constexpr uint32_t ADD = 0xC308C308;

csrc/moe/moe_wna16_utils.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,11 @@ __device__ inline void dequant<half2, 4>(int q, half2* res) {
108108
const int MUL = 0x2c002c00;
109109
const int ADD = 0xd400d400;
110110

111-
int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
112-
int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
111+
int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
112+
int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
113113
q >>= 8;
114-
int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
115-
int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
114+
int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
115+
int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
116116

117117
res[0] = __hsub2(*reinterpret_cast<half2*>(&lo0),
118118
*reinterpret_cast<const half2*>(&SUB));
@@ -149,13 +149,13 @@ __device__ inline void dequant<nv_bfloat162, 4>(int q, nv_bfloat162* res) {
149149
static constexpr uint32_t MASK = 0x000f000f;
150150
static constexpr uint32_t EX = 0x43004300;
151151

152-
int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
152+
int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
153153
q >>= 4;
154-
int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
154+
int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
155155
q >>= 4;
156-
int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
156+
int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
157157
q >>= 4;
158-
int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
158+
int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
159159

160160
static constexpr uint32_t MUL = 0x3F803F80;
161161
static constexpr uint32_t ADD = 0xC300C300;

csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
9696
std::optional<at::Tensor> const& scale_ub,
9797
std::optional<at::Tensor>& residual) {
9898
int32_t hidden_size = input.size(-1);
99-
int32_t num_tokens = input.numel() / hidden_size;
99+
auto num_tokens = input.numel() / hidden_size;
100100

101101
dim3 grid(num_tokens);
102102
dim3 block(std::min(hidden_size, 1024));

csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
347347
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
348348
hmma16816_f32<FType>(
349349
C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx],
350-
reinterpret_cast<uint32_t(&)[2]>(BF_frag[reg_buf_idx][n_idx]));
350+
reinterpret_cast<uint32_t (&)[2]>(BF_frag[reg_buf_idx][n_idx]));
351351
}
352352
}
353353
}

csrc/quantization/gptq_marlin/gptq_marlin.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
173173
const int HI = 0x00f000f0;
174174
const int EX = 0x64006400;
175175
// Guarantee that the `(a & b) | c` operations are LOP3s.
176-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
177-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
176+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
177+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
178178
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
179179
// directly into `SUB` and `ADD`.
180180
const int SUB = 0x64086408;
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
197197

198198
// Guarantee that the `(a & b) | c` operations are LOP3s.
199199

200-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
200+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
201201
q >>= 4;
202-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
202+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
203203

204204
typename ScalarType<nv_bfloat16>::FragB frag_b;
205205
static constexpr uint32_t MUL = 0x3F803F80;
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
221221
const int HI = 0x00f000f0;
222222
const int EX = 0x64006400;
223223
// Guarantee that the `(a & b) | c` operations are LOP3s.
224-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
225-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
224+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
225+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
226226

227227
const int SUB = 0x64006400;
228228
const int MUL = 0x2c002c00;
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
244244

245245
// Guarantee that the `(a & b) | c` operations are LOP3s.
246246

247-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
247+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
248248
q >>= 4;
249-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
249+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
250250

251251
typename ScalarType<nv_bfloat16>::FragB frag_b;
252252
static constexpr uint32_t MUL = 0x3F803F80;

csrc/quantization/marlin/dense/marlin_cuda_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
9696
const int HI = 0x00f000f0;
9797
const int EX = 0x64006400;
9898
// Guarantee that the `(a & b) | c` operations are LOP3s.
99-
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
100-
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
99+
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
100+
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
101101
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
102102
// directly into `SUB` and `ADD`.
103103
const int SUB = 0x64086408;

0 commit comments

Comments
 (0)