35 changes: 1 addition & 34 deletions tests/weight_loading/models.txt
@@ -1,34 +1 @@
-gptq_marlin, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main
-gptq_marlin, TheBloke/Llama-2-7B-GPTQ, main
-gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main
-gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True
-gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True
-gptq_marlin, TechxGenus/gemma-1.1-2b-it-GPTQ, main
-gptq, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main
-gptq, TheBloke/Llama-2-7B-GPTQ, main
-gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main
-gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True
-gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True
-gptq, TechxGenus/gemma-1.1-2b-it-GPTQ, main
-compressed-tensors, nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change, main
-compressed-tensors, nm-testing/tinyllama-oneshot-w8-channel-a8-tensor, main
-compressed-tensors, nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2, main
-compressed-tensors, nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2, main
-compressed-tensors, nm-testing/tinyllama-oneshot-w4a16-group128-v2, main
-compressed-tensors, nm-testing/tinyllama-oneshot-w8a16-per-channel, main
-compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
-compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
-compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
-compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
-#compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
-compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
-compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
-awq, casperhansen/mixtral-instruct-awq, main
-awq_marlin, casperhansen/mixtral-instruct-awq, main
-fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
-marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
-marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
-qqq, HandH1998/QQQ-Llama-3-8b-g128, main
-qqq, HandH1998/QQQ-Llama-3-8b, main
-hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
-None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
+gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True
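For reference, each models.txt entry is "<quantization>, <model repo>, <revision>", with an optional trailing field on some rows (e.g. the 90 on the 2:4-sparse entries) and "#" marking a disabled case. Below is a minimal, illustrative parser for this layout; it is a sketch of the format only, not the project's actual test harness, and the names (WeightLoadingCase, parse_models_txt) are invented.

# Illustrative sketch of the models.txt layout; not vLLM's actual test harness.
from dataclasses import dataclass
from typing import Optional


@dataclass
class WeightLoadingCase:
    quantization: str
    model: str
    revision: str
    extra: Optional[str] = None  # opaque trailing field (e.g. "90"); its meaning belongs to the test scripts


def parse_models_txt(path: str) -> list[WeightLoadingCase]:
    cases: list[WeightLoadingCase] = []
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue  # blank line or commented-out entry
            fields = [part.strip() for part in line.split(",")]
            quantization, model, revision = fields[:3]
            extra = fields[3] if len(fields) > 3 else None
            cases.append(WeightLoadingCase(quantization, model, revision, extra))
    return cases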
1 change: 1 addition & 0 deletions tests/weight_loading/test_weight_loading.py
@@ -35,6 +35,7 @@ def test_weight_loading(vllm_runner):
 dtype=torch.half if NEEDS_FP16 else "auto",
 quantization=None if QUANTIZATION == "None" else QUANTIZATION,
 max_model_len=MAX_MODEL_LEN,
+enforce_eager=True,
 tensor_parallel_size=2) as model:

 output = model.generate_greedy("Hello world!", max_tokens=20)
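The only functional change in this test is enforce_eager=True, which disables CUDA graph capture so every kernel is launched eagerly; that keeps the torch.cuda.synchronize() probes added elsewhere in this PR pointing at the launch that actually faults. A hedged sketch of an equivalent standalone reproduction follows; the real test goes through the vllm_runner fixture, and the model/revision below are just one entry picked from models.txt.

# Standalone sketch, not the test itself: exercises the same knobs the
# fixture forwards (quantization, enforce_eager, tensor parallelism).
from vllm import LLM, SamplingParams

llm = LLM(
    model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    revision="gptq-8bit-32g-actorder_True",
    quantization="gptq_marlin",
    enforce_eager=True,        # no CUDA graphs; kernels run eagerly
    tensor_parallel_size=2,
)

# Greedy decoding, mirroring generate_greedy("Hello world!", max_tokens=20).
params = SamplingParams(temperature=0.0, max_tokens=20)
print(llm.generate(["Hello world!"], params)[0].outputs[0].text)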
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/linear.py
@@ -1280,9 +1280,11 @@ def forward(
 # Only fuse bias add into GEMM for rank 0 (this ensures that
 # bias will not get added more than once in TP>1 case)
 bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+torch.cuda.synchronize()
 output_parallel = self.quant_method.apply(self,
 input_parallel,
 bias=bias_)
+torch.cuda.synchronize()
 if self.reduce_results and self.tp_size > 1:
 output = tensor_model_parallel_all_reduce(output_parallel)
 else:
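These two torch.cuda.synchronize() calls bracket the quantized GEMM as a debugging aid: CUDA launches are asynchronous, so a fault inside the kernel would otherwise surface at some unrelated later call. Syncing immediately before and after pins the error to this apply(); the same bracketing is applied around ops.gptq_marlin_gemm in the next hunks. A minimal sketch of the pattern, with an invented helper name:

import torch


def run_synchronized(kernel, *args, **kwargs):
    """Illustrative helper: force device syncs around a suspect kernel so an
    asynchronous CUDA error is raised at this call site rather than later."""
    torch.cuda.synchronize()  # flush pending work; pre-existing errors surface here
    out = kernel(*args, **kwargs)
    torch.cuda.synchronize()  # a fault inside `kernel` raises here
    return out

Setting CUDA_LAUNCH_BLOCKING=1 gives a similar effect globally, at a larger performance cost.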
@@ -405,7 +405,7 @@ def apply_gptq_marlin_linear(
 k=reshaped_x.size(1),
 device=input.device,
 dtype=input.dtype)
-
+torch.cuda.synchronize()
 output = ops.gptq_marlin_gemm(reshaped_x,
 None,
 weight,
@@ -423,7 +423,7 @@ def apply_gptq_marlin_linear(
 use_atomic_add=use_atomic_add,
 use_fp32_reduce=use_fp32_reduce,
 is_zp_float=False)
-
+torch.cuda.synchronize()
 if bias is not None:
 output.add_(bias)  # In-place add

20 changes: 10 additions & 10 deletions vllm/v1/worker/gpu_model_runner.py
@@ -200,16 +200,6 @@ def __init__(

 # Request states.
 self.requests: dict[str, CachedRequestState] = {}
-# Persistent batch.
-self.input_batch = InputBatch(
-max_num_reqs=self.max_num_reqs,
-max_model_len=self.max_model_len,
-max_num_blocks_per_req=self.max_num_blocks_per_req,
-max_num_batched_tokens=self.max_num_tokens,
-device=self.device,
-pin_memory=self.pin_memory,
-vocab_size=model_config.get_vocab_size(),
-)

 self.use_cuda_graph = (self.vllm_config.compilation_config.level
 == CompilationLevel.PIECEWISE
@@ -1834,6 +1824,16 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
"Hybrid models with more than one KV cache type are not "
"supported yet.")
self.kv_cache_config = kv_cache_config
# Persistent batch.
self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs,
max_model_len=self.max_model_len,
max_num_blocks_per_req=self.max_num_blocks_per_req,
max_num_batched_tokens=self.max_num_tokens,
device=self.device,
pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(),
)

kv_caches: dict[str, torch.Tensor] = {}

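This pair of hunks defers construction of the persistent InputBatch from __init__ to initialize_kv_cache, so the batch is only built once the KV cache configuration is known (and the constructor now reads self.model_config rather than a local model_config). A toy sketch of the deferred-construction pattern; the names here are invented and do not mirror vLLM's actual classes:

from typing import Optional


class ToyBatch:
    def __init__(self, max_num_reqs: int, vocab_size: int) -> None:
        self.max_num_reqs = max_num_reqs
        self.vocab_size = vocab_size


class ToyRunner:
    def __init__(self, max_num_reqs: int, vocab_size: int) -> None:
        self.max_num_reqs = max_num_reqs
        self.vocab_size = vocab_size
        # Deliberately not constructed here anymore.
        self.input_batch: Optional[ToyBatch] = None

    def initialize_kv_cache(self, kv_cache_config: dict) -> None:
        # Record the KV cache layout first, then build the persistent batch,
        # mirroring the ordering in the hunk above.
        self.kv_cache_config = kv_cache_config
        self.input_batch = ToyBatch(self.max_num_reqs, self.vocab_size)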