-
-
Notifications
You must be signed in to change notification settings - Fork 11.1k
Closed
Labels
bug: Something isn't working
Description
Your current environment
The output of python collect_env.py
Your output of `python collect_env.py` here
🐛 Describe the bug
vllm bench throughput --model Qwen/Qwen3-30B-A3B-FP8 --load-format dummy --input-len 1000 --output-len 100 --trust_remote_code --enable-expert-parallel
(EngineCore_DP0 pid=2439401) self.run()
(EngineCore_DP0 pid=2439401) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=2439401) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/engine/core.py", line 712, in run_engine_core
(EngineCore_DP0 pid=2439401) raise e
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/engine/core.py", line 695, in run_engine_core
(EngineCore_DP0 pid=2439401) engine_core = DPEngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/engine/core.py", line 965, in __init__
(EngineCore_DP0 pid=2439401) super().__init__(vllm_config, local_client, handshake_address,
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/engine/core.py", line 498, in __init__
(EngineCore_DP0 pid=2439401) super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/engine/core.py", line 92, in __init__
(EngineCore_DP0 pid=2439401) self._initialize_kv_caches(vllm_config)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/engine/core.py", line 190, in _initialize_kv_caches
(EngineCore_DP0 pid=2439401) self.model_executor.determine_available_memory())
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/executor/abstract.py", line 85, in determine_available_memory
(EngineCore_DP0 pid=2439401) return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
(EngineCore_DP0 pid=2439401) return [run_method(self.driver_worker, method, args, kwargs)]
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/utils/__init__.py", line 3049, in run_method
(EngineCore_DP0 pid=2439401) return func(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(EngineCore_DP0 pid=2439401) return func(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/worker/gpu_worker.py", line 271, in determine_available_memory
(EngineCore_DP0 pid=2439401) self.model_runner.profile_run()
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/worker/gpu_model_runner.py", line 3292, in profile_run
(EngineCore_DP0 pid=2439401) = self._dummy_run(self.max_num_tokens, is_profile=True)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(EngineCore_DP0 pid=2439401) return func(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/v1/worker/gpu_model_runner.py", line 3069, in _dummy_run
(EngineCore_DP0 pid=2439401) outputs = self.model(
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
(EngineCore_DP0 pid=2439401) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
(EngineCore_DP0 pid=2439401) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/models/deepseek_v2.py", line 907, in forward
(EngineCore_DP0 pid=2439401) hidden_states = self.model(input_ids, positions, intermediate_tensors,
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/compilation/decorators.py", line 310, in __call__
(EngineCore_DP0 pid=2439401) output = self.compiled_callable(*args, **kwargs)
(EngineCore_DP0 pid=2439401) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=2439401) File "/home/wentao/.venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 659, in _fn
(EngineCore_DP0 pid=2439401) raise e.with_traceback(None) from None
(EngineCore_DP0 pid=2439401) torch._dynamo.exc.Unsupported: non-function or method super: <built-in function _disabled_torch_function_impl>
(EngineCore_DP0 pid=2439401)
(EngineCore_DP0 pid=2439401) from user code:
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/models/deepseek_v2.py", line 783, in forward
(EngineCore_DP0 pid=2439401) hidden_states, residual = layer(positions, hidden_states, residual)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/models/deepseek_v2.py", line 711, in forward
(EngineCore_DP0 pid=2439401) hidden_states = self.mlp(hidden_states)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/models/deepseek_v2.py", line 285, in forward
(EngineCore_DP0 pid=2439401) router_logits, _ = self.gate(hidden_states)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/layers/linear.py", line 381, in forward
(EngineCore_DP0 pid=2439401) output = self.quant_method.apply(self, x, bias)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/layers/linear.py", line 236, in apply
(EngineCore_DP0 pid=2439401) return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/layers/utils.py", line 92, in default_unquantized_gemm
(EngineCore_DP0 pid=2439401) return torch.nn.functional.linear(x, weight, bias)
(EngineCore_DP0 pid=2439401) File "/home/wentao/vllm-source/vllm/model_executor/parameter.py", line 119, in __torch_function__
(EngineCore_DP0 pid=2439401) return super().__torch_function__(func, types, args, kwargs)
(EngineCore_DP0 pid=2439401)
Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.
Metadata
Assignees
Labels
bug: Something isn't working