 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
-                         get_current_vllm_config)
+from vllm.config import (CacheConfig, ModelConfig, VllmConfig)
 from vllm.distributed import (get_dp_group, get_pp_group,
                               get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -335,10 +334,6 @@ def __init__(
 
         self.tp_group = get_tp_group().device_group
         self.tp_rank = get_tp_group().rank_in_group
-        self.kv_consumer = None
-        transfer_config = get_current_vllm_config().kv_transfer_config
-        if transfer_config is not None:
-            self.kv_consumer = transfer_config.kv_role == "kv_consumer"
 
     def forward(
         self,
@@ -353,10 +348,6 @@ def forward(
         enable_force_load_balance = forward_context.in_profile_run
 
         is_prefill = forward_context.with_prefill
-        # If this node is a kv_consumer, force the MoE to always run in the decode
-        # path to keep behaviour aligned between dummy_run and normal model execution.
-        if self.kv_consumer:
-            is_prefill = False
 
         # router_logits: (num_tokens, n_experts)
         if self.enable_multistream_moe:
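
For context, the deleted lines implemented a role check against the KV-transfer configuration. A minimal sketch of that pattern, assuming vLLM's `get_current_vllm_config()` and a `kv_transfer_config` object with a `kv_role` field (both visible in the removed hunks above); the helper name `is_kv_consumer` is hypothetical:

```python
from vllm.config import get_current_vllm_config

def is_kv_consumer() -> bool:
    # Hypothetical helper mirroring the removed __init__ logic.
    # kv_transfer_config is None unless disaggregated prefill/decode
    # (KV transfer) is configured for this node.
    transfer_config = get_current_vllm_config().kv_transfer_config
    return (transfer_config is not None
            and transfer_config.kv_role == "kv_consumer")
```

When this check was true, the forward path overrode `is_prefill` to `False` so kv_consumer (decode-side) nodes always took the decode branch of the MoE, keeping `dummy_run` and the normal execution path consistent; this commit removes that override along with the now-unused `get_current_vllm_config` import.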