                               tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
-
-from vllm_ascend.utils import vllm_version_is
-
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
-
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
+    determine_expert_map)
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
+
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()

         ep_group = get_ep_group()
@@ -731,23 +721,16 @@ def __init__(
             params_dtype = torch.get_default_dtype()

         vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))

-            self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size

         self.top_k = top_k
         self.num_experts = num_experts
@@ -772,54 +755,39 @@ def __init__(
             self.local_num_experts, self.expert_map = determine_expert_map(
                 self.ep_size,
                 get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
+
+            self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group

         else:
             # Adjust TP size for DP attention
             # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
+
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1

         self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                    None)
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )

-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)

         assert self.quant_method is not None
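As a reading aid, here is a minimal sketch of the single, non-version-gated initialization path the diff converges on. It only uses calls that appear in the diff (`FusedMoEParallelConfig.make`, `MoEConfig`, and the Ascend `get_ep_group`/`get_etp_group` helpers); the `build_ascend_moe_config` wrapper and its signature are hypothetical, and it assumes it runs inside an initialized vLLM engine context where the parallel groups already exist.

```python
# Hypothetical helper, not part of the diff: it restates the new single
# code path in one place. Assumes an initialized vLLM engine context
# (current vllm config set, TP/DP/EP/ETP process groups built).
from typing import Optional

import torch
from vllm.config import get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import get_dp_group
from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoEParallelConfig, MoEConfig)

from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group


def build_ascend_moe_config(num_experts: int,
                            top_k: int,
                            hidden_size: int,
                            local_num_experts: int,
                            tp_size: Optional[int] = None,
                            dp_size: Optional[int] = None) -> MoEConfig:
    vllm_config = get_current_vllm_config()

    # Single code path: let vLLM derive the TP/DP layout ...
    moe_parallel_config = FusedMoEParallelConfig.make(
        tp_size_=(tp_size if tp_size is not None else
                  get_tensor_model_parallel_world_size()),
        dp_size_=(dp_size if dp_size is not None else
                  get_dp_group().world_size),
        vllm_parallel_config=vllm_config.parallel_config)

    # ... then patch in the Ascend-specific expert / expert-tensor groups,
    # mirroring the ep_size > 1 branch of AscendFusedMoE.__init__ above.
    moe_parallel_config.ep_size = get_ep_group().world_size
    moe_parallel_config.tp_rank = get_etp_group().rank_in_group
    moe_parallel_config.ep_rank = get_ep_group().rank_in_group

    return MoEConfig(
        num_experts=num_experts,
        experts_per_token=top_k,
        hidden_dim=hidden_size,
        num_local_experts=local_num_experts,
        moe_parallel_config=moe_parallel_config,
        in_dtype=torch.get_default_dtype(),
    )
```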