|
14 | 14 | from vllm import _custom_ops as ops |
15 | 15 | from vllm.logger import init_logger |
16 | 16 | from vllm.model_executor.layers.fused_moe import ( |
17 | | - CutlassExpertsFp8, FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, |
18 | | - FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, |
19 | | - FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported, fused_experts) |
| 17 | + FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, |
| 18 | + FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, |
| 19 | + FusedMoeWeightScaleSupported) |
20 | 20 | from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa |
21 | 21 | WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) |
22 | 22 | from vllm.model_executor.layers.quantization.utils import replace_parameter |
@@ -570,6 +570,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: |
570 | 570 | del layer.w2_input_scale |
571 | 571 | self.fused_experts_func = None |
572 | 572 | else: |
| 573 | + from vllm.model_executor.layers.fused_moe import fused_experts |
573 | 574 | self.fused_experts_func = fused_experts |
574 | 575 |
|
575 | 576 | def apply( |
@@ -826,6 +827,7 @@ def select_gemm_impl( |
826 | 827 | prepare_finalize: FusedMoEPrepareAndFinalize, |
827 | 828 | moe: FusedMoEConfig, |
828 | 829 | ) -> FusedMoEPermuteExpertsUnpermute: |
| 830 | + from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8 |
829 | 831 |
|
830 | 832 | use_batched_format = (prepare_finalize.activation_format == |
831 | 833 | FusedMoEActivationFormat.BatchedExperts) |
|
0 commit comments