3 files changed (+26 −1)

vllm/envs.py

@@ -133,6 +133,7 @@
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
     VLLM_KV_CACHE_LAYOUT: Optional[str] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
+    VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
 
 
 def get_default_cache_root():
@@ -918,6 +919,12 @@ def get_vllm_port() -> Optional[int]:
     # or bad hardware but it may add compute overhead.
     "VLLM_COMPUTE_NANS_IN_LOGITS":
     lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
+
+    # Controls whether emulations are used for NVFP4
+    # generation with compressed-tensors models on machines
+    # with compute capability < 100.
+    "VLLM_USE_NVFP4_CT_EMULATIONS":
+    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
 }
 
 # --8<-- [end:env-vars-definition]
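
To try the flag end to end, something like the following should work (a
minimal sketch; the model id is a placeholder for any NVFP4
compressed-tensors checkpoint):

import os

# Must be set before vLLM reads the variable, e.g. at process start.
os.environ["VLLM_USE_NVFP4_CT_EMULATIONS"] = "1"

from vllm import LLM

# Placeholder model id; substitute a real NVFP4 compressed-tensors checkpoint.
llm = LLM(model="org/some-nvfp4-model")
print(llm.generate("Hello")[0].outputs[0].text)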

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

@@ -13,6 +13,7 @@
     QuantizationType)
 from pydantic import BaseModel
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -374,7 +375,8 @@ def _get_scheme_from_parts(
 
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
-                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
+                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
+                ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
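
The added "or" keeps the NVFP4 scheme eligible on GPUs where the cutlass FP4
kernels are unavailable. To probe the native path directly, a check along
these lines should work (a sketch; cutlass_fp4_supported() itself is not
shown in this diff, but it gates on the same kind of capability query):

import torch
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4

# vLLM encodes compute capability as major * 10 + minor, e.g. (9, 0) -> 90.
major, minor = torch.cuda.get_device_capability()
print(cutlass_scaled_mm_supports_fp4(major * 10 + minor))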

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py

@@ -4,11 +4,14 @@
 import torch
 from torch.nn.parameter import Parameter
 
+import vllm.envs as envs
 from vllm._custom_ops import (cutlass_scaled_fp4_mm,
                               cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+    run_nvfp4_emulations)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
@@ -26,6 +29,8 @@ def __init__(self):
 
     @classmethod
     def get_min_capability(cls) -> int:
+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+            return 80
         return 100
 
     @classmethod
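
For orientation: get_min_capability() returns the compute capability encoded
as major * 10 + minor, so 80 is SM 8.0 (Ampere) and 100 is SM 10.0
(Blackwell). A worked illustration of the gate (hypothetical helper, not part
of the diff):

def passes_capability_gate(device_capability: int, use_emulations: bool) -> bool:
    # Mirrors get_min_capability() above: SM 8.0+ suffices when emulating,
    # otherwise the native FP4 path needs SM 10.0+.
    return device_capability >= (80 if use_emulations else 100)

assert passes_capability_gate(90, use_emulations=True)       # e.g. H100 (SM 9.0)
assert not passes_capability_gate(90, use_emulations=False)  # H100 lacks native FP4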
@@ -129,6 +134,17 @@ def apply_weights(self,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+            out = run_nvfp4_emulations(
+                x=x,
+                input_global_scale=layer.input_global_scale,
+                weight=layer.weight,
+                weight_scale_swizzled=layer.weight_scale_swizzled,
+                weight_global_scale=layer.weight_global_scale)
+            if bias is not None:
+                out = out + bias
+            return out
+
         output_dtype = x.dtype
         output_shape = [x.shape[0], layer.weight.shape[0]]
 
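
For intuition, a rough sketch of what an emulated NVFP4 linear has to do
(this is not the implementation in nvfp4_emulation_utils: FP4 packing, the
swizzled scale layout, and activation fake-quantization via
input_global_scale are all elided, and the scale algebra is an assumption):

import torch

# The 16 representable NVFP4 (E2M1) values, indexed by 4-bit code.
FP4_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                           -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0])

def emulated_nvfp4_linear(x: torch.Tensor,
                          weight_codes: torch.Tensor,   # (N, K) 4-bit codes, unpacked
                          block_scales: torch.Tensor,   # (N, K // 16) per-block scales
                          global_scale: torch.Tensor) -> torch.Tensor:
    # 1. Decode the 4-bit codes back to real FP4 values.
    w = FP4_VALUES.to(x.device)[weight_codes.long()]
    # 2. Re-apply the per-16-element block scales and the global scale to
    #    recover a high-precision copy of the weight.
    w = w.reshape(w.shape[0], -1, 16) * block_scales.unsqueeze(-1).float()
    w = (w.reshape(w.shape[0], -1) / global_scale).to(x.dtype)
    # 3. Run the matmul in x.dtype rather than on FP4 tensor cores.
    return x @ w.t()

Because the matmul runs in the activation dtype, this path is meant for
correctness and bring-up on pre-SM-100 GPUs, not for FP4-level performance.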