@@ -46,23 +46,15 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None:
4646 "Falling back to default sampling implementation."
4747 )
4848 self .forward = self .forward_native
49- elif envs .VLLM_USE_FLASHINFER_SAMPLER is not False :
50- # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
51- # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
52- # default it is unused). For backward compatibility, we set
53- # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
54- # interpret it differently in V0 and V1 samplers: In V0,
55- # None means False, while in V1, None means True. This is
56- # why we use the condition
57- # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
49+ elif envs .VLLM_USE_FLASHINFER_SAMPLER :
50+ # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
5851 logger .info_once ("Using FlashInfer for top-p & top-k sampling." )
5952 self .forward = self .forward_cuda
6053 else :
61- logger .warning_once (
62- "FlashInfer is available, but it is not enabled. "
63- "Falling back to the PyTorch-native implementation of "
64- "top-p & top-k sampling. For the best performance, "
65- "please set VLLM_USE_FLASHINFER_SAMPLER=1."
54+ logger .debug_once (
55+ "FlashInfer top-p/top-k sampling is available but disabled "
56+ "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
57+ "after verifying accuracy for your workloads."
6658 )
6759 self .forward = self .forward_native
6860 else :
0 commit comments