From fd6441698ffa77ff1d07eac7b65a464ef1524953 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 18:57:24 +0800
Subject: [PATCH 1/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 7d70e839b6f4..9e7c4712932f 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -24,7 +24,23 @@ def __init__(self):
         super().__init__()
         if current_platform.is_cuda():
             if is_flashinfer_available:
-                if envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
+                flashinfer_version = flashinfer.__version__
+                if flashinfer_version >= "v0.2.3":
+                    # FIXME(DefTrue): Currently, we have errors when using
+                    # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
+                    # workaround, we disable FlashInfer for top-p & top-k
+                    # sampling by default while FlashInfer>=v0.2.3.
+                    # The sampling API removes the success return value
+                    # of all sampling API, which is not compatible with
+                    # earlier design.
+                    # https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.3
+                    logger.info(
+                        "Currently, FlashInfer top-p & top-k sampling sampler is "
+                        f"disabled because {flashinfer_version} is not backward "
+                        "compatible. Falling back to the PyTorch-native "
+                        "implementation of top-p & top-k sampling.")
+                    self.forward = self.forward_native
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
                     # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
                     # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
                     # default it is unused). For backward compatibility, we set

From 7df4aac288cf48ea5ad06cc893392b43646353ee Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 18:58:19 +0800
Subject: [PATCH 2/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 9e7c4712932f..6bd9cd4600e7 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -25,7 +25,7 @@ def __init__(self):
         if current_platform.is_cuda():
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
-                if flashinfer_version >= "v0.2.3":
+                if flashinfer_version >= "0.2.3":
                     # FIXME(DefTrue): Currently, we have errors when using
                     # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
                     # workaround, we disable FlashInfer for top-p & top-k

From 5e5d4f4d8f7d08ea1abc0a51a2472d20871ba86f Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 19:04:23 +0800
Subject: [PATCH 3/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 6bd9cd4600e7..b505fa329172 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -33,12 +33,13 @@ def __init__(self):
                     # The sampling API removes the success return value
                     # of all sampling API, which is not compatible with
                     # earlier design.
-                    # https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.3
+                    # https://github.com/flashinfer-ai/flashinfer/releases/
+                    # tag/v0.2.3
                     logger.info(
-                        "Currently, FlashInfer top-p & top-k sampling sampler is "
-                        f"disabled because {flashinfer_version} is not backward "
-                        "compatible. Falling back to the PyTorch-native "
-                        "implementation of top-p & top-k sampling.")
+                        "Currently, FlashInfer top-p & top-k sampling sampler "
+                        "is disabled because FlashInfer>=v0.2.3 is not "
+                        "backward compatible. Falling back to the PyTorch-"
+                        "native implementation of top-p & top-k sampling.")
                     self.forward = self.forward_native
                 elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
                     # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for

From d5e16913fa9f1424b67e65dd57184fd264161cb7 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 14 Mar 2025 20:18:39 +0800
Subject: [PATCH 4/4] Update topk_topp_sampler.py

---
 vllm/v1/sample/ops/topk_topp_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index b505fa329172..d461a8098933 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -26,7 +26,7 @@ def __init__(self):
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
                 if flashinfer_version >= "0.2.3":
-                    # FIXME(DefTrue): Currently, we have errors when using
+                    # FIXME(DefTruth): Currently, we have errors when using
                     # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a
                     # workaround, we disable FlashInfer for top-p & top-k
                     # sampling by default while FlashInfer>=v0.2.3.
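A side note on the version gate itself, not part of the patch series above: the patches compare flashinfer.__version__ against the string "0.2.3" lexicographically, while the sketch below shows how the same gate could be written with packaging.version, which orders releases numerically (for instance, "0.10.0" sorts before "0.2.3" as a plain string but after it as a parsed version). The helper name is_flashinfer_sampling_compatible is hypothetical; only flashinfer.__version__ and the 0.2.3 threshold come from the patches, and the sketch assumes both flashinfer and packaging are importable.

# Illustrative sketch only (not part of the patches): a version gate based on
# packaging.version rather than lexicographic string comparison. The helper
# name is hypothetical; the 0.2.3 threshold and flashinfer.__version__ come
# from the patch series above.
from packaging.version import Version

import flashinfer


def is_flashinfer_sampling_compatible() -> bool:
    """Return True if the installed FlashInfer predates the v0.2.3
    sampling-API change (i.e. it still returns the success flag)."""
    # Strip a possible leading "v" before parsing; the patches switch the
    # threshold literal from "v0.2.3" to "0.2.3" for the same reason.
    raw = flashinfer.__version__.lstrip("v")
    return Version(raw) < Version("0.2.3")

Under such a check, a hypothetical future release like 0.10.0 would still compare as newer than 0.2.3 and trigger the fall back to the PyTorch-native top-p & top-k sampler.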