From 91c321364fbc9dbb49d862c45bfb250f67417a34 Mon Sep 17 00:00:00 2001
From: Saumya Gandhi
Date: Fri, 16 May 2025 13:03:02 -0700
Subject: [PATCH 1/3] potential fix

---
 src/art/local/vllm.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/art/local/vllm.py b/src/art/local/vllm.py
index 3c613baf..ee96b188 100644
--- a/src/art/local/vllm.py
+++ b/src/art/local/vllm.py
@@ -264,12 +264,19 @@ def patch_get_lora_tokenizer_async() -> None:
     Specifically, Unsloth patches get_lora_tokenizer_async with a non-async
     function, which causes issues.
     """
-    import vllm.transformers_utils.tokenizer_group as tg
+    import vllm.transformers_utils.tokenizer
+
+    async def _return_nothing(*_, **__) -> None:
+        return None
+
+    vllm.transformers_utils.tokenizer.get_lora_tokenizer_async = _return_nothing  # type: ignore
 
     async def get_self_lora_tokenizer_async(self, *args, **kwargs):
         return self.tokenizer
 
-    tg.TokenizerGroup.get_lora_tokenizer_async = get_self_lora_tokenizer_async  # type: ignore
+    import vllm.transformers_utils.tokenizer_group
+
+    vllm.transformers_utils.tokenizer_group.TokenizerGroup.get_lora_tokenizer_async = get_self_lora_tokenizer_async  # type: ignore
 
 
 def patch_listen_for_disconnect() -> None:

From 0a53765adcf2a6159395f914629da0049c815f46 Mon Sep 17 00:00:00 2001
From: Saumya Gandhi
Date: Tue, 20 May 2025 08:56:06 -0700
Subject: [PATCH 2/3] return nothing everything

---
 src/art/local/vllm.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/art/local/vllm.py b/src/art/local/vllm.py
index ee96b188..a070741b 100644
--- a/src/art/local/vllm.py
+++ b/src/art/local/vllm.py
@@ -265,17 +265,18 @@ def patch_get_lora_tokenizer_async() -> None:
     Specifically, Unsloth patches get_lora_tokenizer_async with a non-async
     function, which causes issues.
""" import vllm.transformers_utils.tokenizer + import vllm.transformers_utils.tokenizer_group async def _return_nothing(*_, **__) -> None: return None - - vllm.transformers_utils.tokenizer.get_lora_tokenizer_async = _return_nothing # type: ignore - + async def get_self_lora_tokenizer_async(self, *args, **kwargs): return self.tokenizer - import vllm.transformers_utils.tokenizer_group - + vllm.transformers_utils.tokenizer.get_lora_tokenizer_async = _return_nothing # type: ignore + vllm.transformers_utils.tokenizer_group.get_lora_tokenizer_async = ( + _return_nothing # type: ignore + ) vllm.transformers_utils.tokenizer_group.TokenizerGroup.get_lora_tokenizer_async = get_self_lora_tokenizer_async # type: ignore From c01b1339cf5f7cc2d5f708bda78665fc99a4ef56 Mon Sep 17 00:00:00 2001 From: Saumya Gandhi Date: Fri, 23 May 2025 11:51:50 -0700 Subject: [PATCH 3/3] default art to vllm default generation config, instead of model --- src/art/dev/model.py | 1 + src/art/dev/openai_server.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/art/dev/model.py b/src/art/dev/model.py index d29307cf..984d78df 100644 --- a/src/art/dev/model.py +++ b/src/art/dev/model.py @@ -43,6 +43,7 @@ def get_model_config( # which is the fallback for devices with compute capability < 8.0 num_scheduler_steps=16 if torch.cuda.get_device_capability()[0] >= 8 else 1, enable_sleep_mode=enable_sleep_mode, + generation_config="vllm", ) engine_args.update(config.get("engine_args", {})) init_args.update(config.get("init_args", {})) diff --git a/src/art/dev/openai_server.py b/src/art/dev/openai_server.py index b03a0bd9..018e64c5 100644 --- a/src/art/dev/openai_server.py +++ b/src/art/dev/openai_server.py @@ -27,6 +27,7 @@ def get_openai_server_config( num_scheduler_steps=16, served_model_name=base_model, disable_log_requests=True, + generation_config="vllm", ) engine_args.update(config.get("engine_args", {})) return OpenAIServerConfig(