From 1258b2f3cd4e77631c32c66546b15bafb051cdcd Mon Sep 17 00:00:00 2001 From: zhaoyinglia Date: Tue, 29 Apr 2025 15:56:43 +0800 Subject: [PATCH 1/5] [vLLM] Update to v0.8.5 --- third_party/vllm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/vllm b/third_party/vllm index dc1b4a6f1..ba41cc90e 160000 --- a/third_party/vllm +++ b/third_party/vllm @@ -1 +1 @@ -Subproject commit dc1b4a6f1300003ae27f033afbdff5e2683721ce +Subproject commit ba41cc90e8ef7f236347b2f1599eec2cbb9e1f0d From 308ada00a3b9be76ab813a20ef4f2317bcdaf1ee Mon Sep 17 00:00:00 2001 From: zhaoyinglia Date: Tue, 29 Apr 2025 16:03:43 +0800 Subject: [PATCH 2/5] update cfg --- .../backends/vllm/vllm/core/scheduler.py | 2 +- .../backends/vllm/vllm/engine/llm_engine.py | 49 ++---- flagscale/backends/vllm/vllm/inputs/data.py | 161 +----------------- .../backends/vllm/vllm/inputs/preprocess.py | 62 ++----- .../vllm/model_executor/models/registry.py | 15 +- .../backends/vllm/vllm/sampling_params.py | 24 ++- flagscale/backends/vllm/vllm/sequence.py | 47 ++--- .../backends/vllm/vllm/worker/model_runner.py | 59 +++---- 8 files changed, 107 insertions(+), 312 deletions(-) diff --git a/flagscale/backends/vllm/vllm/core/scheduler.py b/flagscale/backends/vllm/vllm/core/scheduler.py index 9a9b0e8d2..1f7eb23ec 100644 --- a/flagscale/backends/vllm/vllm/core/scheduler.py +++ b/flagscale/backends/vllm/vllm/core/scheduler.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import contextlib # --- FLAGSCALE MODIFICATION --- import enum import os import random import time -import contextlib # --- FLAGSCALE MODIFICATION --- from collections import deque from dataclasses import dataclass, field from typing import Callable, Deque, Dict, Iterable, List, Optional diff --git a/flagscale/backends/vllm/vllm/engine/llm_engine.py b/flagscale/backends/vllm/vllm/engine/llm_engine.py index d6998f694..77afd8492 100644 --- a/flagscale/backends/vllm/vllm/engine/llm_engine.py +++ b/flagscale/backends/vllm/vllm/engine/llm_engine.py @@ -29,8 +29,7 @@ from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, - PromptType, SingletonInputs) +from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -55,7 +54,7 @@ from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) + TokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import (Counter, Device, deprecate_kwargs, @@ -66,7 +65,6 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 -_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) _O = TypeVar("_O", RequestOutput, PoolingRequestOutput) _R = TypeVar("_R", default=Any) @@ -205,7 +203,7 @@ def validate_outputs( return outputs_ - tokenizer: Optional[BaseTokenizerGroup] + tokenizer: Optional[TokenizerGroup] def __init__( self, @@ -214,7 +212,6 @@ def __init__( log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - 
input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: @@ -275,11 +272,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.tokenizer, mm_registry) - self.input_registry = input_registry - self.input_processor = input_registry.create_input_processor( - self.model_config) - - self.model_executor = executor_class(vllm_config=vllm_config, ) + self.model_executor = executor_class(vllm_config=vllm_config) if self.model_config.runner_type != "pooling": self._initialize_kv_caches() @@ -321,11 +314,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.parallel_config.disable_custom_all_reduce, }) - if self.tokenizer: - # Ping the tokenizer to ensure liveness if it runs in a - # different process. - self.tokenizer.ping() - self.cached_scheduler_outputs = [ SchedulerOutputState() for _ in range(self.parallel_config.pipeline_parallel_size) @@ -537,21 +525,12 @@ def __del__(self): if model_executor := getattr(self, "model_executor", None): model_executor.shutdown() - def get_tokenizer_group( - self, - group_type: Type[_G] = BaseTokenizerGroup, - ) -> _G: - tokenizer_group = self.tokenizer - - if tokenizer_group is None: + def get_tokenizer_group(self) -> TokenizerGroup: + if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " "skip_tokenizer_init is True") - if not isinstance(tokenizer_group, group_type): - raise TypeError("Invalid type of tokenizer group. " - f"Expected type: {group_type}, but " - f"found type: {type(tokenizer_group)}") - return tokenizer_group + return self.tokenizer def get_tokenizer( self, @@ -559,11 +538,10 @@ def get_tokenizer( ) -> AnyTokenizer: return self.get_tokenizer_group().get_lora_tokenizer(lora_request) - def _init_tokenizer(self) -> BaseTokenizerGroup: + def _init_tokenizer(self) -> TokenizerGroup: return init_tokenizer_from_configs( model_config=self.model_config, scheduler_config=self.scheduler_config, - parallel_config=self.parallel_config, lora_config=self.lora_config) def _verify_args(self) -> None: @@ -799,12 +777,11 @@ def add_request( prompt, tokenizer=self.get_tokenizer(lora_request=lora_request)) - preprocessed_inputs = self.input_preprocessor.preprocess( + processed_inputs = self.input_preprocessor.preprocess( prompt, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( request_id=request_id, @@ -937,6 +914,10 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: scheduler.abort_seq_group( request_id, seq_id_to_seq_group=self.seq_id_to_seq_group) + def get_vllm_config(self) -> VllmConfig: + """Gets the vllm configuration.""" + return self.vllm_config + def get_model_config(self) -> ModelConfig: """Gets the model configuration.""" return self.model_config @@ -1989,8 +1970,6 @@ def is_sleeping(self) -> bool: return self.model_executor.is_sleeping def check_health(self) -> None: - if self.tokenizer: - self.tokenizer.check_health() self.model_executor.check_health() def is_tracing_enabled(self) -> bool: @@ -2099,7 +2078,7 @@ def _validate_model_input( raise ValueError(f"The {prompt_type} prompt cannot be empty") max_prompt_len = self.model_config.max_model_len - if len(prompt_ids) >= max_prompt_len: + if len(prompt_ids) > max_prompt_len: if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = 
mm_registry.create_processor( diff --git a/flagscale/backends/vllm/vllm/inputs/data.py b/flagscale/backends/vllm/vllm/inputs/data.py index a4760c00d..91c840386 100644 --- a/flagscale/backends/vllm/vllm/inputs/data.py +++ b/flagscale/backends/vllm/vllm/inputs/data.py @@ -1,17 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 - from collections.abc import Iterable -from dataclasses import dataclass -from functools import cached_property from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast -import torch -from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never +from typing_extensions import NotRequired, TypedDict, TypeVar if TYPE_CHECKING: - from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, - MultiModalPlaceholderDict) - from vllm.multimodal.inputs import MultiModalInputs + from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs class TextPrompt(TypedDict): @@ -157,36 +151,6 @@ class TokenInputs(TypedDict): The original prompt text corresponding to the token IDs, if available. """ - multi_modal_data: NotRequired["MultiModalDataDict"] - """ - Optional multi-modal data to pass to the model, - if the model supports it. - """ - - multi_modal_inputs: NotRequired["MultiModalKwargs"] - """ - Optional multi-modal inputs to pass to the model, - if the model supports it. - """ - - multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"] - """ - Placeholder ranges for the multi-modal data. - """ - - multi_modal_hashes: NotRequired[list[str]] - """ - The hashes of the multi-modal data. - """ - - mm_processor_kwargs: NotRequired[dict[str, Any]] - """ - Optional multi-modal processor kwargs to be forwarded to the - multimodal input mapper & processor. Note that if multiple modalities - have registered mappers etc for the model being considered, we attempt - to pass the mm_processor_kwargs to each of them. - """ - # --- FLAGSCALE MODIFICATION BEG --- negative_prompt_token_ids: NotRequired[Optional[list[int]]] negative_prompt: NotRequired[Optional[str]] @@ -197,11 +161,6 @@ def token_inputs( prompt_token_ids: list[int], token_type_ids: Optional[list[int]] = None, prompt: Optional[str] = None, - multi_modal_data: Optional["MultiModalDataDict"] = None, - multi_modal_inputs: Optional["MultiModalKwargs"] = None, - multi_modal_hashes: Optional[list[str]] = None, - multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, # --- FLAGSCALE MODIFICATION BEG --- negative_prompt_token_ids: Optional[list[int]] = None, negative_prompt: Optional[str] = None, @@ -219,16 +178,6 @@ def token_inputs( # --- FLAGSCALE MODIFICATION END --- if token_type_ids is not None: inputs["token_type_ids"] = token_type_ids - if multi_modal_data is not None: - inputs["multi_modal_data"] = multi_modal_data - if multi_modal_inputs is not None: - inputs["multi_modal_inputs"] = multi_modal_inputs - if multi_modal_hashes is not None: - inputs["multi_modal_hashes"] = multi_modal_hashes - if multi_modal_placeholders is not None: - inputs["multi_modal_placeholders"] = multi_modal_placeholders - if mm_processor_kwargs is not None: - inputs["mm_processor_kwargs"] = mm_processor_kwargs return inputs @@ -261,112 +210,6 @@ class EncoderDecoderInputs(TypedDict): :class:`vllm.sequence.Sequence`. """ - -@dataclass -class SingletonInputsAdapter: - """ - Unified interface to access the components of :class:`SingletonInputs`. 
- """ - inputs: SingletonInputs - - @cached_property - def prompt(self) -> Optional[str]: - inputs = self.inputs - - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return inputs.get("prompt") - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def prompt_token_ids(self) -> list[int]: - inputs = self.inputs - - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return inputs.get("prompt_token_ids", []) - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def token_type_ids(self) -> list[int]: - inputs = self.inputs - - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return inputs.get("token_type_ids", []) - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def prompt_embeds(self) -> Optional[torch.Tensor]: - inputs = self.inputs - - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return None - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_data(self) -> "MultiModalDataDict": - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_data", {}) - - if inputs["type"] == "multimodal": - return inputs.get("mm_kwargs", {}) - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_inputs(self) -> Union[dict, "MultiModalKwargs"]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_inputs", {}) - - if inputs["type"] == "multimodal": - return inputs.get("mm_kwargs", {}) - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_hashes(self) -> list[str]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_hashes", []) - - if inputs["type"] == "multimodal": - # only the case when we use MultiModalInputs - return inputs.get("mm_hashes", []) # type: ignore[return-value] - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_placeholders", {}) - - if inputs["type"] == "multimodal": - return inputs.get("mm_placeholders", {}) - - assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def mm_processor_kwargs(self) -> dict[str, Any]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("mm_processor_kwargs", {}) - - if inputs["type"] == "multimodal": - return {} - - assert_never(inputs) # type: ignore[arg-type] - - ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ The inputs to :data:`vllm.inputs.InputProcessor`. 
diff --git a/flagscale/backends/vllm/vllm/inputs/preprocess.py b/flagscale/backends/vllm/vllm/inputs/preprocess.py index c3219fa17..cf49de500 100644 --- a/flagscale/backends/vllm/vllm/inputs/preprocess.py +++ b/flagscale/backends/vllm/vllm/inputs/preprocess.py @@ -13,7 +13,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.transformers_utils.tokenizer_group import TokenizerGroup from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, token_inputs) @@ -27,7 +27,7 @@ class InputPreprocessor: def __init__( self, model_config: ModelConfig, - tokenizer: Optional[BaseTokenizerGroup], + tokenizer: Optional[TokenizerGroup], mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() @@ -36,7 +36,7 @@ def __init__( self.tokenizer = tokenizer self.mm_registry = mm_registry - def get_tokenizer_group(self) -> BaseTokenizerGroup: + def get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: raise ValueError("You cannot pass text prompts when " "`skip_tokenizer_init` is True") @@ -223,28 +223,6 @@ async def _tokenize_prompt_async( lora_request=lora_request, add_special_tokens=add_special_tokens) - def _can_process_multimodal(self) -> bool: - model_config = self.model_config - - if not model_config.is_multimodal_model: - raise ValueError("Your model does not support multi-modal inputs") - - # Interim measure so we can handle models that have yet to be - # updated to use the new multi-modal processor - can_process_multimodal = self.mm_registry.has_processor(model_config) - if not can_process_multimodal: - from vllm.model_executor.models.registry import _VLLM_MODELS - if not any(arch in _VLLM_MODELS - for arch in model_config.architectures): - logger.warning_once( - "Your model uses the legacy input pipeline, which will be " - "removed in an upcoming release. " - "Please upgrade to the new multi-modal processing pipeline " - "(https://docs.vllm.ai/en/latest/design/mm_processing.html)" - ) - - return can_process_multimodal - def _process_multimodal( self, prompt: Union[str, list[int]], @@ -258,8 +236,7 @@ def _process_multimodal( returning the corresponding token IDs and metadata. """ # At the moment on model (PrithviGeoSpatialMAE) requires to be - # initialized without a tokenizer while using also multi-modal - # input. + # initialized without a tokenizer while using also multi-modal input if not self.tokenizer: tokenizer = object() # Dummy else: @@ -285,8 +262,7 @@ async def _process_multimodal_async( ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" # At the moment on model (PrithviGeoSpatialMAE) requires to be - # initialized without a tokenizer while using also multi-modal - # input. 
+ # initialized without a tokenizer while using also multi-modal input if not self.tokenizer: tokenizer = object() # Dummy else: @@ -344,7 +320,7 @@ def _prompt_to_llm_inputs( mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") negative_prompt_token_ids = tokens_content.get("negative_prompt_token_ids") # --- FLAGSCALE MODIFICATION --- - if multi_modal_data is not None and self._can_process_multimodal(): + if multi_modal_data is not None: return self._process_multimodal( prompt_token_ids, multi_modal_data, @@ -356,8 +332,6 @@ def _prompt_to_llm_inputs( return token_inputs( prompt_token_ids=prompt_token_ids, token_type_ids=token_type_ids, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs, negative_prompt_token_ids=negative_prompt_token_ids, # --- FLAGSCALE MODIFICATION --- ) @@ -369,7 +343,7 @@ def _prompt_to_llm_inputs( mm_processor_kwargs = text_content.get("mm_processor_kwargs") negative_prompt_text = text_content.get("negative_prompt") # --- FLAGSCALE MODIFICATION --- - if multi_modal_data is not None and self._can_process_multimodal(): + if multi_modal_data is not None: return self._process_multimodal( prompt_text, multi_modal_data, @@ -395,8 +369,6 @@ def _prompt_to_llm_inputs( return token_inputs( prompt=prompt_text, prompt_token_ids=prompt_token_ids, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs, # --- FLAGSCALE MODIFICATION BEG --- negative_prompt=negative_prompt_text, negative_prompt_token_ids=negative_prompt_token_ids, @@ -434,7 +406,7 @@ async def _prompt_to_llm_inputs_async( mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") negative_prompt_token_ids = tokens_content.get("negative_prompt_token_ids") # --- FLAGSCALE MODIFICATION --- - if multi_modal_data is not None and self._can_process_multimodal(): + if multi_modal_data is not None: return await self._process_multimodal_async( prompt_token_ids, multi_modal_data, @@ -445,8 +417,6 @@ async def _prompt_to_llm_inputs_async( return token_inputs( prompt_token_ids=prompt_token_ids, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs, negative_prompt_token_ids=negative_prompt_token_ids, # --- FLAGSCALE MODIFICATION --- ) @@ -458,7 +428,7 @@ async def _prompt_to_llm_inputs_async( mm_processor_kwargs = text_content.get("mm_processor_kwargs") negative_prompt_text = text_content.get("negative_prompt") # --- FLAGSCALE MODIFICATION --- - if multi_modal_data is not None and self._can_process_multimodal(): + if multi_modal_data is not None: return await self._process_multimodal_async( prompt_text, multi_modal_data, @@ -484,8 +454,6 @@ async def _prompt_to_llm_inputs_async( return token_inputs( prompt=prompt_text, prompt_token_ids=prompt_token_ids, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs, # --- FLAGSCALE MODIFICATION BEG --- negative_prompt=negative_prompt_text, negative_prompt_token_ids=negative_prompt_token_ids @@ -626,15 +594,13 @@ def _process_encoder_decoder_prompt( decoder_inputs = self._prompt_to_llm_inputs(decoder_input) # For multimodal model, override decoder prompt from processor # with explicit decoder prompt. 
- if self.model_config.is_multimodal_model and ( - self._can_process_multimodal()): + if self.model_config.is_multimodal_model: encoder_inputs, decoder_inputs = ( self._separate_enc_dec_inputs_from_mm_processor_outputs( encoder_inputs, decoder_inputs)) else: inputs = self._prompt_to_llm_inputs(prompt) - if self.model_config.is_multimodal_model and ( - self._can_process_multimodal()): + if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model encoder_inputs, decoder_inputs = ( self._separate_enc_dec_inputs_from_mm_processor_outputs( @@ -669,15 +635,13 @@ async def _process_encoder_decoder_prompt_async( # For multimodal model, override decoder prompt from processor # with explicit decoder prompt. - if self.model_config.is_multimodal_model and ( - self._can_process_multimodal()): + if self.model_config.is_multimodal_model: encoder_inputs, decoder_inputs = ( self._separate_enc_dec_inputs_from_mm_processor_outputs( encoder_inputs, decoder_inputs)) else: inputs = await self._prompt_to_llm_inputs_async(prompt) - if self.model_config.is_multimodal_model and ( - self._can_process_multimodal()): + if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model encoder_inputs, decoder_inputs = ( self._separate_enc_dec_inputs_from_mm_processor_outputs( diff --git a/flagscale/backends/vllm/vllm/model_executor/models/registry.py b/flagscale/backends/vllm/vllm/model_executor/models/registry.py index 935146d49..0a040d263 100644 --- a/flagscale/backends/vllm/vllm/model_executor/models/registry.py +++ b/flagscale/backends/vllm/vllm/model_executor/models/registry.py @@ -99,6 +99,7 @@ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), + "Plamo2ForCausalLM": ("plamo2", "Plamo2ForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), @@ -121,13 +122,11 @@ _EMBEDDING_MODELS = { # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), - "RobertaModel": ("roberta", "RobertaEmbeddingModel"), - "RobertaForMaskedLM": ("roberta", "RobertaEmbeddingModel"), - "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "GritLM": ("gritlm", "GritLM"), + "GteModel": ("bert", "GteEmbeddingModel"), "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"), "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501 "LlamaModel": ("llama", "LlamaForCausalLM"), @@ -137,12 +136,16 @@ if arch == "LlamaForCausalLM" }, "MistralModel": ("llama", "LlamaForCausalLM"), + "NomicBertModel": ("bert", "NomicBertEmbeddingModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), + "RobertaForMaskedLM": ("roberta", "RobertaEmbeddingModel"), + "RobertaModel": ("roberta", "RobertaEmbeddingModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), + "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", 
"Phi3VForCausalLM"), @@ -161,6 +164,8 @@ "RobertaForSequenceClassification"), "XLMRobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"), + "ModernBertForSequenceClassification": ("modernbert", + "ModernBertForSequenceClassification"), } _MULTIMODAL_MODELS = { @@ -174,10 +179,12 @@ "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), + "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 + "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 @@ -195,6 +202,7 @@ "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 + "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"), # [Encoder-decoder] @@ -208,6 +216,7 @@ _SPECULATIVE_DECODING_MODELS = { "EAGLEModel": ("eagle", "EAGLE"), "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"), + "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "MedusaModel": ("medusa", "Medusa"), "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), diff --git a/flagscale/backends/vllm/vllm/sampling_params.py b/flagscale/backends/vllm/vllm/sampling_params.py index 5ae9dc349..bdd70d100 100644 --- a/flagscale/backends/vllm/vllm/sampling_params.py +++ b/flagscale/backends/vllm/vllm/sampling_params.py @@ -38,6 +38,7 @@ class GuidedDecodingParams: """These are other options that can be set""" backend: Optional[str] = None whitespace_pattern: Optional[str] = None + structural_tag: Optional[str] = None @staticmethod def from_optional( @@ -48,9 +49,10 @@ def from_optional( json_object: Optional[bool] = None, backend: Optional[str] = None, whitespace_pattern: Optional[str] = None, + structural_tag: Optional[str] = None, ) -> Optional["GuidedDecodingParams"]: - if all(arg is None - for arg in (json, regex, choice, grammar, json_object)): + if all(arg is None for arg in (json, regex, choice, grammar, + json_object, structural_tag)): return None # Extract json schemas from pydantic models if isinstance(json, (BaseModel, type(BaseModel))): @@ -63,6 +65,7 @@ def from_optional( json_object=json_object, backend=backend, whitespace_pattern=whitespace_pattern, + structural_tag=structural_tag, ) @property @@ -79,6 +82,17 @@ def backend_options(self) -> list[str]: return [] return self.backend.split(":")[1].split(",") + def add_option(self, opt_name: str) 
-> None: + """Adds an option to the backend options.""" + if not self.backend: + self.backend = f":{opt_name}" + elif ":" not in self.backend: + self.backend += f":{opt_name}" + else: + options = set(self.backend_options()) + options.add(opt_name) + self.backend = f"{self.backend_name}:{','.join(sorted(options))}" + def no_fallback(self) -> bool: """Returns True if the "no-fallback" option is supplied for the guided decoding backend""" @@ -424,6 +438,10 @@ def _verify_args(self) -> None: and self.truncate_prompt_tokens < 1): raise ValueError(f"truncate_prompt_tokens must be >= 1, " f"got {self.truncate_prompt_tokens}") + assert isinstance(self.stop_token_ids, list) + if not all(isinstance(st_id, int) for st_id in self.stop_token_ids): + raise ValueError(f"stop_token_ids must contain only integers, " + f"got {self.stop_token_ids}.") assert isinstance(self.stop, list) if any(not stop_str for stop_str in self.stop): raise ValueError("stop cannot contain an empty string.") @@ -564,7 +582,7 @@ def __repr__(self) -> str: f"{self.spaces_between_special_tokens}, " f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " f"guided_decoding={self.guided_decoding}, " - f"extra_args={self.extra_args}" + f"extra_args={self.extra_args}, " f"guidance_scale={self.guidance_scale})") # --- FLAGSCALE MODIFICATION --- diff --git a/flagscale/backends/vllm/vllm/sequence.py b/flagscale/backends/vllm/vllm/sequence.py index 977e73c30..e7a994fa8 100644 --- a/flagscale/backends/vllm/vllm/sequence.py +++ b/flagscale/backends/vllm/vllm/sequence.py @@ -14,9 +14,9 @@ import msgspec import torch -from vllm.inputs import SingletonInputs, SingletonInputsAdapter +from vllm.inputs import SingletonInputs from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict +from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -419,7 +419,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> None: self.seq_id = seq_id - self.inputs = SingletonInputsAdapter(inputs) + self.inputs = inputs self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request @@ -448,31 +448,29 @@ def n_blocks(self) -> int: @property def prompt(self) -> Optional[str]: - return self.inputs.prompt + return self.inputs.get("prompt") @property def prompt_token_ids(self) -> list[int]: - return self.inputs.prompt_token_ids - - @property - def prompt_embeds(self) -> Optional[torch.Tensor]: - return self.inputs.prompt_embeds + return self.inputs["prompt_token_ids"] @property def token_type_ids(self) -> list[int]: - return self.inputs.token_type_ids + return self.inputs.get("token_type_ids", []) @property - def multi_modal_data(self) -> "MultiModalDataDict": - return self.inputs.multi_modal_data + def multi_modal_data(self) -> MultiModalKwargs: + if self.inputs["type"] == "multimodal": + return self.inputs["mm_kwargs"] + + return MultiModalKwargs({}) @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - return self.inputs.multi_modal_placeholders + if self.inputs["type"] == "multimodal": + return self.inputs["mm_placeholders"] - @property - def mm_processor_kwargs(self) -> dict[str, Any]: - return self.inputs.mm_processor_kwargs + return {} @property def lora_int_id(self) -> int: @@ -733,12 +731,12 @@ def token_type_ids(self) -> 
Optional[list[int]]: return self.first_seq.token_type_ids @property - def multi_modal_data(self) -> MultiModalDataDict: + def multi_modal_data(self) -> MultiModalKwargs: if self.first_seq.multi_modal_data: return self.first_seq.multi_modal_data elif self.encoder_seq is not None: return self.encoder_seq.multi_modal_data - return {} + return MultiModalKwargs({}) @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: @@ -748,14 +746,6 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: return self.encoder_seq.multi_modal_placeholders return {} - @property - def mm_processor_kwargs(self) -> dict[str, Any]: - if self.first_seq.multi_modal_data: - return self.first_seq.mm_processor_kwargs - elif self.encoder_seq is not None: - return self.encoder_seq.mm_processor_kwargs - return {} - @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 @@ -1004,12 +994,9 @@ class SequenceGroupMetadata( computed_block_nums: Optional[list[int]] = None state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) - # "MultiModalDataDict" types. We have to use Any due to msgspec - # doesn't allow to have union of 2 different dicts. token_type_ids: Optional[list[int]] = None - multi_modal_data: Optional[Any] = None + multi_modal_data: Optional[MultiModalKwargs] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - mm_processor_kwargs: Optional[dict[str, Any]] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[list[int]] = None prompt_adapter_request: Optional[PromptAdapterRequest] = None diff --git a/flagscale/backends/vllm/vllm/worker/model_runner.py b/flagscale/backends/vllm/vllm/worker/model_runner.py index f44e66712..35203fb23 100644 --- a/flagscale/backends/vllm/vllm/worker/model_runner.py +++ b/flagscale/backends/vllm/vllm/worker/model_runner.py @@ -23,7 +23,8 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_kv_transfer_group, get_pp_group +from vllm.distributed import get_pp_group +from vllm.distributed.kv_transfer import get_kv_transfer_group from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, graph_capture) from vllm.forward_context import get_forward_context, set_forward_context @@ -34,7 +35,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata, SamplingMetadataCache from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import supports_lora, supports_multimodal @@ -74,6 +75,7 @@ torch._dynamo.config.cache_size_limit = 128 torch._dynamo.config.accumulated_cache_size_limit = 128 + # --- FLAGSCALE MODIFICATION BEG --- # Know more about FlagGems: https://github.com/FlagOpen/FlagGems import os @@ -470,7 +472,6 @@ def __init__(self, self.enable_lora = self.runner.lora_config is not None self.enable_prompt_adapter = (self.runner.prompt_adapter_config is not None) - self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper # Attention metadata inputs. 
if self.attn_backend is not None: @@ -698,23 +699,15 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata, is_negative: bool = False): # --- FLAGSCALE MODIFICATION --- """If multi-modal data is given, add it to the input.""" - # NOTE: mm_data only includes the subset of multi-modal items that + # NOTE: mm_kwargs only includes the subset of multi-modal items that # intersect with the current prefill positions. positions = inter_data.input_positions[0] - mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( + mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( seq_group_metadata, range(positions[0], positions[0] + len(positions))) - if not mm_data: + if not mm_kwargs: return - if self.runner.mm_registry.has_processor(self.runner.model_config): - mm_kwargs = mm_data - else: - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - seq_group_metadata.mm_processor_kwargs, - ) - inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps @@ -722,11 +715,17 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) - assert image_grid_thw is not None or video_grid_thw is not None, ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw'.") + audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", + None) + assert ( + image_grid_thw is not None or video_grid_thw is not None + or audio_feature_lengths is not None), ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw' or " + "'audio_feature_lengths'.") second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) + use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) hf_config = self.runner.model_config.hf_config inter_data.mrope_input_positions = [None] * inter_data.n_seqs @@ -744,6 +743,8 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, second_per_grid_ts=second_per_grid_ts, context_len=inter_data.context_lens[seq_idx], seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, ) seq_data.mrope_position_delta = mrope_position_delta @@ -1120,15 +1121,13 @@ def __init__( # Multi-modal data support self.input_registry = input_registry self.mm_registry = mm_registry - self.multi_modal_input_mapper = mm_registry \ - .create_input_mapper(model_config) - self.mm_registry.init_mm_limits_per_prompt(self.model_config) # Lazy initialization self.model: nn.Module # Set after load_model # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None + self.sampler = get_sampler() set_cpu_offload_max_bytes( int(self.cache_config.cpu_offload_gb * 1024**3)) @@ -1164,14 +1163,9 @@ def load_model(self) -> None: logger.warning( "Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") - # It's necessary to distinguish between the - # max_position_embeddings of VLMs and LLMs. 
- if hasattr(self.model.config, "max_position_embeddings"): - max_pos_embeddings = ( - self.model.config.max_position_embeddings) - else: - max_pos_embeddings = ( - self.model.config.text_config.max_position_embeddings) + + # Use get_text_config() in case of multimodal models + text_config = self.model_config.hf_config.get_text_config() self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, @@ -1181,7 +1175,8 @@ def load_model(self) -> None: self.device, self.model.embedding_modules, self.model.embedding_padding_modules, - max_position_embeddings=max_pos_embeddings, + max_position_embeddings=text_config. + max_position_embeddings, ) self.model = self.lora_manager.create_lora_manager(self.model) time_after_load = time.perf_counter() @@ -1365,8 +1360,8 @@ def _dummy_run(self, dummy_data = self.input_registry \ .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) + seq_len, + self.mm_registry) seq = SequenceGroupMetadata( request_id=str(group_id), @@ -1867,7 +1862,7 @@ def execute_model( model_input.async_callback() # Sample the next token. - output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) From 9b424fe680068d705bcb02f01d15ec231240a24f Mon Sep 17 00:00:00 2001 From: zhaoyinglia Date: Tue, 29 Apr 2025 20:30:22 +0800 Subject: [PATCH 3/5] update cfg&server for v0.8.5 --- examples/aquila/conf/config_inference.yaml | 5 +- flagscale/backends/vllm/pyproject.toml | 173 ++++++++++++++++++ .../backends/vllm/requirements/build.txt | 9 + .../backends/vllm/vllm/core/scheduler.py | 1 - .../device_communicators/pynccl_wrapper.py | 5 +- .../kv_transfer/kv_connector/factory.py | 62 ++++++- 6 files changed, 241 insertions(+), 14 deletions(-) create mode 100644 flagscale/backends/vllm/pyproject.toml create mode 100644 flagscale/backends/vllm/requirements/build.txt diff --git a/examples/aquila/conf/config_inference.yaml b/examples/aquila/conf/config_inference.yaml index 56ded9e30..45fba784b 100644 --- a/examples/aquila/conf/config_inference.yaml +++ b/examples/aquila/conf/config_inference.yaml @@ -4,7 +4,7 @@ defaults: experiment: exp_name: aquila2 - exp_dir: ./outputs + exp_dir: ./outputs/${experiment.exp_name} task: type: inference backend: vllm @@ -12,8 +12,9 @@ experiment: runner: hostfile: null cmds: - before_start: source activate flagscale + before_start: source /root/miniconda3/bin/activate flagscale envs: + VLLM_LOGGING_LEVEL: "INFO" CUDA_VISIBLE_DEVICES: 0 CUDA_DEVICE_MAX_CONNECTIONS: 1 diff --git a/flagscale/backends/vllm/pyproject.toml b/flagscale/backends/vllm/pyproject.toml new file mode 100644 index 000000000..86b7e00b5 --- /dev/null +++ b/flagscale/backends/vllm/pyproject.toml @@ -0,0 +1,173 @@ +[build-system] +# Should be mirrored in requirements/build.txt +requires = [ + "cmake>=3.26", + "ninja", + "packaging", + "setuptools>=77.0.3", + "setuptools-scm>=8.0", + "torch == 2.6.0", + "wheel", + "jinja2", +] +build-backend = "setuptools.build_meta" + +[project] +name = "vllm" +authors = [{name = "vLLM Team"}] +license = "Apache-2.0" +license-files = ["LICENSE"] +readme = "README.md" +description = "A high-throughput and memory-efficient inference and serving engine for LLMs" +classifiers = [ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + 
"Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", +] +requires-python = ">=3.9,<3.13" +dynamic = [ "version", "dependencies", "optional-dependencies"] + +[project.urls] +Homepage="https://github.com/vllm-project/vllm" +Documentation="https://vllm.readthedocs.io/en/latest/" +Slack="http://slack.vllm.ai/" + +[project.scripts] +vllm = "vllm.entrypoints.cli.main:main" + +[tool.setuptools_scm] +# no extra settings needed, presence enables setuptools-scm + +[tool.setuptools.packages.find] +where = ["."] +include = ["vllm*"] + +[tool.yapfignore] +ignore_patterns = [ + "build/**", +] + +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 +exclude = [ + # External file, leaving license intact + "examples/other/fp8/quantizer/quantize.py", + "vllm/vllm_flash_attn/flash_attn_interface.pyi" +] + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] +# Python 3.8 typing. TODO: Remove these excludes after v1.0.0 +"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"] +"vllm/attention/**/*.py" = ["UP006", "UP035"] +"vllm/compilation/**/*.py" = ["UP006", "UP035"] +"vllm/core/**/*.py" = ["UP006", "UP035"] +"vllm/device_allocator/**/*.py" = ["UP006", "UP035"] +"vllm/distributed/**/*.py" = ["UP006", "UP035"] +"vllm/engine/**/*.py" = ["UP006", "UP035"] +"vllm/executor/**/*.py" = ["UP006", "UP035"] +"vllm/lora/**/*.py" = ["UP006", "UP035"] +"vllm/model_executor/**/*.py" = ["UP006", "UP035"] +"vllm/platforms/**/*.py" = ["UP006", "UP035"] +"vllm/plugins/**/*.py" = ["UP006", "UP035"] +"vllm/profiler/**/*.py" = ["UP006", "UP035"] +"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] +"vllm/spec_decode/**/*.py" = ["UP006", "UP035"] +"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"] +"vllm/triton_utils/**/*.py" = ["UP006", "UP035"] +"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"] +"vllm/worker/**/*.py" = ["UP006", "UP035"] +"vllm/utils.py" = ["UP006", "UP035"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.mypy] +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "silent" + +# After fixing type errors resulting from follow_imports: "skip" -> "silent", +# move the directory here and remove it from tools/mypy.sh +files = [ + "vllm/*.py", + "vllm/adapter_commons", + "vllm/assets", + "vllm/entrypoints", + "vllm/core", + "vllm/inputs", + "vllm/logging_utils", + "vllm/multimodal", + "vllm/platforms", + "vllm/transformers_utils", + "vllm/triton_utils", + "vllm/usage", +] +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", + # Ignore triton kernels in ops. 
+ 'vllm/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile, ElementE" +skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*" + +[tool.isort] +use_parentheses = true +skip_gitignore = true + +[tool.pytest.ini_options] +markers = [ + "skip_global_cleanup", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", + "quant_model: run this model test under Quantized category", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", + "skip_v1: do not run this test with v1", + "optional: optional tests that are automatically skipped, include --optional to run them", +] + +[tool.pymarkdown] +plugins.md004.style = "sublist" # ul-style +plugins.md013.enabled = false # line-length +plugins.md041.enabled = false # first-line-h1 +plugins.md033.enabled = false # inline-html +plugins.md024.allow_different_nesting = true # no-duplicate-headers diff --git a/flagscale/backends/vllm/requirements/build.txt b/flagscale/backends/vllm/requirements/build.txt new file mode 100644 index 000000000..901a89a73 --- /dev/null +++ b/flagscale/backends/vllm/requirements/build.txt @@ -0,0 +1,9 @@ +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=77.0.3 +setuptools-scm>=8 +torch==2.6.0 +wheel +jinja2>=3.1.6 diff --git a/flagscale/backends/vllm/vllm/core/scheduler.py b/flagscale/backends/vllm/vllm/core/scheduler.py index 1f7eb23ec..74563d0e8 100644 --- a/flagscale/backends/vllm/vllm/core/scheduler.py +++ b/flagscale/backends/vllm/vllm/core/scheduler.py @@ -1720,7 +1720,6 @@ def schedule( multi_modal_placeholders=( seq_group.multi_modal_placeholders if scheduler_outputs.num_prefill_groups > 0 else None), - mm_processor_kwargs=seq_group.mm_processor_kwargs, prompt_adapter_request=seq_group.prompt_adapter_request, ) else: diff --git a/flagscale/backends/vllm/vllm/distributed/device_communicators/pynccl_wrapper.py b/flagscale/backends/vllm/vllm/distributed/device_communicators/pynccl_wrapper.py index 7b490ca56..fa6f562f3 100644 --- a/flagscale/backends/vllm/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/flagscale/backends/vllm/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -1,6 +1,3 @@ -# Copied from https://github.com/vllm-project/vllm/blob/1ad957950ffc1552af5abda78c03d88ddb67945b/vllm/distributed/device_communicators/pynccl_wrapper.py. -# Below is the original copyright: - # SPDX-License-Identifier: Apache-2.0 # This file is a pure Python wrapper for the NCCL library. @@ -274,6 +271,7 @@ def ncclGetUniqueId(self) -> ncclUniqueId: ctypes.byref(unique_id))) return unique_id + # --- FLAGSCALE MODIFICATION BEG --- def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: """ Reconstructs an `ncclUniqueId` object from bytes data. 
@@ -294,6 +292,7 @@ def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: unique_id = ncclUniqueId() ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128) return unique_id + # --- FLAGSCALE MODIFICATION END --- def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId, rank: int) -> ncclComm_t: diff --git a/flagscale/backends/vllm/vllm/distributed/kv_transfer/kv_connector/factory.py b/flagscale/backends/vllm/vllm/distributed/kv_transfer/kv_connector/factory.py index 52de6757e..f4d1171d7 100644 --- a/flagscale/backends/vllm/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/flagscale/backends/vllm/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,18 +1,24 @@ -# Copied from https://github.com/vllm-project/vllm/blob/1ad957950ffc1552af5abda78c03d88ddb67945b/vllm/distributed/kv_transfer/kv_connector/factory.py. -# Below is the original copyright: # SPDX-License-Identifier: Apache-2.0 import importlib from typing import TYPE_CHECKING, Callable, Dict, Type +import vllm.envs as envs +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType +from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, + KVConnectorRole) +from vllm.logger import init_logger + from .base import KVConnectorBase if TYPE_CHECKING: from vllm.config import VllmConfig +logger = init_logger(__name__) + class KVConnectorFactory: - _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} + _registry: Dict[str, Callable[[], Type[KVConnectorBaseType]]] = {} @classmethod def register_connector(cls, name: str, module_path: str, @@ -21,30 +27,60 @@ def register_connector(cls, name: str, module_path: str, if name in cls._registry: raise ValueError(f"Connector '{name}' is already registered.") - def loader() -> Type[KVConnectorBase]: + def loader() -> Type[KVConnectorBaseType]: module = importlib.import_module(module_path) return getattr(module, class_name) cls._registry[name] = loader @classmethod - def create_connector(cls, rank: int, local_rank: int, - config: "VllmConfig") -> KVConnectorBase: + def create_connector_v0(cls, rank: int, local_rank: int, + config: "VllmConfig") -> KVConnectorBase: + if envs.VLLM_USE_V1: + raise ValueError("Attempting to initialize a V0 Connector, " + f"but found {envs.VLLM_USE_V1=}") + connector_name = config.kv_transfer_config.kv_connector if connector_name not in cls._registry: raise ValueError(f"Unsupported connector type: {connector_name}") connector_cls = cls._registry[connector_name]() + assert issubclass(connector_cls, KVConnectorBase) return connector_cls(rank, local_rank, config) + @classmethod + def create_connector_v1( + cls, + config: "VllmConfig", + role: KVConnectorRole, + ) -> KVConnectorBase_V1: + if not envs.VLLM_USE_V1: + raise ValueError("Attempting to initialize a V1 Connector, " + f"but found {envs.VLLM_USE_V1=}") + + connector_name = config.kv_transfer_config.kv_connector + connector_cls = cls._registry[connector_name]() + assert issubclass(connector_cls, KVConnectorBase_V1) + logger.info("Creating v1 connector with name: %s", connector_name) + # NOTE(Kuntai): v1 connector is explicitly separated into two roles. + # Scheduler connector: + # - Co-locate with scheduler process + # - Should only be used inside the Scheduler class + # Worker connector: + # - Co-locate with worker process + # - Should only be used inside the forward context & attention layer + # We build separately to enforce strict separation + return connector_cls(config, role) + # Register various connectors here. 
# The registration should not be done in each individual file, as we want to # only load the files corresponding to the current connector. +# --- FLAGSCALE MODIFICATION BEG --- KVConnectorFactory.register_connector( "P2pConnector", "vllm.distributed.kv_transfer.kv_connector.p2p_connector", "P2pConnector") - +# --- FLAGSCALE MODIFICATION END --- KVConnectorFactory.register_connector( "PyNcclConnector", "vllm.distributed.kv_transfer.kv_connector.simple_connector", @@ -63,4 +99,14 @@ def create_connector(cls, rank: int, local_rank: int, KVConnectorFactory.register_connector( "MooncakeStoreConnector", "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector", - "MooncakeStoreConnector") \ No newline at end of file + "MooncakeStoreConnector") + +KVConnectorFactory.register_connector( + "SharedStorageConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector", + "SharedStorageConnector") + +KVConnectorFactory.register_connector( + "LMCacheConnectorV1", + "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector", + "LMCacheConnectorV1") From 9655f0cf760f67246803688214b897e556bc6269 Mon Sep 17 00:00:00 2001 From: zhaoyinglia Date: Tue, 29 Apr 2025 20:49:30 +0800 Subject: [PATCH 4/5] fix install error --- flagscale/backends/vllm/pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/flagscale/backends/vllm/pyproject.toml b/flagscale/backends/vllm/pyproject.toml index 86b7e00b5..616cb1327 100644 --- a/flagscale/backends/vllm/pyproject.toml +++ b/flagscale/backends/vllm/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "cmake>=3.26", "ninja", "packaging", - "setuptools>=77.0.3", + "setuptools>=61", "setuptools-scm>=8.0", "torch == 2.6.0", "wheel", @@ -15,8 +15,7 @@ build-backend = "setuptools.build_meta" [project] name = "vllm" authors = [{name = "vLLM Team"}] -license = "Apache-2.0" -license-files = ["LICENSE"] +license = {text = "Apache-2.0"} readme = "README.md" description = "A high-throughput and memory-efficient inference and serving engine for LLMs" classifiers = [ From 8a44c458b07fe6c9651ed03ed78f46d838c6e598 Mon Sep 17 00:00:00 2001 From: zhaoyinglia Date: Tue, 29 Apr 2025 20:51:30 +0800 Subject: [PATCH 5/5] rm build.txt --- flagscale/backends/vllm/requirements/build.txt | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 flagscale/backends/vllm/requirements/build.txt diff --git a/flagscale/backends/vllm/requirements/build.txt b/flagscale/backends/vllm/requirements/build.txt deleted file mode 100644 index 901a89a73..000000000 --- a/flagscale/backends/vllm/requirements/build.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=77.0.3 -setuptools-scm>=8 -torch==2.6.0 -wheel -jinja2>=3.1.6