@@ -1522,6 +1525,9 @@ def __post_init__(self):
             self.ignore_patterns = ["original/**/*"]
 
 
+DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
+
+
 @config
 @dataclass
 class ParallelConfig:
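A note on the pattern above: defining the allowed backend names once as a `typing.Literal` alias lets static checkers reject bad strings, and `typing.get_args` can reuse the same alias for a runtime check. A minimal, self-contained sketch of that pattern (the `validate_backend` helper is illustrative, not part of vLLM):

```python
from typing import Literal, get_args

DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]

def validate_backend(backend: str) -> DistributedExecutorBackend:
    # get_args() recovers ("ray", "mp", "uni", "external_launcher") from the
    # alias, so this runtime check can never drift from the type annotation.
    if backend not in get_args(DistributedExecutorBackend):
        raise ValueError(f"unknown distributed executor backend: {backend!r}")
    return backend  # type: ignore[return-value]
```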
@@ -1563,7 +1566,7 @@ class ParallelConfig:
     placement_group: Optional["PlacementGroup"] = None
     """ray distributed model workers placement group."""
 
-    distributed_executor_backend: Optional[Union[str,
+    distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
                                                  type["ExecutorBase"]]] = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
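Since `distributed_executor_backend` is a union, it accepts either one of the literal backend names or an executor class directly. A hedged sketch of what the new annotation permits (`ExecutorBase` here is a stand-in class, not the vLLM import):

```python
from typing import Literal, Optional, Union

DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]

class ExecutorBase:  # stand-in for vLLM's ExecutorBase
    pass

class CustomExecutor(ExecutorBase):
    pass

# Both spellings satisfy the annotation on distributed_executor_backend:
by_name: Optional[Union[DistributedExecutorBackend,
                        type[ExecutorBase]]] = "mp"
by_class: Optional[Union[DistributedExecutorBackend,
                         type[ExecutorBase]]] = CustomExecutor
```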
@@ -1687,7 +1690,7 @@ def __post_init__(self) -> None:
             # current node and we aren't in a ray placement group.
 
             from vllm.executor import ray_utils
-            backend = "mp"
+            backend: DistributedExecutorBackend = "mp"
             ray_found = ray_utils.ray_is_available()
             if current_platform.is_neuron():
                 # neuron uses single process to control multiple devices
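The explicit annotation on the local `backend` variable is what makes later reassignments type-check: without it, mypy infers plain `str` for the mutable local, which is not assignable where the `Literal` alias is expected. A small illustration of the difference (assumed mypy behavior; `set_backend` is a hypothetical consumer):

```python
from typing import Literal

DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]

def set_backend(backend: DistributedExecutorBackend) -> None:
    pass

untyped = "mp"                             # mypy infers str for this local
typed: DistributedExecutorBackend = "mp"   # declared literal type
typed = "ray"                              # reassignment is still checked
set_backend(typed)                         # OK
# set_backend(untyped)                     # mypy error: str is not assignable
```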
@@ -1755,92 +1758,124 @@ def _verify_args(self) -> None:
                 "worker_extension_cls must be a string (qualified class name).")
 
 
+SchedulerPolicy = Literal["fcfs", "priority"]
+
+
+@config
 @dataclass
 class SchedulerConfig:
     """Scheduler configuration."""
 
-    runner_type: str = "generate"  # The runner type to launch for the model.
+    runner_type: RunnerType = "generate"
+    """The runner type to launch for the model."""
 
-    # Maximum number of tokens to be processed in a single iteration.
-    max_num_batched_tokens: int = field(default=None)  # type: ignore
+    max_num_batched_tokens: int = None  # type: ignore
+    """Maximum number of tokens to be processed in a single iteration.
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    # Maximum number of sequences to be processed in a single iteration.
-    max_num_seqs: int = 128
+    max_num_seqs: int = None  # type: ignore
+    """Maximum number of sequences to be processed in a single iteration.
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    # Maximum length of a sequence (including prompt and generated text).
-    max_model_len: int = 8192
+    max_model_len: int = None  # type: ignore
+    """Maximum length of a sequence (including prompt and generated text). This
+    is primarily set in `ModelConfig` and that value should be manually
+    duplicated here."""
 
-    # Maximum number of sequences that can be partially prefilled concurrently
     max_num_partial_prefills: int = 1
+    """For chunked prefill, the maximum number of sequences that can be
+    partially prefilled concurrently."""
 
-    # Maximum number of "very long prompt" sequences that can be prefilled
-    # concurrently (long is defined by long_prefill_threshold)
     max_long_partial_prefills: int = 1
+    """For chunked prefill, the maximum number of prompts longer than
+    long_prefill_token_threshold that will be prefilled concurrently. Setting
+    this less than max_num_partial_prefills will allow shorter prompts to jump
+    the queue in front of longer prompts in some cases, improving latency."""
 
-    # calculate context length that determines which sequences are
-    # considered "long"
     long_prefill_token_threshold: int = 0
+    """For chunked prefill, a request is considered long if the prompt is
+    longer than this number of tokens."""
 
-    # The number of slots to allocate per sequence per
-    # step, beyond the known token ids. This is used in speculative
-    # decoding to store KV activations of tokens which may or may not be
-    # accepted.
     num_lookahead_slots: int = 0
+    """The number of slots to allocate per sequence per
+    step, beyond the known token ids. This is used in speculative
+    decoding to store KV activations of tokens which may or may not be
+    accepted.
+
+    NOTE: This will be replaced by speculative config in the future; it is
+    present to enable correctness tests until then."""
 
-    # Apply a delay (of delay factor multiplied by previous
-    # prompt latency) before scheduling next prompt.
     delay_factor: float = 0.0
+    """Apply a delay (of delay factor multiplied by previous
+    prompt latency) before scheduling next prompt."""
 
-    # If True, prefill requests can be chunked based
-    # on the remaining max_num_batched_tokens.
-    enable_chunked_prefill: bool = False
+    enable_chunked_prefill: bool = None  # type: ignore
+    """If True, prefill requests can be chunked based
+    on the remaining max_num_batched_tokens."""
 
     is_multimodal_model: bool = False
+    """True if the model is multimodal."""
+
+    # TODO (ywang96): Make this configurable.
+    max_num_encoder_input_tokens: int = field(init=False)
+    """Multimodal encoder compute budget, only used in V1.
+
+    NOTE: This is not currently configurable. It will be overridden by
+    max_num_batched_tokens in case max multimodal embedding size is larger."""
+
+    # TODO (ywang96): Make this configurable.
+    encoder_cache_size: int = field(init=False)
+    """Multimodal encoder cache size, only used in V1.
+
+    NOTE: This is not currently configurable. It will be overridden by
+    max_num_batched_tokens in case max multimodal embedding size is larger."""
 
-    # NOTE: The following multimodal encoder budget will be initialized to
-    # max_num_batched_tokens and overridden in case max multimodal embedding
-    # size is larger.
-    # TODO (ywang96): Make these configurable.
-    # Multimodal encoder compute budget, only used in V1
-    max_num_encoder_input_tokens: int = field(default=None)  # type: ignore
-
-    # Multimodal encoder cache size, only used in V1
-    encoder_cache_size: int = field(default=None)  # type: ignore
-
-    # Whether to perform preemption by swapping or
-    # recomputation. If not specified, we determine the mode as follows:
-    # We use recomputation by default since it incurs lower overhead than
-    # swapping. However, when the sequence group has multiple sequences
-    # (e.g., beam search), recomputation is not currently supported. In
-    # such a case, we use swapping instead.
     preemption_mode: Optional[str] = None
+    """Whether to perform preemption by swapping or
+    recomputation. If not specified, we determine the mode as follows:
+    We use recomputation by default since it incurs lower overhead than
+    swapping. However, when the sequence group has multiple sequences
+    (e.g., beam search), recomputation is not currently supported. In
+    such a case, we use swapping instead."""
 
     num_scheduler_steps: int = 1
+    """Maximum number of forward steps per scheduler call."""
 
-    multi_step_stream_outputs: bool = False
+    multi_step_stream_outputs: bool = True
+    """If False, then multi-step will stream outputs at the end of all steps."""
 
-    # Private API. If used, scheduler sends delta data to
-    # workers instead of an entire data. It should be enabled only
-    # when SPMD worker architecture is enabled. I.e.,
-    # VLLM_USE_RAY_SPMD_WORKER=1
     send_delta_data: bool = False
-
-    # The scheduling policy to use. "fcfs" (default) or "priority".
-    policy: str = "fcfs"
+    """Private API. If used, the scheduler sends delta data to
+    workers instead of the entire data. It should be enabled only
+    when the SPMD worker architecture is enabled, i.e.
+    VLLM_USE_RAY_SPMD_WORKER=1."""
+
+    policy: SchedulerPolicy = "fcfs"
+    """The scheduling policy to use:\n
+    - "fcfs" means first come first served, i.e. requests are handled in order
+    of arrival.\n
+    - "priority" means requests are handled based on given priority (lower
+    value means earlier handling), with time of arrival deciding any ties."""
 
     chunked_prefill_enabled: bool = field(init=False)
+    """True if chunked prefill is enabled."""
 
-    # If set to true and chunked prefill is enabled, we do not want to
-    # partially schedule a multimodal item. Only used in V1
-    # This ensures that if a request has a mixed prompt
-    # (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
-    # some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
-    # it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
     disable_chunked_mm_input: bool = False
+    """If set to true and chunked prefill is enabled, we do not want to
+    partially schedule a multimodal item. Only used in V1. This ensures
+    that if a request has a mixed prompt
+    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
+    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
+    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
 
-    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
-    # or "mod.custom_class".
     scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
+    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
+    default scheduler. Can be a class directly or the path to a class of form
+    "mod.custom_class"."""
 
     def compute_hash(self) -> str:
         """
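The `field(init=False)` attributes above (`max_num_encoder_input_tokens`, `encoder_cache_size`, `chunked_prefill_enabled`) are excluded from the generated `__init__` and derived from the user-supplied fields in `__post_init__`. A simplified stand-in showing just the dataclass mechanics, not the real `SchedulerConfig` logic:

```python
from dataclasses import dataclass, field

@dataclass
class MiniSchedulerConfig:
    max_num_batched_tokens: int = 2048
    enable_chunked_prefill: bool = False

    # Not constructor arguments; always computed from the fields above.
    max_num_encoder_input_tokens: int = field(init=False)
    chunked_prefill_enabled: bool = field(init=False)

    def __post_init__(self) -> None:
        # Derived values stay consistent with whatever the caller passed in.
        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
        self.chunked_prefill_enabled = self.enable_chunked_prefill

cfg = MiniSchedulerConfig(max_num_batched_tokens=4096)
assert cfg.max_num_encoder_input_tokens == 4096  # set without being passed
```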
@@ -1862,6 +1897,18 @@ def compute_hash(self) -> str:
         return hash_str
 
     def __post_init__(self) -> None:
+        if self.max_model_len is None:
+            self.max_model_len = 8192
+            logger.warning(
+                "max_model_len is not set. Defaulting to arbitrary value "
+                "of %d.", self.max_model_len)
+
+        if self.max_num_seqs is None:
+            self.max_num_seqs = 128
+            logger.warning(
+                "max_num_seqs is not set. Defaulting to arbitrary value "
+                "of %d.", self.max_num_seqs)
+
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
                 if self.num_scheduler_steps > 1:
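The `None  # type: ignore` defaults act as sentinels: the annotation stays `int` for consumers, while unset values are resolved later, either here in `__post_init__` with a warning, or in `EngineArgs.create_engine_config` based on the usage context, as the docstrings above describe. A hedged sketch of that deferred-default pattern (the `UsageContext` enum and budget numbers are illustrative only, not vLLM's actual values):

```python
import logging
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

class UsageContext(Enum):  # illustrative stand-in, not vLLM's actual enum
    LLM_CLASS = "llm"
    OPENAI_SERVER = "server"

@dataclass
class MiniSchedulerConfig:
    # Annotated as int for consumers, but None until resolved.
    max_num_batched_tokens: int = None  # type: ignore

def resolve(cfg: MiniSchedulerConfig, ctx: UsageContext) -> None:
    if cfg.max_num_batched_tokens is None:
        # Hypothetical budgets: larger for offline batching, smaller for
        # latency-sensitive serving.
        cfg.max_num_batched_tokens = (8192 if ctx is UsageContext.LLM_CLASS
                                      else 2048)
        logger.warning("max_num_batched_tokens is not set. Defaulting to %d.",
                       cfg.max_num_batched_tokens)
```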