|
34 | 34 | from vllm.config.compilation import (CompilationConfig, CompilationLevel, |
35 | 35 | PassConfig) |
36 | 36 | from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig |
| 37 | +from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy |
37 | 38 | from vllm.config.utils import ConfigType, config |
38 | 39 | from vllm.logger import init_logger |
39 | 40 | from vllm.model_executor.layers.quantization import QuantizationMethods |
|
47 | 48 | try_get_tokenizer_config, uses_mrope) |
48 | 49 | from vllm.transformers_utils.s3_utils import S3Model |
49 | 50 | from vllm.transformers_utils.utils import is_s3, maybe_model_redirect |
50 | | -# yapf conflicts with isort for this block |
51 | | -# yapf: disable |
52 | | -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, |
53 | | - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, |
54 | | - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType, |
| 51 | +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType, |
55 | 52 | LazyLoader, common_broadcastable_dtype, random_uuid) |
56 | 53 |
|
57 | | -# yapf: enable |
58 | | - |
59 | 54 | if TYPE_CHECKING: |
60 | 55 | from _typeshed import DataclassInstance |
61 | 56 | from transformers.configuration_utils import PretrainedConfig |
@@ -1820,313 +1815,6 @@ def __post_init__(self): |
1820 | 1815 | self.ignore_patterns = ["original/**/*"] |
1821 | 1816 |
|
1822 | 1817 |
|
1823 | | -PreemptionMode = Literal["swap", "recompute"] |
1824 | | -SchedulerPolicy = Literal["fcfs", "priority"] |
1825 | | - |
1826 | | - |
1827 | | -@config |
1828 | | -@dataclass |
1829 | | -class SchedulerConfig: |
1830 | | - """Scheduler configuration.""" |
1831 | | - |
1832 | | - runner_type: RunnerType = "generate" |
1833 | | - """The runner type to launch for the model.""" |
1834 | | - |
1835 | | - max_num_batched_tokens: SkipValidation[int] = None # type: ignore |
1836 | | - """Maximum number of tokens to be processed in a single iteration. |
1837 | | -
|
1838 | | - This config has no static default. If left unspecified by the user, it will |
1839 | | - be set in `EngineArgs.create_engine_config` based on the usage context.""" |
1840 | | - |
1841 | | - max_num_seqs: SkipValidation[int] = None # type: ignore |
1842 | | - """Maximum number of sequences to be processed in a single iteration. |
1843 | | -
|
1844 | | - This config has no static default. If left unspecified by the user, it will |
1845 | | - be set in `EngineArgs.create_engine_config` based on the usage context.""" |
1846 | | - |
1847 | | - max_model_len: SkipValidation[int] = None # type: ignore |
1848 | | - """Maximum length of a sequence (including prompt and generated text). This |
1849 | | - is primarily set in `ModelConfig` and that value should be manually |
1850 | | - duplicated here.""" |
1851 | | - |
1852 | | - max_num_partial_prefills: int = 1 |
1853 | | - """For chunked prefill, the maximum number of sequences that can be |
1854 | | - partially prefilled concurrently.""" |
1855 | | - |
1856 | | - max_long_partial_prefills: int = 1 |
1857 | | - """For chunked prefill, the maximum number of prompts longer than |
1858 | | - long_prefill_token_threshold that will be prefilled concurrently. Setting |
1859 | | - this less than max_num_partial_prefills will allow shorter prompts to jump |
1860 | | - the queue in front of longer prompts in some cases, improving latency.""" |
1861 | | - |
1862 | | - long_prefill_token_threshold: int = 0 |
1863 | | - """For chunked prefill, a request is considered long if the prompt is |
1864 | | - longer than this number of tokens.""" |
1865 | | - |
1866 | | - num_lookahead_slots: int = 0 |
1867 | | - """The number of slots to allocate per sequence per |
1868 | | - step, beyond the known token ids. This is used in speculative |
1869 | | - decoding to store KV activations of tokens which may or may not be |
1870 | | - accepted. |
1871 | | -
|
1872 | | - NOTE: This will be replaced by speculative config in the future; it is |
1873 | | - present to enable correctness tests until then.""" |
1874 | | - |
1875 | | - cuda_graph_sizes: list[int] = field(default_factory=list) |
1876 | | - """Cuda graph capture sizes |
1877 | | - 1. if none is provided, the default is set to [min(max_num_seqs * 2, 512)] |
1878 | | - 2. if one value is provided, then the capture list will follow the |
1879 | | - pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] |
1880 | | - 3. if more than one value (e.g. 1 2 128) is provided, then the capture list |
1881 | | - will follow the provided list.""" |
1882 | | - |
1883 | | - delay_factor: float = 0.0 |
1884 | | - """Apply a delay (of delay factor multiplied by previous |
1885 | | - prompt latency) before scheduling next prompt.""" |
1886 | | - |
1887 | | - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore |
1888 | | - """If True, prefill requests can be chunked based |
1889 | | - on the remaining max_num_batched_tokens.""" |
1890 | | - |
1891 | | - is_multimodal_model: bool = False |
1892 | | - """True if the model is multimodal.""" |
1893 | | - |
1894 | | - # TODO (ywang96): Make this configurable. |
1895 | | - max_num_encoder_input_tokens: int = field(init=False) |
1896 | | - """Multimodal encoder compute budget, only used in V1. |
1897 | | -
|
1898 | | - NOTE: This is not currently configurable. It will be overridden by |
1899 | | - max_num_batched_tokens in case max multimodal embedding size is larger.""" |
1900 | | - |
1901 | | - # TODO (ywang96): Make this configurable. |
1902 | | - encoder_cache_size: int = field(init=False) |
1903 | | - """Multimodal encoder cache size, only used in V1. |
1904 | | -
|
1905 | | - NOTE: This is not currently configurable. It will be overridden by |
1906 | | - max_num_batched_tokens in case max multimodal embedding size is larger.""" |
1907 | | - |
1908 | | - preemption_mode: Optional[PreemptionMode] = None |
1909 | | - """Whether to perform preemption by swapping or |
1910 | | - recomputation. If not specified, we determine the mode as follows: |
1911 | | - We use recomputation by default since it incurs lower overhead than |
1912 | | - swapping. However, when the sequence group has multiple sequences |
1913 | | - (e.g., beam search), recomputation is not currently supported. In |
1914 | | - such a case, we use swapping instead.""" |
1915 | | - |
1916 | | - num_scheduler_steps: int = 1 |
1917 | | - """Maximum number of forward steps per scheduler call.""" |
1918 | | - |
1919 | | - multi_step_stream_outputs: bool = True |
1920 | | - """If False, then multi-step will stream outputs at the end of all steps.""" |
1921 | | - |
1922 | | - send_delta_data: bool = False |
1923 | | - """Private API. If used, scheduler sends delta data to |
1924 | | - workers instead of the entire data. It should be enabled only |
1925 | | - when SPMD worker architecture is enabled. I.e., |
1926 | | - VLLM_USE_RAY_SPMD_WORKER=1""" |
1927 | | - |
1928 | | - policy: SchedulerPolicy = "fcfs" |
1929 | | - """The scheduling policy to use:\n |
1930 | | - - "fcfs" means first come first served, i.e. requests are handled in order |
1931 | | - of arrival.\n |
1932 | | - - "priority" means requests are handled based on given priority (lower |
1933 | | - value means earlier handling), with time of arrival deciding any ties.""" |
1934 | | - |
1935 | | - chunked_prefill_enabled: bool = field(init=False) |
1936 | | - """True if chunked prefill is enabled.""" |
1937 | | - |
1938 | | - disable_chunked_mm_input: bool = False |
1939 | | - """If set to True and chunked prefill is enabled, we do not want to |
1940 | | - partially schedule a multimodal item. Only used in V1. |
1941 | | - This ensures that if a request has a mixed prompt |
1942 | | - (like text tokens TTTT followed by image tokens IIIIIIIIII) where only |
1943 | | - some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), |
1944 | | - it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" |
1945 | | - |
1946 | | - # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) |
1947 | | - # or "mod.custom_class". |
1948 | | - scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" |
1949 | | - """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the |
1950 | | - default scheduler. Can be a class directly or the path to a class of form |
1951 | | - "mod.custom_class".""" |
1952 | | - |
1953 | | - disable_hybrid_kv_cache_manager: bool = False |
1954 | | - """If set to True, KV cache manager will allocate the same size of KV cache |
1955 | | - for all attention layers even if there are multiple types of attention layers |
1956 | | - like full attention and sliding window attention. |
1957 | | - """ |
1958 | | - |
1959 | | - async_scheduling: bool = False |
1960 | | - """EXPERIMENTAL: If set to True, perform async scheduling. This may help |
1961 | | - reduce the CPU overheads, leading to better latency and throughput. However, |
1962 | | - async scheduling is currently not supported with some features such as |
1963 | | - structured outputs, speculative decoding, and pipeline parallelism. |
1964 | | - """ |
1965 | | - |
1966 | | - def compute_hash(self) -> str: |
1967 | | - """ |
1968 | | - WARNING: Whenever a new field is added to this config, |
1969 | | - ensure that it is included in the factors list if |
1970 | | - it affects the computation graph. |
1971 | | -
|
1972 | | - Provide a hash that uniquely identifies all the configs |
1973 | | - that affect the structure of the computation |
1974 | | - graph from input ids/embeddings to the final hidden states, |
1975 | | - excluding anything before input ids/embeddings and after |
1976 | | - the final hidden states. |
1977 | | - """ |
1978 | | - # no factors to consider. |
1979 | | - # this config will not affect the computation graph. |
1980 | | - factors: list[Any] = [] |
1981 | | - hash_str = hashlib.md5(str(factors).encode(), |
1982 | | - usedforsecurity=False).hexdigest() |
1983 | | - return hash_str |
1984 | | - |
1985 | | - def __post_init__(self) -> None: |
1986 | | - if self.max_model_len is None: |
1987 | | - self.max_model_len = 8192 |
1988 | | - |
1989 | | - if self.max_num_seqs is None: |
1990 | | - self.max_num_seqs = 128 |
1991 | | - |
1992 | | - if self.max_num_batched_tokens is None: |
1993 | | - if self.enable_chunked_prefill: |
1994 | | - if self.num_scheduler_steps > 1: |
1995 | | - # Multi-step Chunked-Prefill doesn't allow prompt-chunking |
1996 | | - # for now. Have max_num_batched_tokens set to max_model_len |
1997 | | - # so we don't reject sequences on account of a short |
1998 | | - # max_num_batched_tokens. |
1999 | | - self.max_num_batched_tokens = max( |
2000 | | - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) |
2001 | | - else: |
2002 | | - self.max_num_batched_tokens = ( |
2003 | | - DEFAULT_MAX_NUM_BATCHED_TOKENS) |
2004 | | - else: |
2005 | | - # If max_model_len is too short, use |
2006 | | - # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value |
2007 | | - # for higher throughput. |
2008 | | - self.max_num_batched_tokens = max( |
2009 | | - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) |
2010 | | - |
2011 | | - if self.runner_type == "pooling": |
2012 | | - # Choose specific value for higher throughput |
2013 | | - self.max_num_batched_tokens = max( |
2014 | | - self.max_num_batched_tokens, |
2015 | | - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, |
2016 | | - ) |
2017 | | - if self.is_multimodal_model: |
2018 | | - # The value needs to be at least the number of multimodal tokens |
2019 | | - self.max_num_batched_tokens = max( |
2020 | | - self.max_num_batched_tokens, |
2021 | | - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, |
2022 | | - ) |
2023 | | - |
2024 | | - # When using default settings, |
2025 | | - # Ensure max_num_batched_tokens does not exceed model limit. |
2026 | | - # Some models (e.g., Whisper) have embeddings tied to max length. |
2027 | | - self.max_num_batched_tokens = min( |
2028 | | - self.max_num_seqs * self.max_model_len, |
2029 | | - self.max_num_batched_tokens) |
2030 | | - |
2031 | | - self.max_num_encoder_input_tokens = self.max_num_batched_tokens |
2032 | | - self.encoder_cache_size = self.max_num_batched_tokens |
2033 | | - |
2034 | | - if self.enable_chunked_prefill: |
2035 | | - logger.info( |
2036 | | - "Chunked prefill is enabled with max_num_batched_tokens=%d.", |
2037 | | - self.max_num_batched_tokens) |
2038 | | - |
2039 | | - self.chunked_prefill_enabled = self.enable_chunked_prefill |
2040 | | - if self.max_num_partial_prefills > 1: |
2041 | | - if self.long_prefill_token_threshold == 0: |
2042 | | - self.long_prefill_token_threshold = int(self.max_model_len * |
2043 | | - 0.04) |
2044 | | - |
2045 | | - logger.info( |
2046 | | - "Concurrent partial prefills enabled with " |
2047 | | - "max_num_partial_prefills=%d, max_long_partial_prefills=%d, " |
2048 | | - "long_prefill_token_threshold=%d", |
2049 | | - self.max_num_partial_prefills, self.max_long_partial_prefills, |
2050 | | - self.long_prefill_token_threshold) |
2051 | | - |
2052 | | - # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)]. |
2053 | | - # This avoids OOM in tight memory scenarios with small max_num_seqs, |
2054 | | - # and prevents capture of many large graphs (>512) that would greatly |
2055 | | - # increase startup time with limited performance benefit. |
2056 | | - if not self.cuda_graph_sizes: |
2057 | | - self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)] |
2058 | | - |
2059 | | - if self.async_scheduling: |
2060 | | - self.scheduler_cls = ( |
2061 | | - "vllm.v1.core.sched.async_scheduler.AsyncScheduler") |
2062 | | - |
2063 | | - @model_validator(mode='after') |
2064 | | - def _verify_args(self) -> Self: |
2065 | | - if (self.max_num_batched_tokens < self.max_model_len |
2066 | | - and not self.chunked_prefill_enabled): |
2067 | | - raise ValueError( |
2068 | | - f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " |
2069 | | - f"smaller than max_model_len ({self.max_model_len}). " |
2070 | | - "This effectively limits the maximum sequence length to " |
2071 | | - "max_num_batched_tokens and makes vLLM reject longer " |
2072 | | - "sequences. Please increase max_num_batched_tokens or " |
2073 | | - "decrease max_model_len.") |
2074 | | - |
2075 | | - if self.max_num_batched_tokens < self.max_num_seqs: |
2076 | | - raise ValueError( |
2077 | | - f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " |
2078 | | - "be greater than or equal to max_num_seqs " |
2079 | | - f"({self.max_num_seqs}).") |
2080 | | - |
2081 | | - if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: |
2082 | | - logger.warning( |
2083 | | - "max_num_batched_tokens (%d) exceeds max_num_seqs " |
2084 | | - "* max_model_len (%d). This may lead to unexpected behavior.", |
2085 | | - self.max_num_batched_tokens, |
2086 | | - self.max_num_seqs * self.max_model_len) |
2087 | | - |
2088 | | - if self.num_lookahead_slots < 0: |
2089 | | - raise ValueError( |
2090 | | - "num_lookahead_slots " |
2091 | | - f"({self.num_lookahead_slots}) must be greater than or " |
2092 | | - "equal to 0.") |
2093 | | - |
2094 | | - if self.num_scheduler_steps < 1: |
2095 | | - raise ValueError( |
2096 | | - "num_scheduler_steps " |
2097 | | - f"({self.num_scheduler_steps}) must be greater than or " |
2098 | | - "equal to 1.") |
2099 | | - |
2100 | | - if self.max_num_partial_prefills < 1: |
2101 | | - raise ValueError( |
2102 | | - f"max_num_partial_prefills ({self.max_num_partial_prefills}) " |
2103 | | - "must be greater than or equal to 1.") |
2104 | | - elif self.max_num_partial_prefills > 1: |
2105 | | - if not self.chunked_prefill_enabled: |
2106 | | - raise ValueError("Chunked prefill must be enabled to set " |
2107 | | - "max_num_partial_prefills > 1.") |
2108 | | - |
2109 | | - if self.long_prefill_token_threshold > self.max_model_len: |
2110 | | - raise ValueError( |
2111 | | - "long_prefill_token_threshold " |
2112 | | - f"({self.long_prefill_token_threshold}) cannot be greater " |
2113 | | - f"than the max_model_len ({self.max_model_len}).") |
2114 | | - |
2115 | | - if (self.max_long_partial_prefills |
2116 | | - < 1) or (self.max_long_partial_prefills |
2117 | | - > self.max_num_partial_prefills): |
2118 | | - raise ValueError( |
2119 | | - f"max_long_partial_prefills ({self.max_long_partial_prefills}) " |
2120 | | - "must be greater than or equal to 1 and less than or equal to " |
2121 | | - f"max_num_partial_prefills ({self.max_num_partial_prefills}).") |
2122 | | - |
2123 | | - return self |
2124 | | - |
2125 | | - @property |
2126 | | - def is_multi_step(self) -> bool: |
2127 | | - return self.num_scheduler_steps > 1 |
2128 | | - |
2129 | | - |
2130 | 1818 | Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"] |
2131 | 1819 |
|
2132 | 1820 |
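With this move, `SchedulerConfig` and `SchedulerPolicy` are defined in `vllm.config.scheduler` and re-imported at the top of this file, so the class itself is unchanged. A minimal usage sketch of the new import path, assuming the dataclass fields stay as shown in the removed block above (the concrete values below are illustrative only):

```python
# Sketch only: the import path reflects the line added at the top of this diff;
# field names mirror the dataclass removed above, and the values are made up.
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy

policy: SchedulerPolicy = "fcfs"  # or "priority"

scheduler_config = SchedulerConfig(
    max_num_batched_tokens=8192,
    max_num_seqs=128,
    max_model_len=4096,
    enable_chunked_prefill=True,
    policy=policy,
)

# __post_init__ copies enable_chunked_prefill into chunked_prefill_enabled.
assert scheduler_config.chunked_prefill_enabled
```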
|
|
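For reference, the capture-size rules documented on the moved `cuda_graph_sizes` field can be restated as a small helper. This is only an illustrative sketch of the documented behavior; the helper name is made up and does not exist in vLLM:

```python
def resolve_cuda_graph_sizes(cuda_graph_sizes: list[int],
                             max_num_seqs: int) -> list[int]:
    """Illustrative restatement of the documented cuda_graph_sizes rules."""
    if not cuda_graph_sizes:
        # 1. Nothing provided: default to [min(max_num_seqs * 2, 512)],
        #    matching the __post_init__ fallback in the moved code.
        return [min(max_num_seqs * 2, 512)]
    if len(cuda_graph_sizes) == 1:
        # 2. One value: [1, 2, 4] plus multiples of 8 up to that value.
        limit = cuda_graph_sizes[0]
        return [1, 2, 4] + list(range(8, limit + 1, 8))
    # 3. Several values: use the provided list as-is.
    return list(cuda_graph_sizes)


# e.g. a single value of 32 expands to [1, 2, 4, 8, 16, 24, 32]
print(resolve_cuda_graph_sizes([32], max_num_seqs=128))
```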