From ab9d63f90966ce246488958a2f7d8cdb2846eede Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 14 Aug 2025 12:13:46 -0400 Subject: [PATCH 01/31] LP docs draft Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 101 ++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 docs/features/custom_logitsprocs.md diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md new file mode 100644 index 000000000000..d782439b1d8a --- /dev/null +++ b/docs/features/custom_logitsprocs.md @@ -0,0 +1,101 @@ +# Custom Logits Processors + +This document shows you how to augment vLLM with custom logits processors. + +## Build a custom logits processor and pass it to the offline `LLM` engine + +Subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and override the following methods +* `__init__(self, vllm_config: vllm.config.VllmConfig, device: torch.device, is_pin_memory: bool)` + * `vllm_config`: vLLM engine configuration +* `is_argmax_invariant(self)` +* `update_state(self, batch_update: Optional[vllm.v1.sample.logits_processor.BatchUpdate])` + * `batch_update`: representation of added/removed/moved requests in the vLLM persistent batch during the most recent engine step +* `apply(self, logits: torch.Tensor)` + * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. + +The contrived example below implements a + +??? 
code "Example custom logits processor definition" + + ```python + from typing import Optional + import torch + from vllm.config import VllmConfig + from vllm.sampling_params import SamplingParams + from vllm.v1.sample.logits_processor import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) + + class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and (target_token := + params.extra_args.get("target_token")): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. 
+ for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor([self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + ``` + +Pass your custom logits processor to the `LLM` constructor in the form of (1) a class object or (2) a fully-qualified class name (FQCN), as shown in the example below (which assumes that `DummyLogitsProcessor` is defined in `vllm.test_utils`): + +``` +# Pass in class object +llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], +) + +# Pass in FQCN +llm = LLM( + model="facebook/opt-125m", + logits_processors=["vllm.test_utils:DummyLogitsProcessor"], +) +``` + +## Online scenario: pass the logits processor FQCN via CLI with `--logits-processors` + +??? 
console "Launch vLLM OpenAI API-compatible server with custom logits processor" + + ```bash + $ vllm serve facebook/opt-125m --logits-processors vllm.test_utils:DummyLogitsProcessor + ``` \ No newline at end of file From 47b63293298bf85e7e6c974c701fb11347e8023b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 19 Aug 2025 09:52:52 -0400 Subject: [PATCH 02/31] wip Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index d782439b1d8a..ac0e8e98d263 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -13,7 +13,7 @@ Subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and override the foll * `apply(self, logits: torch.Tensor)` * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. -The contrived example below implements a +The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. ??? 
code "Example custom logits processor definition" From 75d63556981f8aa2b9260eb35e527ee3479a2123 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 19 Aug 2025 10:29:55 -0400 Subject: [PATCH 03/31] custom args Signed-off-by: Andrew Feldman --- docs/features/custom_arguments.md | 40 +++++++++++++++++++++++++++++ docs/features/custom_logitsprocs.md | 9 ++++--- 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 docs/features/custom_arguments.md diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md new file mode 100644 index 000000000000..e837d97902d6 --- /dev/null +++ b/docs/features/custom_arguments.md @@ -0,0 +1,40 @@ +# vLLM Custom Arguments + +You can use vLLM *custom arguments* to enable [custom logits processors](./custom_logitsprocs.md) and vLLM plugins to receive request arguments which are not hard-coded into vLLM's interface. + +Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`: + +``` python +SamplingParams(..., + extra_args={"your_custom_arg_name": 67}) +``` + +This allows arguments which are not already part of `SamplingParams` to be passed into vLLM. + +The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`; under the hood `vllm_xargs` is transferred directly into `SamplingParams.extra_args`. 
The example below integrates custom arguments into a vLLM REST API request: + +``` bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0, + "vllm_xargs": {"your_custom_arg": 67} + }' +``` + +Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argument: + +``` python +batch = await client.completions.create( + model=model_name, + prompt=prompt, + extra_body={ + "vllm_xargs": { + "your_custom_arg": 67 + } + } +) +``` diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index ac0e8e98d263..978f7dd5a13f 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -5,13 +5,14 @@ This document shows you how to augment vLLM with custom logits processors. ## Build a custom logits processor and pass it to the offline `LLM` engine Subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and override the following methods + * `__init__(self, vllm_config: vllm.config.VllmConfig, device: torch.device, is_pin_memory: bool)` * `vllm_config`: vLLM engine configuration * `is_argmax_invariant(self)` * `update_state(self, batch_update: Optional[vllm.v1.sample.logits_processor.BatchUpdate])` * `batch_update`: representation of added/removed/moved requests in the vLLM persistent batch during the most recent engine step * `apply(self, logits: torch.Tensor)` - * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. + * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. 
@@ -78,7 +79,7 @@ The contrived example below implements a custom logits processor which masks out Pass your custom logits processor to the `LLM` constructor in the form of (1) a class object or (2) a fully-qualified class name (FQCN), as shown in the example below (which assumes that `DummyLogitsProcessor` is defined in `vllm.test_utils`): -``` +``` python # Pass in class object llm = LLM( model="facebook/opt-125m", @@ -92,10 +93,12 @@ llm = LLM( ) ``` +### Custom + ## Online scenario: pass the logits processor FQCN via CLI with `--logits-processors` ??? console "Launch vLLM OpenAI API-compatible server with custom logits processor" ```bash $ vllm serve facebook/opt-125m --logits-processors vllm.test_utils:DummyLogitsProcessor - ``` \ No newline at end of file + ``` From 505ac7d6cd09feb62f7dc062dbae868c50596412 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 19 Aug 2025 10:56:21 -0400 Subject: [PATCH 04/31] wip Signed-off-by: Andrew Feldman --- docs/features/custom_arguments.md | 6 +- docs/features/custom_logitsprocs.md | 96 ++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 33 deletions(-) diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md index e837d97902d6..7a0c65b21fa5 100644 --- a/docs/features/custom_arguments.md +++ b/docs/features/custom_arguments.md @@ -18,9 +18,7 @@ curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "Qwen/Qwen2.5-1.5B-Instruct", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0, + ... 
"vllm_xargs": {"your_custom_arg": 67} }' ``` @@ -30,7 +28,7 @@ Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argum ``` python batch = await client.completions.create( model=model_name, - prompt=prompt, + ..., extra_body={ "vllm_xargs": { "your_custom_arg": 67 diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 978f7dd5a13f..745071a54a87 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -2,19 +2,11 @@ This document shows you how to augment vLLM with custom logits processors. -## Build a custom logits processor and pass it to the offline `LLM` engine +## Build a Custom Logits Processor and Pass It to the vLLM Engine -Subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and override the following methods +Subclass `vllm.v1.sample.logits_processor.LogitsProcessor` in order to implement a custom logits processor. -* `__init__(self, vllm_config: vllm.config.VllmConfig, device: torch.device, is_pin_memory: bool)` - * `vllm_config`: vLLM engine configuration -* `is_argmax_invariant(self)` -* `update_state(self, batch_update: Optional[vllm.v1.sample.logits_processor.BatchUpdate])` - * `batch_update`: representation of added/removed/moved requests in the vLLM persistent batch during the most recent engine step -* `apply(self, logits: torch.Tensor)` - * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. - -The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. +The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. ??? 
code "Example custom logits processor definition" @@ -77,28 +69,74 @@ The contrived example below implements a custom logits processor which masks out values_to_keep = logits[rows, cols].clone() ``` -Pass your custom logits processor to the `LLM` constructor in the form of (1) a class object or (2) a fully-qualified class name (FQCN), as shown in the example below (which assumes that `DummyLogitsProcessor` is defined in `vllm.test_utils`): +Pass your custom logits processor to the `LLM` constructor in the form of (1) a class object or (2) a fully-qualified class name (FQCN), as shown in the example below (which assumes that `DummyLogitsProcessor` is defined in `your.module.path`): -``` python -# Pass in class object -llm = LLM( - model="facebook/opt-125m", - logits_processors=[DummyLogitsProcessor], -) +??? code "Passing custom logits processor to `LLM` in Python" -# Pass in FQCN -llm = LLM( - model="facebook/opt-125m", - logits_processors=["vllm.test_utils:DummyLogitsProcessor"], -) -``` + ``` python + # Pass in class object + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) -### Custom - -## Online scenario: pass the logits processor FQCN via CLI with `--logits-processors` + # Pass in FQCN + llm = LLM( + model="facebook/opt-125m", + logits_processors=["your.module.path:DummyLogitsProcessor"], + ) + ``` -??? console "Launch vLLM OpenAI API-compatible server with custom logits processor" +??? code "Passing custom logits processor to vLLM server via CLI" ```bash - $ vllm serve facebook/opt-125m --logits-processors vllm.test_utils:DummyLogitsProcessor + vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor + ``` + +## Configure The Custom Logits Processor for a Request + +To enable the logits processor for a request, pass `target_token` in with the request as a vLLM [custom argument](./custom_arguments.md): + +??? 
code "Python: configure custom logits processor for a request" + + ``` python + outputs_logitproc = llm.generate("your prompt", + SamplingParams(..., + extra_args={"target_token": 67})) ``` + +??? code "vLLM REST API: configure custom logits processor for a request" + + ``` bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + ... + "vllm_xargs": {"target_token": 67} + }' + ``` + +??? code "OpenAI SDK: configure custom logits processor for a request" + + ``` python + batch = await client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + ..., + extra_body={ + "vllm_xargs": { + "target_token": 67 + } + } + ) + ``` + +## Logits Processor Programming Model + +* `__init__(self, vllm_config: vllm.config.VllmConfig, device: torch.device, is_pin_memory: bool)` + * `vllm_config`: vLLM engine configuration +* `is_argmax_invariant(self)` +* `update_state(self, batch_update: Optional[vllm.v1.sample.logits_processor.BatchUpdate])` + * `batch_update`: representation of added/removed/moved requests in the vLLM persistent batch during the most recent engine step +* `apply(self, logits: torch.Tensor)` + * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. From 9892dda6904b36b6474787ce70d13489c0e6e11c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 19 Aug 2025 16:57:58 -0400 Subject: [PATCH 05/31] wip Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 108 +++++++++++++++++++++------- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 745071a54a87..b3ed395c91cf 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -2,11 +2,85 @@ This document shows you how to augment vLLM with custom logits processors. 
-## Build a Custom Logits Processor and Pass It to the vLLM Engine +## Ways to Pass Your Custom Logits Processor to vLLM -Subclass `vllm.v1.sample.logits_processor.LogitsProcessor` in order to implement a custom logits processor. +### 1. Offline-only: pass a Python class object to the vLLM constructor -The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. +You can pass one or more custom logits processor class objects to the `LLM` constructor. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` is instantiated, or (2) imported from a Python package. + +??? code "Passing custom logits processor class object to `LLM` in Python" + + ``` python + # Import custom logits processor + from some.module import DummyLogitsProcessor + + # ...or... + + # Define custom logits processor locally + from vllm.v1.sample.logits_processor import LogitsProcessor + + class DummyLogitsProcessor(LogitsProcessor): + # See DummyLogitsProcessor implementation above + ... + + # Pass class object to LLM constructor + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + ``` + +### 2. Pass the custom logits processor fully-qualified class name (FQCN) to vLLM at initialization time + +This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` Python constructor, or as a CLI argument to `vllm serve` with the following syntax + +``` bash +vllm serve ... --logits_processors ... +``` + +The only requirements on the FQCN are +1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module +2. 
The class-name portion of the FQCN must be importable from the loaded module +3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor` + +See examples below: + +??? code "Passing custom logits processor FQCN to `LLM` in Python" + + ``` python + # Pass in FQCN + llm = LLM( + model="facebook/opt-125m", + logits_processors=["your.module.path:DummyLogitsProcessor"], + ) + ``` + +??? code "Passing custom logits processor FQCN to vLLM server via CLI" + + ```bash + vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor + ``` + +### 3. Automatically detect installed custom logits processors in your Python environment via Python entry points + +During initialization, vLLM automatically scans the `vllm.logits_processors` [entry point](https://setuptools.pypa.io/latest/userguide/entry_point.html) group and loads any installed logits processors which it finds. + +Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your Python package; see example below: + +??? code "Exposing a custom logits processor as a Python entrypoint" + + ``` toml + [project.entry-points."vllm.logits_processors"] + dummy_logits_processor = "your.module.path:DummyLogitsProcessor" + ``` + +Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to `logits_processors` at initialization time. + +**Note:** vLLM will *always* load *all* logits processors which are exposed via entrypoints under `vllm.logits_processors`. + +## Writing a vLLM Custom Logits Processor + +Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. 
The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. ??? code "Example custom logits processor definition" @@ -69,33 +143,17 @@ The contrived example below implements a custom logits processor which masks out values_to_keep = logits[rows, cols].clone() ``` -Pass your custom logits processor to the `LLM` constructor in the form of (1) a class object or (2) a fully-qualified class name (FQCN), as shown in the example below (which assumes that `DummyLogitsProcessor` is defined in `your.module.path`): +## Defining How the Custom Logits Processor Can Be Used -??? code "Passing custom logits processor to `LLM` in Python" +Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. It is up to the logits processor author to determine: - ``` python - # Pass in class object - llm = LLM( - model="facebook/opt-125m", - logits_processors=[DummyLogitsProcessor], - ) - - # Pass in FQCN - llm = LLM( - model="facebook/opt-125m", - logits_processors=["your.module.path:DummyLogitsProcessor"], - ) - ``` - -??? code "Passing custom logits processor to vLLM server via CLI" +1. **The per-request attributes which configure the logits processor's behavior against that request.** vLLM supports [custom arguments](./custom_arguments.md): the user may pass in a `dict` of custom request arguments, which will be accessible to all logits processors via `SamplingParams.extra_args`. If your logits processor requires arguments not already supported by `SamplingParams` and the vLLM REST API, we recommended designing your custom logits processor to look for these arguments as keys in the `SamplingParams.extra_args` dict. 
In the `DummyLogitsProcessor` example above, the logits processor looks for `target_token` as a custom argument. - ```bash - vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor - ``` +2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, we recommend writing your logits processor in such a way that it is possible to disable the logits processor for a given request, e.g. through the absence of a particular custom argument or by passing in a specific argument value. In the `DummyLogitsProcessor` example above, the absence of `target_token` disables the logits processor for a given request. -## Configure The Custom Logits Processor for a Request +3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings, e.g. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `update_state()` and `apply()` to exit early if all requests have the logits processor disabled. -To enable the logits processor for a request, pass `target_token` in with the request as a vLLM [custom argument](./custom_arguments.md): +The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior. ??? 
code "Python: configure custom logits processor for a request" From ebfb31feed793ff1b9d11349c8fb077e2772b3dd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 20 Aug 2025 18:24:20 -0400 Subject: [PATCH 06/31] design wip Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 146 +++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 docs/design/logits_processors.md diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md new file mode 100644 index 000000000000..051079ee5a7a --- /dev/null +++ b/docs/design/logits_processors.md @@ -0,0 +1,146 @@ +# Logits Processors Programming Model + +This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors. + +## Logits Processors Background + +A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. + +In vLLM, logits processors operate at batch granularity: during a given engine step, the logits processor consumes a $(num_requests) \times (vocab_size)$ tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. + +## Logits Processors in the vLLM engine + +The vLLM engine's persistent batch data structure maintains a list of loaded logits processors. This list is passed to `SamplingMetadata` when the data structure is built. + +In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch, such as whether each requests enables the logits processor as well as each request's configuration settings. Therefore, logits processors are stateful. 
+ +In each engine step, the vLLM engine will: + +1. **Update each logits processor's internal state to match persistent batch internal state, by invoking each logits processor's `update_state()` method.** this is necessary to ensure that logits transformations are applied to the correct requests with the correct configuration settings; to ensure that the logits processors discard information about finished requests; and to allow certain logits processors to count decoding steps (example: limiting the max number of generated tokens requires counting the number of generated tokens for each request which uses the logits processor.) The pseudocode below shows how the vLLM model runner computes updates to the persistent batch state and then notifies each logits processor of the state changes: + +??? code "Model Runner Updates Logits Processor States" + + ``` python + # gpu_model_runner.py + + class GPUModelRunner(...): + + ... + + def execute_model(self, scheduler_output, ...): + self._update_states(scheduler_output) + + ... + + def _update_states(...): + + ... + + # Update persistent batch to reflect new/finished requests & reordering + # of requests within batch + + ... + + self.input_batch.refresh_metadata() + + + # gpu_input_batch.py + + class InputBatch: + + ... + + def refresh_metadata(self): + + ... + + # Update each logits processor's state to reflect persistent batch state + batch_update = self.batch_update_builder.get_and_reset(self.num_reqs) + for logit_proc in self.logitsprocs.all: + logit_proc.update_state(batch_update) + + ... + + + # logits_processor/interface.py + + @dataclass(frozen=True) + class BatchUpdate: + # Batch state-change data structure which is passed to logits processor + # update_state() method + + batch_size: int + + removed: Sequence[RemovedRequest] + moved: Sequence[MovedRequest] + added: Sequence[AddedRequest] + ``` + +2. 
**Apply the logits processors to the model output logits tensor, by invoking each logits processor's `apply()` method.** The pseudocode below shows how the vLLM model runner invokes the sampler, which in turn causes the logits processors to transform the model output logits. + +??? code "Apply logits processors to model output logits" + + ``` python + # gpu_model_runner.py + + class GPUModelRunner(...): + + ... + + def execute_model(self, scheduler_output, ...): + + ... + + sampler_output = self.sampler(logits=logits, + sampling_metadata=sampling_metadata) + + ... + + + # sampler.py + + class Sampler(nn.Module): + + ... + + def forward(self, logits, sampling_metadata): + + ... + + # Apply non-argmax-invariant logits processors to model output logits + for processor in (sampling_metadata.logitsprocs.non_argmax_invariant): + logits = processor.apply(logits) + + sampled = self.sample(logits, sampling_metadata) + + ... + + # Return sampler output data structure + + + def sample(self, logits, sampling_metadata): + + ... + + # Exit early if all requests are greedy-sampling + + ... + + # Apply argmax-invariant logits processors + for processor in sampling_metadata.logitsprocs.argmax_invariant: + logits = processor.apply(logits) + + ... + + # Perform sampling and return sampling result + ``` + +At sampling time, the engine saves compute by skipping "argmax-invariant" logits processors in the edge-case where all requests employ greedy sampling. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request). + +* An **argmax-invariant logits processor** is a logits processor (such as Min-P) which does not modify the argmax. For example, a logits processor which masks out the lowest-probability tokens will not change which token ID has the max logit. 
Greedy sampling always picks the highest-logit-value token ID, and so conceptually an argmax-invariant logits processor can be skipped for greedy sampling requests. + +* A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests. + +The vLLM logits processor abstraction requires the engine to pass in state updates at batch granularity; therefore in practice state updates for argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. + +### Updating logits processor state to match persistent batch state \ No newline at end of file From 29ff326c30f83f8a322f157745f37c938a7ba000 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 20 Aug 2025 20:28:25 -0400 Subject: [PATCH 07/31] more design Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 274 ++++++++++++++++++++++++++++++- 1 file changed, 269 insertions(+), 5 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 051079ee5a7a..b55241ee0c61 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -1,4 +1,4 @@ -# Logits Processors Programming Model +# Logits Processor Support in vLLM This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors. @@ -6,17 +6,17 @@ This document describes how the vLLM engine interacts with logits processors, an A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. 
-In vLLM, logits processors operate at batch granularity: during a given engine step, the logits processor consumes a $(num_requests) \times (vocab_size)$ tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. +In vLLM, logits processors operate at batch granularity: during a given engine step, the logits processor consumes a $(num\_requests) \times (vocab\_size)$ tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. ## Logits Processors in the vLLM engine The vLLM engine's persistent batch data structure maintains a list of loaded logits processors. This list is passed to `SamplingMetadata` when the data structure is built. -In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch, such as whether each requests enables the logits processor as well as each request's configuration settings. Therefore, logits processors are stateful. +In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch (i.e. each request's logits-processor-specific configuration settings). Therefore, logits processors are stateful. In each engine step, the vLLM engine will: -1. 
**Update each logits processor's internal state to match persistent batch internal state, by invoking each logits processor's `update_state()` method.** this is necessary to ensure that logits transformations are applied to the correct requests with the correct configuration settings; to ensure that the logits processors discard information about finished requests; and to allow certain logits processors to count decoding steps (example: limiting the max number of generated tokens requires counting the number of generated tokens for each request which uses the logits processor.) The pseudocode below shows how the vLLM model runner computes updates to the persistent batch state and then notifies each logits processor of the state changes: +1. **Update each logits processor's internal state to match persistent batch internal state, by invoking each logits processor's `update_state()` method.** This is necessary to ensure that logits processors' internal state is reorganized to match the new persistent batch state at the end of the current step. The pseudocode below shows how the vLLM model runner computes updates to the persistent batch state and then notifies each logits processor of the state changes: ??? code "Model Runner Updates Logits Processor States" @@ -76,6 +76,8 @@ In each engine step, the vLLM engine will: added: Sequence[AddedRequest] ``` + Note that `InputBatch.refresh_metadata()` generates a `BatchUpdate` data structure - representing the persistent batch state changes resulting from new, finished and reordered requests - and passes that data structure to the logits processors' `update_state()` methods. + 2. **Apply the logits processors to the model output logits tensor, by invoking each logits processor's `apply()` method.** The pseudocode below shows how the vLLM model runner invokes the sampler, which in turn causes the logits processors to transform the model output logits. ??? 
code "Apply logits processors to model output logits" @@ -143,4 +145,266 @@ At sampling time, the engine saves compute by skipping "argmax-invariant" logits The vLLM logits processor abstraction requires the engine to pass in state updates at batch granularity; therefore in practice state updates for argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. -### Updating logits processor state to match persistent batch state \ No newline at end of file +## Logits Processor Programming Model + +The previous sections alluded to the interfaces which vLLM logits processors must support. This section introduces in full the programming model for implementing logits processors that are compatible with the vLLM engine, including the `LogitsProcessor` base class and its interface methods as well as the `BatchUpdate` data structure for representing persistent batch state changes, both of which are shown in the code below: + +??? code "`LogitsProcessor` base class and `BatchUpdate` data structure" + + ``` python + from abc import ABC, abstractmethod + from collections.abc import Sequence + from dataclasses import dataclass + from enum import Enum, auto + from typing import TYPE_CHECKING, Optional + + import torch + + from vllm import SamplingParams + + if TYPE_CHECKING: + from vllm.config import VllmConfig + + + class MoveDirectionality(Enum): + # One-way i1->i2 req move within batch + UNIDIRECTIONAL = auto() + # Two-way i1<->i2 req swap within batch + SWAP = auto() + + + # (index, params, prompt_tok_ids, output_tok_ids) tuples for new + # requests added to the batch. + AddedRequest = tuple[int, SamplingParams, list[int], list[int]] + + # (index 1, index 2, directionality) tuples representing + # one-way moves or two-way swaps of requests in batch + MovedRequest = tuple[int, int, MoveDirectionality] + + # Batch indices of any removed requests. 
+ RemovedRequest = int + + + @dataclass(frozen=True) + class BatchUpdate: + """Persistent batch state change info for logitsprocs""" + batch_size: int # Current num reqs in batch + + # Metadata for requests added to, removed from, and moved + # within the persistent batch. + # + # Key assumption: the `output_tok_ids` list (which is an element of each + # tuple in `added`) is a reference to the request's running output tokens + # list; via this reference, the logits processors always see the latest + # list of generated output tokens + removed: Sequence[RemovedRequest] + moved: Sequence[MovedRequest] + added: Sequence[AddedRequest] + + + class LogitsProcessor(ABC): + + @abstractmethod + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool) -> None: + raise NotImplementedError + + @abstractmethod + def apply(self, logits: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def is_argmax_invariant(self) -> bool: + """True if logits processor has no impact on the + argmax computation in greedy sampling. + NOTE: may or may not have the same value for all + instances of a given LogitsProcessor subclass, + depending on subclass implementation. + """ + raise NotImplementedError + + @abstractmethod + def update_state( + self, + batch_update: Optional["BatchUpdate"], + ) -> None: + """Called when there are new output tokens, prior + to each forward pass. + + Args: + batch_update is non-None iff there have been + changes to the batch makeup. 
+ """ + raise NotImplementedError + + ``` + +A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods: + +* `__init__()` + +* `apply(self, logits: torch.Tensor) -> torch.Tensor`: + * Consume a $(num\_requests) \times (vocab\_size)$ logits tensor (`logits`) + * Apply logits processor transformation at batch granularity + * Return a transformed $(num\_requests) \times (vocab\_size)$ logits tensor + +* `is_argmax_invariant(self) -> bool`: + * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax + +* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: + * Consume a `BatchUpdate` data structure representing persistent batch state changes at the end of the current engine step + * Batch update data structure may be `None`, signaling no state-change + +### `BatchUpdate` data structure + +The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (summarized below along with a schematic representation of how the batch is modified by the operation): + +* **Add:** add (or replace existing request with) a new request at index $i$ + + * An Add is represented in `BatchUpdate.added` as a tuple of + + ``` + (index, new request SamplingParams, prompt token ids, output token ids) + ``` + + * `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference + + * The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. 
For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields + + * If index $i$ currently holds a request, a replacement occurs: + + ``` + Batch: [A,B,C] + New request to be added @ i: D @ 1 + + => + + New Batch: [A,D,C] # Add D, discard B + ``` + + * If index $i$ does not currently hold a request (because $i$ is out of bounds of the current batch size): + + ``` + Batch: [A,B,C] + New request to be added @ i: D @ 3 + + => + + New Batch: [A,B,C,D] # Add D, extending batch + ``` + +* **Remove:** remove (without replacement) request at index $i$ + + * A Remove is represented in `BatchUpdate.removed` by an `int` (representing $i$) + + * Effect of remove-at-index on batch: + + ``` + Batch: [A,B,C] + Remove @ i: 1 + + => + + New Batch: [A,x,C] # Discard B and leave an empty slot + ``` + +* **Move:** move request at index $s$ to index $d$ OR swap requests at indices $s$ and $d$ + + * A Move is represented in `BatchUpdate.moved` as a tuple of + + ``` + (s, d, UNIDIRECTIONAL or SWAP) + ``` + + * If the Move specifies `UNIDIRECTIONAL`: + + * The request at index `s` is moved to index `d`; index `s` becomes an empty slot + + ``` + Batch: [A,x,C,D] + Unidirectionally Move s -> d: 3 -> 1 + + => + + New Batch: [A,D,C,x] # Move D to 1, leaving empty slot at 3 + ``` + + * If another request already resided at index `d`, it is replaced and discarded + + ``` + Batch: [A,B,C,D] + Unidirectionally Move s -> d: 3 -> 1 + + => + + New Batch: [A,D,C,x] # Move D to 1, discarding B and leaving empty slot at 3 + ``` + + * If the Move specifies `SWAP`, the requests at `s` and `d` exchange indices + + ``` + Batch: [A,B,C,D] + Swap Move s <-> d: 3 <-> 1 + + => + + New Batch: [A,D,C,B] # Swap B and D + ``` + +Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the end of the engine step. 
+ +### How the vLLM engine builds the `BatchUpdate` data structure + +Logits processor `update_state()` implementations should assume the following model for how model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction): + +1. Identify indices of requests which finished in the current engine step + +2. Identify new requests introduced in the current step + +3. Use Add operations to replace as many finished requests with new requests, in order of increasing index of the replaced request starting with the lowest index + +4. Based on the relative number of new and finished requests: + + 1. If the numbers of new and finished requests are the same, proceed to next step + + 2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1` + + 3. *If there are fewer new requests than finished requests:* + + * Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state + + * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous + + * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. 
Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots + +5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch + +Notes: + +* The index argument for Add and Remove operations refers to the index *at the time the Add or Remove occurred*, i.e. before any Move operations + * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3 + * In other words Move operations can be assumed to be applied after Adds and Removes + +* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved` + +* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None` + +## Best practices for writing logits processors + +* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity + * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` + * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor + +* It is up to the logits processor author to determine: + + 1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` + + 2. 
**The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. $0.0$. Try to save compute and memory for requests which disable the logits processor + + 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor + + * Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None` + +* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove) + +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. 
if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant) From 687ec93459f40faea7e8d437f364de81d6a51d6a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 20 Aug 2025 21:53:03 -0400 Subject: [PATCH 08/31] examples Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 135 ++++++++++++++++++- docs/features/custom_arguments.md | 22 +++- docs/features/custom_logitsprocs.md | 196 ++++++++++++++-------------- 3 files changed, 246 insertions(+), 107 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index b55241ee0c61..d9f5981ffabc 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -76,7 +76,8 @@ In each engine step, the vLLM engine will: added: Sequence[AddedRequest] ``` - Note that `InputBatch.refresh_metadata()` generates a `BatchUpdate` data structure - representing the persistent batch state changes resulting from new, finished and reordered requests - and passes that data structure to the logits processors' `update_state()` methods. + !!! note + `InputBatch.refresh_metadata()` generates a `BatchUpdate` data structure - representing the persistent batch state changes resulting from new, finished and reordered requests - and passes that data structure to the logits processors' `update_state()` methods. 2. **Apply the logits processors to the model output logits tensor, by invoking each logits processor's `apply()` method.** The pseudocode below shows how the vLLM model runner invokes the sampler, which in turn causes the logits processors to transform the model output logits. 
@@ -250,6 +251,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * `is_argmax_invariant(self) -> bool`: * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax + * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling * `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: * Consume a `BatchUpdate` data structure representing persistent batch state changes at the end of the current engine step @@ -355,7 +357,7 @@ Additionally, the `BatchUpdate` data structure includes a representation (`batch ### How the vLLM engine builds the `BatchUpdate` data structure -Logits processor `update_state()` implementations should assume the following model for how model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction): +Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction): 1. Identify indices of requests which finished in the current engine step @@ -389,7 +391,96 @@ Notes: * If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None` -## Best practices for writing logits processors +#### Example: Batch Update with Fewer New Requests Than Finished Requests + +The following example models an engine step where 1 new request is introduced and 2 finished requests are eliminated, additionally the attention backend performs a swap to optimize the batch ordering. + +``` +Batch state (beginning of engine step): [A,B,C,D] +Batch size: 4 + +New requests: E + +Finished requests: A, C + +Processing steps (using BatchUpdate abstraction): + +1. 
Add E at index 0 + +[E,B,C,D] # Discard A +Batch size: 4 + +2. Remove at index 2 + +[E,B,x,D] # Discard C, empty slot at index 2 +Batch size: 4 + +3. Condense batch with a Unidirectional Move 3 -> 2 operation and shrink batch + +[E,B,D] x # Empty slot is now outside batch +Batch size: 3 + +4. Attention backend optimization: reorder batch with Swap 0 <-> 1 + +[B,E,D] +Batch size: 3 + +``` + +The resulting `BatchUpdate` data structure will look like + +``` +BatchUpdate instance +* added: [(0,E's SamplingParams,E's prompt tokens ref,E's output tokens ref)] +* removed: [2] # request C was removed without replacement +* moved: [(3,2,UNIDIRECTIONAL),(0,1,SWAP)] +``` + +#### Example: Batch Update with More New Requests Than Finished Requests + +The following example models an engine step where 2 new requests are introduced and 1 finished request is eliminated; additionally, the attention backend performs a swap to optimize the batch ordering. + +``` +Batch state (beginning of engine step): [A,B,C,D] +Batch size: 4 + +New requests: E,F + +Finished requests: C + +Processing steps (using BatchUpdate abstraction): + +1. Add E at index 2 + +[A,B,E,D] # Discard C +Batch size: 4 + +2. Add F at index 4 (current max batch index + 1) + +[A,B,E,D,F] # Extend batch by 1 +Batch size: 5 + +3. Attention backend optimization: reorder batch with Swap 0 <-> 1 + +[B,A,E,D,F] +Batch size: 5 + +``` + +Note that batch condensation is skipped because there are no empty slots left behind by Remove operations. 
+ +The resulting `BatchUpdate` data structure will look like + +``` +BatchUpdate instance +* added: [(2,E's SamplingParams,E's prompt tokens ref,E's output tokens ref),(4,F's SamplingParams,F's prompt tokens ref,F's output tokens ref)] +* removed: [] # no requests were removed without replacement +* moved: [(0,1,SWAP)] +``` + +## How to Introduce a New Logits Processor to vLLM + +### Best Practices for Writing Built-In Logits Processors * Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` @@ -397,7 +488,7 @@ Notes: * It is up to the logits processor author to determine: - 1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` + 1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. $0.0$. Try to save compute and memory for requests which disable the logits processor @@ -407,4 +498,38 @@ Notes: * Ensure that the logits processor `update_state` method discards information about finished requests (i.e. 
requests which are replaced by an Add or which are subject to a Remove) -* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant) +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method + +### Built-In Logits Processors + +Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently supports the following built-in logits processors based on the programming model described above: + +* Min-P + +* Logit bias + +* Min-tokens + +Review these logits processor implementations for guidance on writing built-in logits processors. + +Additionally, the following logits processors or logits-processor-like functionalities are hard-coded into the sampler for efficiency and do not utilize the programming model described above, but may be updated to use the aforementioned logits processor programming model in the future: + +* Allowed token IDs + +* Bad words + +* Repetition penalty + +* Frequency penalty + +* Presence penalty + +* Temperature + +* Top-K + +* Top-P + +### Custom Logits Processors + +vLLM can be augmented with [user-provided custom logits processors](../features/custom_logitsprocs.md). 
diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md index 7a0c65b21fa5..74ed40835b4d 100644 --- a/docs/features/custom_arguments.md +++ b/docs/features/custom_arguments.md @@ -1,17 +1,22 @@ -# vLLM Custom Arguments +# Custom Arguments -You can use vLLM *custom arguments* to enable [custom logits processors](./custom_logitsprocs.md) and vLLM plugins to receive request arguments which are not hard-coded into vLLM's interface. +You can use vLLM *custom arguments* to pass in arguments which are not part of the vLLM `SamplingParams` and REST API specifications. Adding or removing a vLLM custom argument does not require recompiling vLLM, since the custom arguments are passed in as a dictionary. + +Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code. + +## Offline Custom Arguments Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`: ``` python -SamplingParams(..., - extra_args={"your_custom_arg_name": 67}) +SamplingParams(extra_args={"your_custom_arg_name": 67}) ``` -This allows arguments which are not already part of `SamplingParams` to be passed into vLLM. +This allows arguments which are not already part of `SamplingParams` to be passed into `LLM` as part of a request. -The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`; under the hood `vllm_xargs` is transferred directly into `SamplingParams.extra_args`. The example below integrates custom arguments into a vLLM REST API request: +## Online Custom Arguments + +The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`. 
The example below integrates custom arguments into a vLLM REST API request: ``` bash curl http://localhost:8000/v1/completions \ @@ -27,7 +32,7 @@ Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argum ``` python batch = await client.completions.create( - model=model_name, + model="Qwen/Qwen2.5-1.5B-Instruct", ..., extra_body={ "vllm_xargs": { @@ -36,3 +41,6 @@ batch = await client.completions.create( } ) ``` + +!!! note + `vllm_xargs` is assigned to `SamplingParams.extra_args` under the hood, so code which uses `SamplingParams.extra_args` is compatible with both offline and online scenarios. diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index b3ed395c91cf..5d242dfb33ff 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -1,90 +1,20 @@ # Custom Logits Processors -This document shows you how to augment vLLM with custom logits processors. +A "custom" logits processor is written by a user of vLLM and is loaded into vLLM at initialization without needing to modify or recompile the vLLM source code. It is the opposite of a built-in logits processor. -## Ways to Pass Your Custom Logits Processor to vLLM +This document shows how to write, load and use a custom logits processor. -### 1. Offline-only: pass a Python class object to the vLLM constructor +Review the [logits processor design documentation](../design/logits_processors.md) for baseline guidance on writing correct and efficient logits processors. -You can pass one or more custom logits processor class objects to the `LLM` constructor. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` is instantiated, or (2) imported from a Python package. - -??? 
code "Passing custom logits processor class object to `LLM` in Python" - - ``` python - # Import custom logits processor - from some.module import DummyLogitsProcessor +## Writing a Custom Logits Processor - # ...or... +Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) - # Define custom logits processor locally - from vllm.v1.sample.logits_processor import LogitsProcessor - - class DummyLogitsProcessor(LogitsProcessor): - # See DummyLogitsProcessor implementation above - ... - - # Pass class object to LLM constructor - llm = LLM( - model="facebook/opt-125m", - logits_processors=[DummyLogitsProcessor], - ) - ``` - -### 2. Pass the custom logits processor fully-qualified class name (FQCN) to vLLM at initialization time - -This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` Python constructor, or as a CLI argument to `vllm serve` with the following syntax - -``` bash -vllm serve ... --logits_processors ... -``` - -The only requirements on the FQCN are -1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module -2. The class-name portion of the FQCN must be possible to import from the loaded module -3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor` - -See examples below: - -??? 
code "Passing custom logits processor FQCN to `LLM` in Python" - - ``` python - # Pass in FQCN - llm = LLM( - model="facebook/opt-125m", - logits_processors=["your.module.path:DummyLogitsProcessor"], - ) - ``` - -??? code "Passing custom logits processor FQCN to vLLM server via CLI" - - ```bash - vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor - ``` - -### 3. Automatically detect installed custom logits processors in your Python environment via Python entry points - -During initialization, vLLM automatically scans the `vllm.logits_processors` [entry point](https://setuptools.pypa.io/latest/userguide/entry_point.html) group and loads any installed logits processors which it finds. - -Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your Python package; see example below: - -??? code "Exposing a custom logits processor as a Python entrypoint" - - ``` toml - [project.entry-points."vllm.logits_processors"] - dummy_logits_processor = "your.module.path:DummyLogitsProcessor" - ``` - -Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to `logits_processors` at initialization time. - -**Note:** vLLM will *always* load *all* logits processors which are exposed via entrypoints under `vllm.logits_processors`. - -## Writing a vLLM Custom Logits Procesor - -Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. The contrived example below implements a custom logits processor which masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. +In vLLM logits processors operate at batch granularity. 
The contrived example below implements a custom logits processor which consumes a $(num\_requests) \times (vocab\_size)$ logits tensor and masks out all tokens except for one (`target_token`) with `float("-inf")`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request: ??? code "Example custom logits processor definition" - ```python + ``` python from typing import Optional import torch from vllm.config import VllmConfig @@ -143,26 +73,104 @@ Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor. values_to_keep = logits[rows, cols].clone() ``` -## Defining How the Custom Logits Processor Can Be Used +Throughout this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor. -Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. It is up to the logits processor author to determine: +Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. -1. **The per-request attributes which configure the logits processor's behavior against that request.** vLLM supports [custom arguments](./custom_arguments.md): the user may pass in a `dict` of custom request arguments, which will be accessible to all logits processors via `SamplingParams.extra_args`. 
If your logits processor requires arguments not already supported by `SamplingParams` and the vLLM REST API, we recommended designing your custom logits processor to look for these arguments as keys in the `SamplingParams.extra_args` dict. In the `DummyLogitsProcessor` example above, the logits processor looks for `target_tokens` as a custom argument. +The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch. -2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, we recommended writing your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. through the absence of a particular custom argument or by passing in a specific argument value. In the `DummyLogitsProcessor` example above, the absence of `target_token` disables the logits processor for a given request. +## Ways to Load Your Custom Logits Processor in vLLM -3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. 
To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `update_state()` and `apply()` to exit early if all requests have the logits processor disabled. +Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests. -The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior. +This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor. + +### Method 1: Pass the Custom Logits Processor Fully-Qualified Class Name (FQCN) to vLLM at Initialization Time + +This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` Python constructor, or as a CLI argument to `vllm serve` with the following syntax + +``` bash +vllm serve ... --logits_processors ... +``` + +The only requirements on the FQCN are + +1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module -??? code "Python: configure custom logits processor for a request" +2. The class-name portion of the FQCN must be importable from the loaded module + +3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor` + +See examples below: + +???
code "Passing custom logits processor FQCN to `LLM` in Python" ``` python - outputs_logitproc = llm.generate("your prompt", - SamplingParams(..., - extra_args={"target_token": 67})) + # Pass in FQCN + llm = LLM( + model="facebook/opt-125m", + logits_processors=["your.module.path:DummyLogitsProcessor"], + ) + ``` + +??? code "Passing custom logits processor FQCN to vLLM server via CLI" + + ```bash + vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor + ``` + +### Method 2: Automatically Detect Custom Logits Processors Installed in Your Python Environment As Entry Points + +[`setuptools`](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) can enable installed packages to make themselves available as plugins to other Python programs, via pieces of metadata known as "entry points". + +During initialization, vLLM automatically scans the `vllm.logits_processors` entry point group and loads any installed logits processors which it finds. + +Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your logits processor Python package. The example below shows how to add an entrypoint to your project's `.toml` file: + +??? code "Exposing a custom logits processor as a Python entrypoint" + + ``` toml + [project.entry-points."vllm.logits_processors"] + dummy_logits_processor = "your.module.path:DummyLogitsProcessor" ``` +Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` constructor or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point. + +!!! note + vLLM will *always* load *all* logits processors which are exposed via entrypoints under the `vllm.logits_processors` grouping. 
+ +### Method 3 (Offline-only): Pass a Python Class Object to the vLLM Constructor + +You can pass one or more custom logits processor class objects to the `LLM` constructor. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` is instantiated, or (2) imported from a Python package. + +??? code "Passing custom logits processor class object to `LLM` in Python" + + ``` python + # Import custom logits processor + from some.module import DummyLogitsProcessor + + # ...or... + + # Define custom logits processor locally + from vllm.v1.sample.logits_processor import LogitsProcessor + + class DummyLogitsProcessor(LogitsProcessor): + # See DummyLogitsProcessor implementation above + ... + + # Pass class object to LLM constructor + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + ``` + +## Invoking a Custom Logits Processor Against a Request + +The design of the custom logits processor determines whether the logits processor must be enabled/disabled for a given request, and what arguments must be provided to configure the logits processor. For more information, review [the logits processors design documentation](../design/logits_processors.md). + +The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior. + ??? code "vLLM REST API: configure custom logits processor for a request" ``` bash @@ -189,12 +197,10 @@ The examples below show how a user would pass a custom argument (`target_token`) ) ``` -## Logits Processor Programming Model +??? 
code "Offline: configure custom logits processor for a request" -* `__init__(self, vllm_config: vllm.config.VllmConfig, device: torch.device, is_pin_memory: bool)` - * `vllm_config`: vLLM engine configuration -* `is_argmax_invariant(self)` -* `update_state(self, batch_update: Optional[vllm.v1.sample.logits_processor.BatchUpdate])` - * `batch_update`: representation of added/removed/moved requests in the vLLM persistent batch during the most recent engine step -* `apply(self, logits: torch.Tensor)` - * `logits`: a $num\_reqs \times vocab\_size$ tensor representing the unprocessed token probability distribution for each request. + ``` python + outputs_logitproc = llm.generate("your prompt", + SamplingParams(..., + extra_args={"target_token": 67})) + ``` From 15f9ec79f7f44cd76ce7a458a5cff792c130898b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 20 Aug 2025 22:00:43 -0400 Subject: [PATCH 09/31] fixed type annotation Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 2 +- examples/offline_inference/logits_processor.py | 2 +- tests/v1/logits_processors/utils.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 5d242dfb33ff..2a764c1c9e46 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -28,7 +28,7 @@ In vLLM logits processors operate at batch granularity. 
The contrived example be def __init__(self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py index 7ef20efa7d28..09ebbd5c8287 100644 --- a/examples/offline_inference/logits_processor.py +++ b/examples/offline_inference/logits_processor.py @@ -53,7 +53,7 @@ class DummyLogitsProcessor(LogitsProcessor): def __init__( self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool ): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py index c0bfc1a18fec..4d0ad86caf09 100644 --- a/tests/v1/logits_processors/utils.py +++ b/tests/v1/logits_processors/utils.py @@ -8,7 +8,6 @@ import torch from vllm.config import VllmConfig -from vllm.sampling_params import SamplingParams from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, LogitsProcessor, MoveDirectionality) @@ -45,7 +44,7 @@ class DummyLogitsProcessor(LogitsProcessor): def __init__(self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" From 5f2d48b330fc4de032a9be4985d51846c847e369 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 20 Aug 2025 22:07:25 -0400 Subject: [PATCH 10/31] refactor Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 11 ++++++----- examples/offline_inference/logits_processor.py | 9 ++++----- tests/v1/logits_processors/utils.py | 7 ++++--- 3 files changed, 14 insertions(+), 13 
deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 2a764c1c9e46..7f3b8b95e7ea 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -65,11 +65,12 @@ In vLLM logits processors operate at batch granularity. The contrived example be return logits # Save target values before modification - rows_list = list(self.req_info.keys()) - cols = torch.tensor([self.req_info[i] for i in rows_list], - dtype=torch.long, - device=logits.device) - rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + cols = torch.tensor( + list(self.req_info.values()), dtype=torch.long, device=logits.device + ) + rows = torch.tensor( + list(self.req_info.keys()), dtype=torch.long, device=logits.device + ) values_to_keep = logits[rows, cols].clone() ``` diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py index 09ebbd5c8287..a08695795a73 100644 --- a/examples/offline_inference/logits_processor.py +++ b/examples/offline_inference/logits_processor.py @@ -91,13 +91,12 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor: return logits # Save target values before modification - rows_list = list(self.req_info.keys()) cols = torch.tensor( - [self.req_info[i] for i in rows_list], - dtype=torch.long, - device=logits.device, + list(self.req_info.values()), dtype=torch.long, device=logits.device + ) + rows = torch.tensor( + list(self.req_info.keys()), dtype=torch.long, device=logits.device ) - rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) values_to_keep = logits[rows, cols].clone() # Mask all but target tokens diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py index 4d0ad86caf09..c4d2abda5eaf 100644 --- a/tests/v1/logits_processors/utils.py +++ b/tests/v1/logits_processors/utils.py @@ -81,11 +81,12 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor: return logits # Save 
target values before modification - rows_list = list(self.req_info.keys()) - cols = torch.tensor([self.req_info[i] for i in rows_list], + cols = torch.tensor(list(self.req_info.values()), + dtype=torch.long, + device=logits.device) + rows = torch.tensor(list(self.req_info.keys()), dtype=torch.long, device=logits.device) - rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) values_to_keep = logits[rows, cols].clone() # Mask all but target tokens From 6ab228588942041e81c170be5cb855729e66866f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 20 Aug 2025 22:10:09 -0400 Subject: [PATCH 11/31] typo Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 7f3b8b95e7ea..23b75daf97e8 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -72,6 +72,12 @@ In vLLM logits processors operate at batch granularity. The contrived example be list(self.req_info.keys()), dtype=torch.long, device=logits.device ) values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float('-inf') + logits[rows, cols] = values_to_keep + + return logits ``` Throughout this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor. 
From 0121cde5b18f54999c22b4acc6e59aecd323c28b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 21 Aug 2025 10:40:14 -0400 Subject: [PATCH 12/31] fixes Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 61 ++++++++++++++++------------- docs/features/custom_logitsprocs.md | 4 +- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index d9f5981ffabc..ac7967ff104b 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -4,19 +4,21 @@ This document describes how the vLLM engine interacts with logits processors, an ## Logits Processors Background -A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. +A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. -In vLLM, logits processors operate at batch granularity: during a given engine step, the logits processor consumes a $(num\_requests) \times (vocab\_size)$ tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. +In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. ## Logits Processors in the vLLM engine -The vLLM engine's persistent batch data structure maintains a list of loaded logits processors. 
This list is passed to `SamplingMetadata` when the data structure is built. +The vLLM engine's persistent batch data structure maintains a list of loaded logits processors. In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch (i.e. each request's logits-processor-specific configuration settings). Therefore, logits processors are stateful. -In each engine step, the vLLM engine will: +In each engine step, the vLLM engine will (1) update each logits processor's internal state and (2) apply logits processors to the model output logits. -1. **Update each logits processor's internal state to match persistent batch internal state, by invoking each logits processor's `update_state()` method.** This is necessary to ensure that logits processors' internal state is reorganized to match the new persistent batch state at the end of the current step. The pseudocode below shows how the vLLM model runner computes updates to the persistent batch state and then notifies each logits processor of the state changes: +### Updating logits processor internal state + +The vLLM model runner invokes each logits processor's `update_state()` method at the end of each engine step. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the end of the current step. The pseudocode below shows that the vLLM model runner computes updates to the persistent batch state and then notifies each logits processor of the state changes: ??? code "Model Runner Updates Logits Processor States" @@ -74,12 +76,17 @@ In each engine step, the vLLM engine will: removed: Sequence[RemovedRequest] moved: Sequence[MovedRequest] added: Sequence[AddedRequest] + ``` !!! 
note `InputBatch.refresh_metadata()` generates a `BatchUpdate` data structure - representing the persistent batch state changes resulting from new, finished and reordered requests - and passes that data structure to the logits processors' `update_state()` methods. -2. **Apply the logits processors to the model output logits tensor, by invoking each logits processor's `apply()` method.** The pseudocode below shows how the vLLM model runner invokes the sampler, which in turn causes the logits processors to transform the model output logits. +### Applying logits processors to the model output logits + +The pseudocode below shows how the vLLM model runner invokes the sampler, which in turn invokes the logits processors' `apply()` methods against the model output logit processors. + +Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata`, the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`. ??? 
code "Apply logits processors to model output logits" @@ -245,9 +252,9 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * `__init__()` * `apply(self, logits: torch.Tensor) -> torch.Tensor`: - * Consume a $(num\_requests) \times (vocab\_size)$ logits tensor (`logits`) + * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`) * Apply logits processor transformation at batch granularity - * Return a transformed $(num\_requests) \times (vocab\_size)$ logits tensor + * Return a transformed `(num_requests) x (vocab_size)` logits tensor * `is_argmax_invariant(self) -> bool`: * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax @@ -261,11 +268,11 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (summarized below along with a schematic representation of how the batch is modified by the operation): -* **Add:** add (or replace existing request with) a new request at index $i$ +* **Add:** add (or replace existing request with) a new request at index `i` * An Add is represented in `Batchupdate.added` as a tuple of - ``` + ``` text (index, new request SamplingParams, prompt token ids, output token ids) ``` @@ -273,9 +280,9 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, * The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. 
For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields - * If index $i$ currently holds a request, a replacement occurs: + * If index `i` currently holds a request, a replacement occurs: - ``` + ``` text Batch: [A,B,C] New request to be added @ i: D @ 1 @@ -284,9 +291,9 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, New Batch: [A,D,C] # Add D, discard B ``` - * If index $i$ does not currently hold a request (because $i$ is out of bounds of the current batch size): + * If index `i` does not currently hold a request (because `i` is out of bounds of the current batch size): - ``` + ``` text Batch: [A,B,C] New request to be added @ i: D @ 3 @@ -295,13 +302,13 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, New Batch: [A,B,C,D] # Add D, extending batch ``` -* **Remove:** remove (without replacement) request at index $i$ +* **Remove:** remove (without replacement) request at index `i` - * A Remove is represented in `Batchupdate.removed` by an `int` (representing $i$) + * A Remove is represented in `Batchupdate.removed` by an `int` (representing `i`) * Effect of remove-at-index on batch: - ``` + ``` text Batch: [A,B,C] Remove @ i: 1 @@ -310,11 +317,11 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, New Batch: [A,x,C] # Discard B and leave an empty slot ``` -* **Move:** move request at index $s$ to index $d$ OR swap requests at indices $s$ and $d$ +* **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d` * A Move is represented in `Batchupdate.moved` as a tuple of - ``` + ``` text (s, d, UNIDIRECTIONAL or SWAP) ``` @@ -322,7 +329,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, * The request at index `s` is moved to index `d`; index `s` becomes an empty slot - ``` + ``` text Batch: 
[A,x,C,D] Unidirectionally Move s -> d: 3 -> 1 @@ -333,7 +340,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, * If another request already resided at index `d`, it is replaced and discarded - ``` + ``` text Batch: [A,B,C,D] Unidirectionally Move s -> d: 3 -> 1 @@ -344,7 +351,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, * If the Move specifies `SWAP`, the requests at `s` and `d` exchange indices - ``` + ``` text Batch: [A,B,C,D] Swap Move s <-> d: 3 <-> 1 @@ -395,7 +402,7 @@ Notes: The following example models an engine step where 1 new request is introduced and 2 finished requests are eliminated, additionally the attention backend performs a swap to optimize the batch ordering. -``` +``` text Batch state (beginning of engine step): [A,B,C,D] Batch size: 4 @@ -429,7 +436,7 @@ Batch size: 3 The resulting `BatchUpdate` data structure will look like -``` +``` text BatchUpdate instance * added: [(0,E's SamplingParams,E's prompt tokens ref,E's output tokens ref)] * removed: [2] # request C was removed without replacement @@ -440,7 +447,7 @@ BatchUpdate instance The following example models an engine step where 2 new requests are introduced and 1 finished request is eliminated, additionally the attention backend performs a swap to optimize the batch ordering. -``` +``` text Batch state (beginning of engine step): [A,B,C,D] Batch size: 4 @@ -471,7 +478,7 @@ Note that batch condensation is skipped because there are no empty slots left be The resulting `BatchUpdate` data structure will look like -``` +``` text BatchUpdate instance * added: [(2,E's SamplingParams,E's prompt tokens ref,E's output tokens ref),(4,F's SamplingParams,F's prompt tokens ref,F's output tokens ref)] * removed: [] # no requests were removed without replacement @@ -490,7 +497,7 @@ BatchUpdate instance 1. 
**The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API - 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. $0.0$. Try to save compute and memory for requests which disable the logits processor + 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. 
To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 23b75daf97e8..52e24f366761 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -6,7 +6,7 @@ This document shows how to write, load and use a custom logits processor. Review the [logits processor design documentation](../design/logits_processors.md) for baseline guidance on writing correct and efficient logits processors. -## Writing a Custom Logits Procesor +## Writing a Custom Logits Processor Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although your are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) @@ -132,7 +132,7 @@ See examples below: During initialization, vLLM automatically scans the `vllm.logits_processors` entry point group and loads any installed logits processors which it finds. -Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your logits processor Python package. 
The example below shows how to add an entrypoint to your project's `.toml` file: +Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your logits processor Python package. The example below shows how to add an entrypoint to your project's `pyproject.toml` file: ??? code "Exposing a custom logits processor as a Python entrypoint" From 61dd26add0422e30669b96456c019a951188989d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 21 Aug 2025 10:54:08 -0400 Subject: [PATCH 13/31] lint Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index ac7967ff104b..b69e5d51b9a8 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -271,7 +271,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, * **Add:** add (or replace existing request with) a new request at index `i` * An Add is represented in `Batchupdate.added` as a tuple of - + ``` text (index, new request SamplingParams, prompt token ids, output token ids) ``` @@ -320,7 +320,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, * **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d` * A Move is represented in `Batchupdate.moved` as a tuple of - + ``` text (s, d, UNIDIRECTIONAL or SWAP) ``` @@ -379,7 +379,7 @@ Logits processor `update_state()` implementations should assume the following mo 2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1` 3. 
*If there are fewer new requests than finished requests:* - + * Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous From 68e3789c7f5153347f1837c259b96370c5a96836 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 21 Aug 2025 11:22:13 -0400 Subject: [PATCH 14/31] fixes Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 44 ++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index b69e5d51b9a8..bc8c56bf9a0e 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -1,4 +1,4 @@ -# Logits Processor Support in vLLM +# Logits Processors This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors. @@ -18,7 +18,9 @@ In each engine step, the vLLM engine will (1) update each logits processor's int ### Updating logits processor internal state -The vLLM model runner invokes each logits processor's `update_state()` method at the end of each engine step. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the end of the current step. 
The pseudocode below shows that the vLLM model runner computes updates to the persistent batch state and then notifies each logits processor of the state changes: +At the beginning of each engine step, the persistent batch adds, discards and reorders requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step. + +The pseudocode below shows the process by which the vLLM model runner notifies each logits processor of changes in persistent batch state: ??? code "Model Runner Updates Logits Processor States" @@ -38,8 +40,8 @@ The vLLM model runner invokes each logits processor's `update_state()` method at ... - # Update persistent batch to reflect new/finished requests & reordering - # of requests within batch + # ...update persistent batch to reflect new/finished requests & reordering + # of requests within batch... ... @@ -68,8 +70,8 @@ The vLLM model runner invokes each logits processor's `update_state()` method at @dataclass(frozen=True) class BatchUpdate: - # Batch state-change data structure which is passed to logits processor - # update_state() method + # Batch state-change data structure which is passed to logits processors' + # update_state() methods batch_size: int @@ -79,14 +81,11 @@ The vLLM model runner invokes each logits processor's `update_state()` method at ``` - !!! note - `InputBatch.refresh_metadata()` generates a `BatchUpdate` data structure - representing the persistent batch state changes resulting from new, finished and reordered requests - and passes that data structure to the logits processors' `update_state()` methods. 
- ### Applying logits processors to the model output logits -The pseudocode below shows how the vLLM model runner invokes the sampler, which in turn invokes the logits processors' `apply()` methods against the model output logit processors. +After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logit processors. This process is shown in the pseudocode below. -Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata`, the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`. +Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata` (not shown in the code below), the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`. ??? code "Apply logits processors to model output logits" @@ -98,11 +97,18 @@ Note that the sampler will access the logits processors via `SamplingMetadata.lo ... def execute_model(self, scheduler_output, ...): + # (discussed in previous section) + self._update_states(scheduler_output) + + ... + + # ...run model inference to obtain logits... ... + # Invoke sampler, which applies logits processors sampler_output = self.sampler(logits=logits, - sampling_metadata=sampling_metadata) + sampling_metadata=sampling_metadata) ... @@ -125,14 +131,14 @@ Note that the sampler will access the logits processors via `SamplingMetadata.lo ... - # Return sampler output data structure + # ...return sampler output data structure... def sample(self, logits, sampling_metadta) ... 
- # Exit early if all requests are greedy-sampling + # ...exit early if all requests are greedy-sampling... ... @@ -142,16 +148,16 @@ Note that the sampler will access the logits processors via `SamplingMetadata.lo ... - # Perform sampling and return sampling result + # ...perform sampling and return sampling result... ``` -At sampling time, the engine saves compute by skipping "argmax-invariant" logits processors in the edge-case where all requests employ greedy sampling. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request). +At sampling time, the sampler checks whether all requests in the persistent batch employ greedy sampling. If that is the case, the sampler saves compute by skipping "argmax-invariant" logits processors. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request). * An **argmax-invariant logits processor** is a logits processor (such as Min-P) which does not modify the argmax. For example, a logits processor which masks out the lowest-probability tokens will not change which token ID has the max logit. Greedy sampling always picks the highest-logit-value token ID, and so conceptually an argmax-invariant logits processor can be skipped for greedy sampling requests. * A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests. 
-The vLLM logits processor abstraction requires the engine to pass in state updates at batch granularity; therefore in practice state updates for argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. +The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the `apply()` calls to argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. ## Logits Processor Programming Model @@ -261,7 +267,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling * `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: - * Consume a `BatchUpdate` data structure representing persistent batch state changes at the end of the current engine step + * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step * Batch update data structure may be `None`, signaling no state-change ### `BatchUpdate` data structure @@ -360,7 +366,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, New Batch: [A,D,C,B] # Swap B and D ``` -Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the end of the engine step. +Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the beginning of the engine step. 
### How the vLLM engine builds the `BatchUpdate` data structure From 088e94841bfd865e92574ff29f5084929ea27e6b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 21 Aug 2025 11:25:31 -0400 Subject: [PATCH 15/31] fix Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index bc8c56bf9a0e..f4b6467b7c33 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -66,7 +66,7 @@ The pseudocode below shows the process by which the vLLM model runner notifies e ... - # logits_processor/interface.py + # vllm/v1/sample/logits_processor/interface.py @dataclass(frozen=True) class BatchUpdate: @@ -515,7 +515,7 @@ BatchUpdate instance ### Built-In Logits Processors -Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently supports the following built-in logits processors based on the programming model described above: +Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. 
vLLM currently supports the following built-in logits processors based on the programming model described above: * Min-P From 40185e08c813643c3d33c893e4f3f525e3ba2f1b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 21 Aug 2025 12:11:16 -0400 Subject: [PATCH 16/31] cap Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index f4b6467b7c33..cadbe9884ede 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -16,7 +16,7 @@ In order to operate on the entire batch at once, each logits processor may maint In each engine step, the vLLM engine will (1) update each logits processor's internal state and (2) apply logits processors to the model output logits. -### Updating logits processor internal state +### Updating Logits Processor Internal State At the beginning of each engine step, the persistent batch adds, discards and reorders requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step. @@ -81,7 +81,7 @@ The pseudocode below shows the process by which the vLLM model runner notifies e ``` -### Applying logits processors to the model output logits +### Applying Logits Processors to the Model Output Logits After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logit processors. This process is shown in the pseudocode below. 
From d085a20193dab79d016918684bf4c2e17acc3b72 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Mon, 25 Aug 2025 09:29:37 -0400 Subject: [PATCH 17/31] Update examples/offline_inference/logits_processor.py Co-authored-by: Joseph Marinier Signed-off-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> --- examples/offline_inference/logits_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py index a08695795a73..165717f6bfed 100644 --- a/examples/offline_inference/logits_processor.py +++ b/examples/offline_inference/logits_processor.py @@ -56,7 +56,6 @@ def __init__( self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: - """Never impacts greedy sampling""" return False def update_state(self, batch_update: Optional[BatchUpdate]): From e9a5ad02a39b105c6874ec6b1a7c8f2d273e8bfe Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 25 Aug 2025 10:00:02 -0400 Subject: [PATCH 18/31] wip Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index cadbe9884ede..17680a1f75c8 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -20,7 +20,7 @@ In each engine step, the vLLM engine will (1) update each logits processor's int At the beginning of each engine step, the persistent batch adds, discards and reorders requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step. 
-The pseudocode below shows the process by which the vLLM model runner notifies each logits processor of changes in persistent batch state: +The pseudocode below shows the process by which the vLLM persistent batch notifies each logits processor of changes in batch state: ??? code "Model Runner Updates Logits Processor States" @@ -157,7 +157,7 @@ At sampling time, the sampler checks whether all requests in the persistent batc * A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests. -The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the `apply()` calls to argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. +The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. 
## Logits Processor Programming Model @@ -268,11 +268,12 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step - * Batch update data structure may be `None`, signaling no state-change + * Use the `BatchUpdate` members to update logits processor internal state + * **Note:** batch update data structure may be `None`, signaling no state-change ### `BatchUpdate` data structure -The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (summarized below along with a schematic representation of how the batch is modified by the operation): +The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state: * **Add:** add (or replace existing request with) a new request at index `i` From 9de11fc84c73a3b2bb0bb1117be7803714ab10ff Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 4 Sep 2025 11:35:01 -0400 Subject: [PATCH 19/31] reorder Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 4 ++-- vllm/v1/sample/logits_processor/interface.py | 6 +++--- vllm/v1/sample/logits_processor/state.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 17680a1f75c8..c7453250643e 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -18,7 +18,7 @@ In each engine step, the vLLM engine will (1) update each logits processor's int ### Updating Logits Processor Internal State -At the beginning of each engine step, the persistent batch adds, discards and reorders requests in response to the scheduler output. 
After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step. +At the beginning of each engine step, the persistent batch may add, discard and/or reorder requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step. The pseudocode below shows the process by which the vLLM persistent batch notifies each logits processor of changes in batch state: @@ -76,8 +76,8 @@ The pseudocode below shows the process by which the vLLM persistent batch notifi batch_size: int removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] added: Sequence[AddedRequest] + moved: Sequence[MovedRequest] ``` diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 683fc7c00dfb..04027359909a 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -21,6 +21,9 @@ class MoveDirectionality(Enum): SWAP = auto() +# Batch indices of any removed requests. +RemovedRequest = int + # (index, params, prompt_tok_ids, output_tok_ids) tuples for new # requests added to the batch. AddedRequest = tuple[int, SamplingParams, list[int], list[int]] @@ -29,9 +32,6 @@ class MoveDirectionality(Enum): # one-way moves or two-way swaps of requests in batch MovedRequest = tuple[int, int, MoveDirectionality] -# Batch indices of any removed requests. 
-RemovedRequest = int - @dataclass(frozen=True) class BatchUpdate: diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py index 31cece58c7db..0a1196559d3e 100644 --- a/vllm/v1/sample/logits_processor/state.py +++ b/vllm/v1/sample/logits_processor/state.py @@ -36,18 +36,18 @@ class BatchUpdateBuilder: _removed: list[RemovedRequest] _is_removed_sorted: bool - moved: list[MovedRequest] added: list[AddedRequest] + moved: list[MovedRequest] def __init__( self, removed: Optional[list[RemovedRequest]] = None, - moved: Optional[list[MovedRequest]] = None, added: Optional[list[AddedRequest]] = None, + moved: Optional[list[MovedRequest]] = None, ) -> None: self._removed = removed or [] - self.moved = moved or [] self.added = added or [] + self.moved = moved or [] self._is_removed_sorted = False # Used to track changes in the pooling case @@ -107,8 +107,8 @@ def reset(self) -> bool: """Returns True if there were any changes to the batch.""" self._is_removed_sorted = False self._removed.clear() - self.moved.clear() self.added.clear() + self.moved.clear() batch_changed = self.batch_changed self.batch_changed = False return batch_changed From bb2b30245ae8872998aae1801f4a9d723fd2a2c9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 5 Sep 2025 09:54:43 -0400 Subject: [PATCH 20/31] feedback Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 24 +++++++++++++++--------- docs/features/custom_logitsprocs.md | 4 +++- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index c7453250643e..8a2986920e0b 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -83,7 +83,7 @@ The pseudocode below shows the process by which the vLLM persistent batch notifi ### Applying Logits Processors to the Model Output Logits -After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. 
Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logit processors. This process is shown in the pseudocode below. +After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logits, yielding transformed logits (the `apply()` methods may modify the logits in-place or out-of-place, although in-place is more memory-efficient). This process is shown in the pseudocode below. Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata` (not shown in the code below), the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`. 
@@ -255,12 +255,16 @@ The previous sections alluded to the interfaces which vLLM logits processors mus A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods: -* `__init__()` +* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)` + * `vllm_config`: engine configuration data structure + * `device`: hardware accelerator device info + * `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation * `apply(self, logits: torch.Tensor) -> torch.Tensor`: * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`) * Apply logits processor transformation at batch granularity * Return a transformed `(num_requests) x (vocab_size)` logits tensor + * You can modify the input logits in-place or out-of-place; in-place is more memory-efficient * `is_argmax_invariant(self) -> bool`: * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax @@ -269,13 +273,13 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step * Use the `BatchUpdate` members to update logits processor internal state - * **Note:** batch update data structure may be `None`, signaling no state-change + * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added. 
### `BatchUpdate` data structure -The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state: +The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`): -* **Add:** add (or replace existing request with) a new request at index `i` +* **Add:** add (or replace existing request with) a new request at index `i`. If a request is replaced, its associated state should be discarded. * An Add is represented in `Batchupdate.added` as a tuple of @@ -283,7 +287,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, (index, new request SamplingParams, prompt token ids, output token ids) ``` - * `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference + * `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference. **This is important for LogitsProcessors that take into account the tokens generated so far**. * The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. 
For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields @@ -397,7 +401,9 @@ Logits processor `update_state()` implementations should assume the following mo Notes: -* The index argument for Add and Remove operations refers to the index *at the time the Add or Remove occurred*, i.e. before any Move operations +* A logits processor `update_state()` method must process batch update operations in the following order: adds, removes, moves + +* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3 * In other words Move operations can be assumed to be applied after Adds and Removes @@ -516,7 +522,7 @@ BatchUpdate instance ### Built-In Logits Processors -Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently supports the following built-in logits processors based on the programming model described above: +Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. 
vLLM currently employs the following built-in logits processors based on the programming model described above: * Min-P @@ -526,7 +532,7 @@ Built-in logits processors are always loaded when the vLLM engine starts. See th Review these logits processor implementations for guidance on writing built-in logits processors. -Additionally, the following logits processors or logits-processor-like functionalities are hard-coded into the sampler for efficiency and do not utilize the programming model described above, but may be updated to use the aforemented logits processor programming model in the future: +Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model. * Allowed token IDs diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 52e24f366761..a5b01664f87e 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -8,7 +8,7 @@ Review the [logits processor design documentation](../design/logits_processors.m ## Writing a Custom Logits Processor -Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although your are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) +Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. 
Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) In vLLM logits processors operate at batch granularity. The contrived example below implements a custom logits processor which consumes a `(num\_requests) \times (vocab\_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request: @@ -44,6 +44,8 @@ In vLLM logits processors operate at batch granularity. The contrived example be if params.extra_args and (target_token := params.extra_args.get("target_token")): self.req_info[index] = target_token + else: + self.req_info.pop(index, None) if self.req_info: # Process removed requests. From b7779827cfb83ac7ba8593fb494fc571525098ee Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 5 Sep 2025 10:18:12 -0400 Subject: [PATCH 21/31] AsyncLLM Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index a5b01664f87e..1a1ac9110721 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -122,6 +122,15 @@ See examples below: ) ``` +??? 
code "Passing custom logits processor FQCN to `AsyncLLM` in Python" + + ``` python + # Pass in FQCN + engine_args = AsyncEngineArgs(model="facebook/opt-125m", + logits_processors=["your.module.path:DummyLogitsProcessor"]) + async_llm = AsyncLLM.from_engine_args(engine_args) + ``` + ??? code "Passing custom logits processor FQCN to vLLM server via CLI" ```bash @@ -172,6 +181,11 @@ You can pass one or more custom logits processor class objects to the `LLM` cons model="facebook/opt-125m", logits_processors=[DummyLogitsProcessor], ) + + # Pass class object to AsyncLLM constructor + engine_args = AsyncEngineArgs(model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor]) + async_llm = AsyncLLM.from_engine_args(engine_args) ``` ## Invoking a Custom Logits Processor Against a Request From 91c79b7af02cdb9f92de860abda7e4b6e6f73bc5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 5 Sep 2025 10:23:14 -0400 Subject: [PATCH 22/31] reorder Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 8a2986920e0b..4ffbf19493fe 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -279,6 +279,21 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`): +* **Remove:** remove (without replacement) request at index `i` + + * A Remove is represented in `Batchupdate.removed` by an `int` (representing `i`) + + * Effect of remove-at-index on batch: + + ``` text + Batch: [A,B,C] + Remove @ i: 1 + + => + + New Batch: [A,x,C] # Discard B and leave an empty slot + 
``` + * **Add:** add (or replace existing request with) a new request at index `i`. If a request is replaced, its associated state should be discarded. * An Add is represented in `Batchupdate.added` as a tuple of @@ -313,21 +328,6 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, New Batch: [A,B,C,D] # Add D, extending batch ``` -* **Remove:** remove (without replacement) request at index `i` - - * A Remove is represented in `Batchupdate.removed` by an `int` (representing `i`) - - * Effect of remove-at-index on batch: - - ``` text - Batch: [A,B,C] - Remove @ i: 1 - - => - - New Batch: [A,x,C] # Discard B and leave an empty slot - ``` - * **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d` * A Move is represented in `Batchupdate.moved` as a tuple of @@ -401,7 +401,7 @@ Logits processor `update_state()` implementations should assume the following mo Notes: -* A logits processor `update_state()` method must process batch update operations in the following order: adds, removes, moves +* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves * The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. 
before any Move operations * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3 From 32b7c1ebc88004d328750c8a482d0cb6ad910969 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 5 Sep 2025 10:34:33 -0400 Subject: [PATCH 23/31] reorg Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 100 ++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 1a1ac9110721..c4ab69aa08b2 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -4,13 +4,81 @@ A "custom" logits processor is written by a user of vLLM and is loaded into vLLM This document shows how to write, load and use a custom logits processor. -Review the [logits processor design documentation](../design/logits_processors.md) for baseline guidance on writing correct and efficient logits processors. +## Logits Processors Background -## Writing a Custom Logits Processor +A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. -Custom logits processors must be subclasses of `vllm.v1.sample.logits_processor.LogitsProcessor`. Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) +In vLLM, logits processors operate at batch granularity. 
During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. -In vLLM logits processors operate at batch granularity. The contrived example below implements a custom logits processor which consumes a `(num\_requests) \times (vocab\_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request: +## Creating a Custom Logits Processor + +Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods: + +* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)` + * `vllm_config`: engine configuration data structure + * `device`: hardware accelerator device info + * `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation + +* `apply(self, logits: torch.Tensor) -> torch.Tensor`: + * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`) + * Apply logits processor transformation at batch granularity + * Return a transformed `(num_requests) x (vocab_size)` logits tensor + * You can modify the input logits tensor in-place or out-of-place; in-place is more memory-efficient + +* `is_argmax_invariant(self) -> bool`: + * Return `True` if the logits processor is argmax invariant (never changes which token ID has the highest logit value for a given request), `False` if the logits 
processor may modify argmax + * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling + +* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: + * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step + * Use the `BatchUpdate` members to update logits processor internal state + * **Note:** the batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added. + +### How the vLLM engine builds the `BatchUpdate` data structure + +Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction): + +1. Identify indices of requests which finished in the current engine step + +2. Identify new requests introduced in the current step + +3. Use Add operations to replace as many finished requests as possible with new requests, in order of increasing index of the replaced request starting with the lowest index + +4. Based on the relative number of new and finished requests: + + 1. If the numbers of new and finished requests are the same, proceed to the next step + + 2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1` + + 3. *If there are fewer new requests than finished requests:* + + * Apply Remove operations to finished requests which were not replaced with new requests. 
These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state + + * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous + + * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots + +5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch + +Notes: + +* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves + +* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. 
before any Move operations + * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3 + * In other words Move operations can be assumed to be applied after Adds and Removes + +* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved` + +* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None` + +### Passing Custom Argument to a Custom Logits Processor + +Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) + +## Example Custom Logits Processor Implementation + +The contrived example below implements a custom logits processor which consumes a `(num\_requests) \times (vocab\_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request: ??? code "Example custom logits processor definition" @@ -82,11 +150,31 @@ In vLLM logits processors operate at batch granularity. The contrived example be return logits ``` -Throughout this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor. +In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor. 
+ +The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch. + +## Best Practices for Writing Built-In Logits Processors Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. -The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch. +* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity + * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` + * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor + +* It is up to the logits processor author to determine: + + 1. 
**The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API + + 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor + + 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor + + * Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None` + +* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. 
requests which are replaced by an Add or which are subject to a Remove) + +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method ## Ways to Load Your Custom Logits Processor in vLLM From 1b4e4e28e85c9215982b252b27b811b7e0dadfb0 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 5 Sep 2025 10:38:11 -0400 Subject: [PATCH 24/31] AsyncLLM example Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index c4ab69aa08b2..4039cb4f2aa3 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -308,10 +308,22 @@ The examples below show how a user would pass a custom argument (`target_token`) ) ``` -??? code "Offline: configure custom logits processor for a request" +??? code "Offline: configure custom logits processor for an `LLM` request" ``` python outputs_logitproc = llm.generate("your prompt", SamplingParams(..., extra_args={"target_token": 67})) ``` + +??? code "Offline: configure custom logits processor for an `AsyncLLM` request" + + ``` python + async for out in async_llm.generate(request_id="your request id", + prompt="your prompt", + sampling_params=SamplingParams(..., + extra_args={"target_token": 67})): + + # Process async request outputs + ... 
+ ``` From 95f9ba70386b2dffaa67747838a131376df35bde Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 5 Sep 2025 10:43:01 -0400 Subject: [PATCH 25/31] warnings Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 4 ++++ docs/features/custom_logitsprocs.md | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 4ffbf19493fe..4348b1245361 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -1,5 +1,9 @@ # Logits Processors +!!! important + Some logits processor design changes are still in progress and the API may + change in the near future. We hope to stabilize this part of the API soon. + This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors. ## Logits Processors Background diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 4039cb4f2aa3..6a07016da463 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -1,5 +1,9 @@ # Custom Logits Processors +!!! important + Some logits processor design changes are still in progress and the API may + change in the near future. We hope to stabilize this part of the API soon. + A "custom" logits processor is written by a user of vLLM and is loaded into vLLM at initialization without needing to modify or recompile the vLLM source code. It is the opposite of a built-in logits processor. This document shows how to write, load and use a custom logits processor. 
From f3e173d33524ce6b6a2b385b5d0db28f39e96966 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 9 Sep 2025 02:51:11 -0400 Subject: [PATCH 26/31] wrapped lps Signed-off-by: Andrew Feldman --- docs/design/logits_processors.md | 6 +- docs/features/custom_logitsprocs.md | 124 ++++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 12 deletions(-) diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 4348b1245361..20d78ca3aae2 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -479,7 +479,7 @@ Processing steps (using BatchUpdate abstraction): [A,B,E,D] # Discard C Batch size: 4 -2. Add E at index 4 (current max batch index + 1) +2. Add F at index 4 (current max batch index + 1) [A,B,E,D,F] # Extend batch by 1 Batch size: 5 @@ -514,9 +514,9 @@ BatchUpdate instance 1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API - 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor + 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the built-in logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. 
by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor - 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor + 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the built-in logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the built-in logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. 
Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor * Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None` diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 6a07016da463..384300f7f656 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -80,7 +80,7 @@ Notes: Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.) -## Example Custom Logits Processor Implementation +### Example Custom Logits Processor Implementation The contrived example below implements a custom logits processor which consumes a `(num\_requests) \times (vocab\_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request: @@ -158,26 +158,132 @@ In the rest of this document, we will use `DummyLogitsProcessor` as an example o The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. 
`update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch. -## Best Practices for Writing Built-In Logits Processors +### Wrapping an Existing Request-Level Logits Processor + +Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation: + +``` python +RequestLogitsProcessor = Union[ + + # (output token ids, logits tensor) -> logits tensor + Callable[[list[int], Tensor], Tensor], + + # (prompt token ids, output token ids, logits tensor) -> logits tensor + Callable[[list[int], list[int], Tensor], Tensor], +] +``` + +While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above. + +You can wrap the request-level processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) 
Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance: + +??? code "Example of Wrapping a Request-Level Logits Processor" + + ``` python + ... + + from vllm.v1.sample.logits_processor import ( + AdapterLogitsProcessor, # Wrapper base-class + RequestLogitsProcessor, # Request-level logitsproc type annotation + ) + + ... + + # Stand-in for your request-level logits processor: + class DummyPerReqLogitsProcessor: + """The request-level logits processor masks out all logits except the + token id identified by `target_token`""" + + def __init__(self, target_token: int) -> None: + """Specify `target_token`""" + self.target_token = target_token + + def __call__( + self, + output_ids: list[int], + logits: torch.Tensor, + ) -> torch.Tensor: + val_to_keep = logits[self.target_token].item() + logits[:] = float("-inf") + logits[self.target_token] = val_to_keep + return logits + + ... + + # Example of wrapping the request-level logits processor: + class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): + """Example of wrapping a fake request-level logit processor to create a + batch-level logits processor""" + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor( + self, + params: SamplingParams, + ) -> Optional[RequestLogitsProcessor]: + """This method returns a new request-level logits processor, customized + to the `target_token` value associated with a particular request. + + Returns None if the logits processor should not be applied to the + particular request. To use the logits processor the request must have + a "target_token" custom argument with an integer value. 
+ + Args: + params: per-request sampling params + + Returns: + `Callable` request logits processor, or None + """ + target_token: Optional[Any] = params.extra_args and params.extra_args.get( + "target_token" + ) + if target_token is None: + return None + if not isinstance(target_token, int): + logger.warning( + "target_token value %s is not int; not applying logits" + " processor to request.", + target_token, + ) + return None + return DummyPerReqLogitsProcessor(target_token) + ``` + +Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request level logits processor, you can pass the custom subclass to vLLM (this will be described in subsequent sections.) + +!!! note + Your `new_req_logits_processor()` override can return `None` to signal that the wrapped logits processor should not be applied to the request in question. + +## Best Practices for Writing Custom Logits Processors Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. * Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. 
the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor + * **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity * It is up to the logits processor author to determine: - 1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API + 1. **The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state + + * **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance. 2. 
**The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor + + * **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. 
Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor - * Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None` + * Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None` + + * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default * Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove) + * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default + * `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method ## Ways to Load Your Custom Logits Processor in vLLM @@ -188,7 +294,7 @@ This section details different ways of making your logits processor visible to v ### Method 1: Pass the Custom Logits Processor Fully-Qualified Class Name (FQCN) to vLLM at Initialization Time -This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` Python constructor, or as a CLI argument to `vllm serve` with the following syntax +This method is supported in both offline and online vLLM usage scenarios. 
The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` and `AsyncLLM` Python constructors, or as a CLI argument to `vllm serve` with the following syntax ``` bash vllm serve ... --logits_processors ... @@ -244,16 +350,16 @@ Suppose that you have developed a Python package that holds your custom logits p dummy_logits_processor = "your.module.path:DummyLogitsProcessor" ``` -Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` constructor or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point. +Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` or `AsyncLLM` constructors or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point. !!! note vLLM will *always* load *all* logits processors which are exposed via entrypoints under the `vllm.logits_processors` grouping. ### Method 3 (Offline-only): Pass a Python Class Object to the vLLM Constructor -You can pass one or more custom logits processor class objects to the `LLM` constructor. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` is instantiated, or (2) imported from a Python package. +You can pass one or more custom logits processor class objects to the `LLM` and `AsyncLLM` constructors. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` or `AsyncLLM` is instantiated, or (2) imported from a Python package. -??? code "Passing custom logits processor class object to `LLM` in Python" +??? 
code "Passing custom logits processor class object to `LLM` or `AsyncLLM` in Python" ``` python # Import custom logits processor @@ -282,7 +388,7 @@ You can pass one or more custom logits processor class objects to the `LLM` cons ## Invoking a Custom Logits Processor Against a Request -The design of the custom logits processor determines whether the logits processor must be enabled/disabled for a given request, and what arguments must be provided to configure the logits processor. For more information, review [the logits processors design documentation](../design/logits_processors.md). +The design of the custom logits processor determines whether the logits processor must be enabled/disabled for a given request, and what arguments must be provided to configure the logits processor. The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior. From b7e59120f3ae4cb80fcc78d7052aae669a10482b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 9 Sep 2025 02:59:34 -0400 Subject: [PATCH 27/31] refactor Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 66 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 384300f7f656..0f161bdb00cf 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -175,7 +175,7 @@ RequestLogitsProcessor = Union[ While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. 
The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above. -You can wrap the request-level processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance: +You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance: ??? code "Example of Wrapping a Request-Level Logits Processor" @@ -249,42 +249,11 @@ You can wrap the request-level processor by subclassing `AdapterLogitsProcessor` return None return DummyPerReqLogitsProcessor(target_token) ``` - -Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request level logits processor, you can pass the custom subclass to vLLM (this will be described in subsequent sections.) !!! 
note Your `new_req_logits_processor()` override can return `None` to signal that the wrapped logits processor should not be applied to the request in question. -## Best Practices for Writing Custom Logits Processors - -Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. - -* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity - * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` - * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor - * **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. 
If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity - -* It is up to the logits processor author to determine: - - 1. **The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state - - * **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance. - - 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor - - * **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request - - 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. 
For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor - - * Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None` - - * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default - -* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove) - - * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default - -* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method +Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request-level logits processor, you can pass the custom subclass to vLLM via any of the methods described in the following section. ## Ways to Load Your Custom Logits Processor in vLLM @@ -437,3 +406,34 @@ The examples below show how a user would pass a custom argument (`target_token`) # Process async request outputs ...
``` + +## Best Practices for Writing Custom Logits Processors + +Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. + +* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity + * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` + * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor + * **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity + +* It is up to the logits processor author to determine: + + 1. 
**The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state + + * **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance. + + 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor + + * **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request + + 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. 
Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor + + * Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None` + + * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default + +* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove) + + * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default + +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method \ No newline at end of file From c70376d04546aad16de339ebde2b50b0584c7079 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 9 Sep 2025 07:25:17 -0400 Subject: [PATCH 28/31] lint failures Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 0f161bdb00cf..19525571f84b 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -419,11 +419,11 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke * It is up to the logits processor author to determine: 1. 
**The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state - + * **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance. 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor - + * **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request 3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. 
Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor @@ -436,4 +436,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default -* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method \ No newline at end of file +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (e.g. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method. From bfc6ac57b3395b73a98109e9d3827c6eef2d1357 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 9 Sep 2025 08:27:27 -0400 Subject: [PATCH 29/31] retrigger checks Signed-off-by: Andrew Feldman From 6c5f58de792c6497e1cd02eb6e9331bc2f94cbd2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 16 Sep 2025 20:22:45 -0400 Subject: [PATCH 30/31] disclaimer Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 19525571f84b..ef186901e967 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -40,6 +40,11 @@ Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsPr ### How the vLLM engine builds the `BatchUpdate` data structure +!!!
important + Some logits processors design changes are still in progress, we expect + that once those changes are complete the information in this section will + become irrelevant + Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction): 1. Identify indices of requests which finished in the current engine step From 758e6b16af66c664e3c4abf84218d204f2c6b072 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 16 Sep 2025 20:26:45 -0400 Subject: [PATCH 31/31] more disclaimer Signed-off-by: Andrew Feldman --- docs/features/custom_logitsprocs.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index ef186901e967..201b340c5972 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -41,9 +41,10 @@ Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsPr ### How the vLLM engine builds the `BatchUpdate` data structure !!! important - Some logits processors design changes are still in progress, we expect - that once those changes are complete the information in this section will - become irrelevant + Some logits processor design changes are still in progress. We expect + that in the future you will not need to account for batch state changes + when implementing a logits processor, and the information in this section + will become irrelevant. Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):