Skip to content

Commit 3cbbd4c

Browse files
committed
chore: refactor logits processors to be in-place
1 parent f5744d8 commit 3cbbd4c

File tree

4 files changed

+18
-21
lines changed

4 files changed

+18
-21
lines changed

components/backends/trtllm/src/dynamo/trtllm/logits_processing/adapter.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,17 @@ def __call__(
5050
stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
5151
try:
5252
with torch.cuda.stream(stream):
53-
for idx, (ids_req, logits_req) in enumerate(zip(ids, logits)):
54-
if logits_req.shape[0] != 1:
55-
raise ValueError(
56-
"Logits processing with beam width > 1 is not supported"
57-
)
58-
# Remove dimension 0 from logits_req
59-
modified_logits = self.processor(ids_req, logits_req.reshape(-1))
60-
61-
# TRT-LLM expects in-place modification
62-
logits[idx, 0, :].copy_(modified_logits)
53+
if logits.shape[0] != 1:
54+
raise ValueError(
55+
f"This logits adapter only supports per-request logits processing. "
56+
f"Received logits with batch size {logits.shape[0]} expected 1"
57+
)
58+
if logits.shape[1] != 1:
59+
raise ValueError(
60+
"Logits processing with beam width > 1 is not supported"
61+
)
62+
# Call the processor which modifies the logits in-place
63+
self.processor(ids[0], logits[0, 0, :])
6364

6465
except Exception as e:
6566
logger.error(f"Error in logits processor for request {req_ids}: {e}")

lib/bindings/python/src/dynamo/logits_processing/base.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,14 @@ def __call__(
2626
self,
2727
input_ids: Sequence[int],
2828
logits: torch.Tensor,
29-
) -> torch.Tensor:
29+
):
3030
"""
3131
Process the logits for the next token prediction.
3232
3333
Args:
3434
input_ids: The input token IDs generated so far.
3535
logits: The raw logits for the next token. Shape: (vocab_size,)
3636
37-
Returns:
38-
A tensor with the same shape, dtype, and device as `logits`.
37+
The processor is expected to modify the logits in-place.
3938
"""
4039
...

lib/bindings/python/src/dynamo/logits_processing/examples/hello_world.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase):
2323
self.eos_id = tokenizer.eos_token_id
2424
self.state = 0
2525

26-
def __call__(self, input_ids: Sequence[int], scores: torch.Tensor) -> torch.Tensor:
26+
def __call__(self, input_ids: Sequence[int], scores: torch.Tensor):
2727
mask = torch.full_like(scores, float("-inf"))
2828

2929
if self.state < len(self.token_ids):
@@ -36,5 +36,3 @@ def __call__(self, input_ids: Sequence[int], scores: torch.Tensor) -> torch.Tens
3636
# The `scores` tensor *must* also be modified in-place
3737
scores.add_(mask)
3838
self.state += 1
39-
40-
return scores

lib/bindings/python/src/dynamo/logits_processing/examples/temperature.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,16 @@ def __init__(self, temperature: float = 1.0):
2626
raise ValueError("Temperature must be positive")
2727
self.temperature = temperature
2828

29-
def __call__(self, input_ids: Sequence[int], logits: torch.Tensor) -> torch.Tensor:
29+
def __call__(self, input_ids: Sequence[int], logits: torch.Tensor):
3030
"""
3131
Apply temperature scaling to logits.
3232
3333
Args:
3434
input_ids: Token IDs generated so far (unused in this simple example)
3535
logits: Raw logits tensor from model
3636
37-
Returns:
38-
Temperature-scaled logits tensor
37+
The processor is expected to modify the logits in-place.
3938
"""
4039
if self.temperature == 1.0:
41-
return logits
42-
return logits / self.temperature
40+
return
41+
logits.div_(self.temperature)

0 commit comments

Comments (0)