Commit 536ad29

feat: add logits processor support for trtllm backend
1 parent 6cf96e0 commit 536ad29

File tree

8 files changed: +192 -2 lines changed

components/backends/trtllm/src/dynamo/trtllm/logits_processing/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from .adapter import TrtllmDynamoLogitsAdapter, create_trtllm_adapters
+
+__all__ = ["TrtllmDynamoLogitsAdapter", "create_trtllm_adapters"]

components/backends/trtllm/src/dynamo/trtllm/logits_processing/adapter.py

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import List, Optional
+
+import torch
+from tensorrt_llm.sampling_params import LogitsProcessor
+
+from dynamo.logits_processing import BaseLogitsProcessor
+
+logger = logging.getLogger(__name__)
+
+
+class TrtllmDynamoLogitsAdapter(LogitsProcessor):
+    """
+    Adapter that wraps Dynamo BaseLogitsProcessor instances to work with TensorRT-LLM's logits processor interface.
+
+    Inherits from tensorrt_llm.LogitsProcessor and implements the required interface:
+    __call__(self, req_ids: int, logits: torch.Tensor, ids: List[List[int]], stream_ptr, client_id: Optional[int])
+
+    This adapter maintains per-request state and converts between the two interfaces.
+    """
+
+    def __init__(self, processor: BaseLogitsProcessor):
+        super().__init__()
+        self.processor = processor
+
+    def __call__(
+        self,
+        req_ids: int,
+        logits: torch.Tensor,
+        ids: List[List[int]],
+        stream_ptr,
+        client_id: Optional[int] = None,
+    ):
+        """
+        TensorRT-LLM logits processor interface.
+
+        Args:
+            req_ids: Request identifier
+            logits: Logits tensor for the current step
+            ids: List of token sequences (batch of sequences)
+            stream_ptr: CUDA stream pointer
+            client_id: Optional client identifier
+
+        Returns:
+            None; the logits tensor is modified in place, as TRT-LLM expects
+        """
+        print(f"Shapes: logits {logits.shape}, ids {ids}")
+        try:
+            for ids_req, logits_req in zip(ids, logits):
+                if logits_req.shape[0] != 1:
+                    raise ValueError(
+                        "Logits processing with beam width > 1 is not supported"
+                    )
+                # Flatten away the leading (beam) dimension of logits_req
+                modified_logits = self.processor(ids_req, logits_req.reshape(-1))
+
+                # TRT-LLM expects in-place modification
+                logits.copy_(modified_logits)
+
+        except Exception as e:
+            logger.error(f"Error in logits processor for request {req_ids}: {e}")
+            # Don't modify logits on error
+
+        # TRT-LLM expects void return (in-place modification)
+
+
+def create_trtllm_adapters(
+    processors: List[BaseLogitsProcessor],
+) -> List[TrtllmDynamoLogitsAdapter]:
+    """
+    Create TensorRT-LLM compatible adapters from Dynamo logits processors.
+
+    Args:
+        processors: List of Dynamo BaseLogitsProcessor instances
+
+    Returns:
+        List of TensorRT-LLM compatible logits processor adapters
+    """
+    adapters = []
+    for processor in processors:
+        adapter = TrtllmDynamoLogitsAdapter(processor)
+        adapters.append(adapter)
+    return adapters
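
For orientation, a minimal usage sketch (not part of this commit) of how a Dynamo processor is wrapped and attached to TRT-LLM sampling parameters; the TemperatureProcessor choice and the bare SamplingParams() construction are illustrative assumptions, while the logits_processor assignment mirrors the handler change below:

# Illustrative sketch only: wire a Dynamo processor into TRT-LLM via the adapter.
from tensorrt_llm import SamplingParams

from dynamo.logits_processing.examples import TemperatureProcessor
from dynamo.trtllm.logits_processing import create_trtllm_adapters

# Any object satisfying the BaseLogitsProcessor protocol can be adapted.
processors = [TemperatureProcessor(temperature=0.7)]
adapters = create_trtllm_adapters(processors)

# Attach the adapters to the per-request sampling parameters,
# mirroring what handler_base.py does in this commit.
sampling_params = SamplingParams()
sampling_params.logits_processor = adapters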

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 1 addition & 1 deletion

@@ -162,7 +162,7 @@ async def init(runtime: DistributedRuntime, config: Config):
         "pipeline_parallel_size": config.pipeline_parallel_size,
         "moe_expert_parallel_size": config.expert_parallel_size,
         "backend": "pytorch",
-        "skip_tokenizer_init": True,
+        "skip_tokenizer_init": False,
         "build_config": build_config,
         "kv_cache_config": kv_cache_config,
         "gpus_per_node": gpus_per_node,

components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 7 additions & 0 deletions

@@ -21,8 +21,10 @@
 from tensorrt_llm import SamplingParams
 from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams

+from dynamo.logits_processing.examples import HelloWorldLogitsProcessor
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.trtllm.engine import TensorRTLLMEngine
+from dynamo.trtllm.logits_processing import create_trtllm_adapters
 from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
 from dynamo.trtllm.publisher import Publisher
 from dynamo.trtllm.utils.disagg_utils import (
@@ -168,6 +170,11 @@ async def generate_locally(self, request: dict):
         request_id = request.get("id") or request.get("request_id", "unknown-id")
         model_name = request.get("model", "unknown_model")

+        # TODO: Just for testing. Hardcoding the hello world processor.
+        processors = [HelloWorldLogitsProcessor(self.engine.llm.tokenizer)]
+        adapters = create_trtllm_adapters(processors)
+        sampling_params.logits_processor = adapters
+
         # NEW: Updated engine call to include multimodal data
         async for res in self.engine.llm.generate_async(
             inputs=processed_input,  # Use the correctly extracted inputs
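
The TODO above hardcodes HelloWorldLogitsProcessor for every request. A possible follow-up, sketched here purely as an assumption (the use_hello_world request field is made up for illustration), would attach processors only when a request opts in:

# Hypothetical sketch, not in this commit: attach the example processor
# only when the request asks for it via a made-up "use_hello_world" flag.
processors = []
if request.get("use_hello_world", False):
    processors.append(HelloWorldLogitsProcessor(self.engine.llm.tokenizer))

if processors:
    sampling_params.logits_processor = create_trtllm_adapters(processors)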

lib/bindings/python/src/dynamo/logits_processing/base.py

Lines changed: 2 additions & 1 deletion

@@ -8,11 +8,12 @@
 logits processors must implement.
 """

-from typing import Protocol, Sequence
+from typing import Protocol, Sequence, runtime_checkable

 import torch


+@runtime_checkable
 class BaseLogitsProcessor(Protocol):
     """
     Protocol for logits processors in Dynamo.
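
Because the protocol is now runtime_checkable, callers can validate processor implementations structurally with isinstance; a small sketch (the NoOpProcessor class is illustrative only):

# Sketch: runtime_checkable allows structural isinstance checks against the protocol.
from typing import Sequence

import torch

from dynamo.logits_processing import BaseLogitsProcessor


class NoOpProcessor:
    def __call__(self, input_ids: Sequence[int], logits: torch.Tensor) -> torch.Tensor:
        return logits  # leave logits untouched


# Passes because NoOpProcessor exposes a matching __call__; note that
# runtime_checkable only verifies member presence, not signatures.
assert isinstance(NoOpProcessor(), BaseLogitsProcessor)
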
lib/bindings/python/src/dynamo/logits_processing/examples/__init__.py

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from .hello_world import HelloWorldLogitsProcessor
+from .temperature import TemperatureProcessor
+
+__all__ = ["TemperatureProcessor", "HelloWorldLogitsProcessor"]

lib/bindings/python/src/dynamo/logits_processing/examples/hello_world.py

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Sequence
+
+import torch
+from transformers import PreTrainedTokenizerBase
+
+from dynamo.logits_processing import BaseLogitsProcessor
+
+RESPONSE = "Hello world!"
+
+
+class HelloWorldLogitsProcessor(BaseLogitsProcessor):
+    """
+    Sample logits processor that always outputs a hardcoded
+    response (`RESPONSE`), no matter the input.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        self.tokenizer = tokenizer
+        self.token_ids = tokenizer.encode(RESPONSE, add_special_tokens=False)
+        self.eos_id = tokenizer.eos_token_id
+        self.state = 0
+
+    def __call__(self, input_ids: Sequence[int], scores: torch.Tensor) -> torch.Tensor:
+        print("Calling logits processor")
+        mask = torch.full_like(scores, float("-inf"))
+
+        if self.state < len(self.token_ids):
+            token_idx = self.token_ids[self.state]
+        else:
+            token_idx = self.eos_id
+        # Allow only a single token to be output
+        mask[token_idx] = 0.0
+
+        # The `scores` tensor *must* also be modified in-place
+        scores.add_(mask)
+        self.state += 1
+
+        return scores
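
A quick way to sanity-check the example processor in isolation, assuming a Hugging Face tokenizer is available locally ("gpt2" is only a placeholder model name):

# Sketch: drive HelloWorldLogitsProcessor step by step and decode what it forces.
import torch
from transformers import AutoTokenizer

from dynamo.logits_processing.examples import HelloWorldLogitsProcessor

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
proc = HelloWorldLogitsProcessor(tokenizer)

generated = []
for _ in range(len(proc.token_ids)):
    scores = torch.zeros(len(tokenizer))  # fake uniform logits
    scores = proc(generated, scores)
    generated.append(int(scores.argmax()))  # only the allowed token survives the mask

print(tokenizer.decode(generated))  # expected: "Hello world!"
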
lib/bindings/python/src/dynamo/logits_processing/examples/temperature.py

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Sequence
+
+import torch
+
+from dynamo.logits_processing import BaseLogitsProcessor
+
+
+class TemperatureProcessor(BaseLogitsProcessor):
+    """
+    Example logits processor that applies temperature scaling.
+
+    This is a simple demonstration of how to implement a logits processor
+    that can be used with any Dynamo backend.
+    """
+
+    def __init__(self, temperature: float = 1.0):
+        """
+        Args:
+            temperature: Scaling factor. Higher values make the distribution more uniform,
+                lower values make it more peaked. Must be positive.
+        """
+        if temperature <= 0:
+            raise ValueError("Temperature must be positive")
+        self.temperature = temperature
+
+    def __call__(self, input_ids: Sequence[int], logits: torch.Tensor) -> torch.Tensor:
+        """
+        Apply temperature scaling to logits.
+
+        Args:
+            input_ids: Token IDs generated so far (unused in this simple example)
+            logits: Raw logits tensor from the model
+
+        Returns:
+            Temperature-scaled logits tensor
+        """
+        if self.temperature == 1.0:
+            return logits
+        return logits / self.temperature
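
For illustration, a minimal check of how the scaling changes the softmax distribution (the logits values are chosen arbitrarily):

# Sketch: compare softmax distributions before and after temperature scaling.
import torch

from dynamo.logits_processing.examples import TemperatureProcessor

logits = torch.tensor([2.0, 1.0, 0.0])
cold = TemperatureProcessor(temperature=0.5)(input_ids=[], logits=logits)
hot = TemperatureProcessor(temperature=2.0)(input_ids=[], logits=logits)

print(torch.softmax(logits, dim=-1))  # baseline distribution
print(torch.softmax(cold, dim=-1))    # more peaked (lower temperature)
print(torch.softmax(hot, dim=-1))     # more uniform (higher temperature)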
