fix: use the CUDA stream provided by TRT-LLM

bhuvan002 · bhuvan002 · commit 421aa871e0bc · 2025-08-26T04:21:48.000Z
diff --git a/components/backends/trtllm/src/dynamo/trtllm/logits_processing/adapter.py b/components/backends/trtllm/src/dynamo/trtllm/logits_processing/adapter.py
@@ -47,18 +47,19 @@ def __call__(
         Returns:
             Modified logits tensor (in-place modification expected by TRT-LLM)
         """
-        print(f"Shapes: logits {logits.shape}, ids {ids}")
+        stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
         try:
-            for ids_req, logits_req in zip(ids, logits):
-                if logits_req.shape[0] != 1:
-                    raise ValueError(
-                        "Logits processing with beam width > 1 is not supported"
-                    )
-                # Remove dimension 0 from logits_req
-                modified_logits = self.processor(ids_req, logits_req.reshape(-1))
-
-                # TRT-LLM expects in-place modification
-                logits.copy_(modified_logits)
+            with torch.cuda.stream(stream):
+                for idx, (ids_req, logits_req) in enumerate(zip(ids, logits)):
+                    if logits_req.shape[0] != 1:
+                        raise ValueError(
+                            "Logits processing with beam width > 1 is not supported"
+                        )
+                    # Remove dimension 0 from logits_req
+                    modified_logits = self.processor(ids_req, logits_req.reshape(-1))
+
+                    # TRT-LLM expects in-place modification
+                    logits[idx, 0, :].copy_(modified_logits)
 
         except Exception as e:
             logger.error(f"Error in logits processor for request {req_ids}: {e}")
diff --git a/lib/bindings/python/src/dynamo/logits_processing/examples/hello_world.py b/lib/bindings/python/src/dynamo/logits_processing/examples/hello_world.py
@@ -24,7 +24,6 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase):
         self.state = 0
 
     def __call__(self, input_ids: Sequence[int], scores: torch.Tensor) -> torch.Tensor:
-        print("Calling logits processor")
         mask = torch.full_like(scores, float("-inf"))
 
         if self.state < len(self.token_ids):