more fixes

jain-ria · jain-ria · commit 9e929ab11540 · 2025-08-08T13:13:26.000-07:00
diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md
@@ -185,6 +185,7 @@ For comprehensive instructions on multinode serving, see the [multinode-examples
 
 ### Speculative Decoding
 - **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md)**
+- **[Async Speculative Decoding](./async_spec_dec.md)**
 
 ### Kubernetes Deployment
 
diff --git a/components/backends/trtllm/async_spec_dec.md b/components/backends/trtllm/async_spec_dec.md
@@ -0,0 +1,38 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Async Speculative Decoding
+
+This guide demonstrates how to run Draft-Target Model (DTM) speculative decoding asynchronously in Dynamo, where the draft model and target model run as separate Dynamo workers with the TRT-LLM backend.
+
+## Setup
+
+Follow the [Quickstart setup](./README.md#quick-start) instructions. Then, inside the container, run the following example:
+
+```
+cd $DYNAMO_HOME/components/backends/trtllm
+./launch/spec_dec.sh
+```
+
+To scale up the number of drafters:
+
+```
+cd $DYNAMO_HOME/components/backends/trtllm
+export NUM_DRAFTERS=2
+export DRAFTER_CUDA_VISIBLE_DEVICES="1,2"
+./launch/spec_dec.sh
+```
diff --git a/components/backends/trtllm/launch/spec_dec.sh b/components/backends/trtllm/launch/spec_dec.sh
@@ -11,7 +11,7 @@ export VERIFIER_CUDA_VISIBLE_DEVICES=${VERIFIER_CUDA_VISIBLE_DEVICES:-"0"}
 
 # Drafter variables
 export NUM_DRAFTERS=${NUM_DRAFTERS:-1}
-export DRAFTER_MODEL_PATH=${MODEL_PATH:-"meta-llama/Meta-Llama-3.2-1B-Instruct"}
+export DRAFTER_MODEL_PATH=${DRAFTER_MODEL_PATH:-"meta-llama/Meta-Llama-3.2-1B-Instruct"}
 export DRAFTER_MODEL_NAME=${DRAFTER_MODEL_NAME:-"meta-llama/Meta-Llama-3.2-1B-Instruct"}
 export DRAFTER_ENGINE_ARGS=${DRAFTER_ENGINE_ARGS:-"engine_configs/drafter.yaml"}
 export DRAFTER_CUDA_VISIBLE_DEVICES=${DRAFTER_CUDA_VISIBLE_DEVICES:-"1"}
diff --git a/components/backends/trtllm/src/dynamo/trtllm/utils/api_drafter.py b/components/backends/trtllm/src/dynamo/trtllm/utils/api_drafter.py
@@ -0,0 +1,120 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+from typing import Dict, List
+
+from tensorrt_llm._torch.speculative.external_api import APIDrafter
+
+from dynamo.runtime import DistributedRuntime
+from dynamo.runtime.logging import configure_dynamo_logging
+
+configure_dynamo_logging()
+# TODO: remove this; it forces the root logger to WARNING at import time
+logging.getLogger().setLevel(logging.WARNING)
+
+
+class DynamoAPIDrafter(APIDrafter):
+    """
+    Custom Dynamo drafter to support internal Dynamo endpoints instead of only HTTP endpoints.
+    """
+
+    def __init__(self, spec_config, runtime: DistributedRuntime):
+        # NOTE: `runtime` is currently unused; _create_client builds its own runtime.
+        super().__init__(spec_config)
+        self.client = None
+        self.max_draft_len = spec_config.max_draft_len
+        # TODO: allow custom etcd connection info to be set in the spec_config
+        self.connection_info: Dict[str, str] = {}
+
+    async def _create_client(self):
+        """Build `self.client` from the dyn://namespace.component.endpoint address."""
+        try:
+            parts = self.endpoint.removeprefix("dyn://").split(".")
+            if len(parts) != 3:
+                raise ValueError(
+                    f"Invalid Dynamo endpoint format. Received: {self.endpoint}, but expected: dyn://namespace.component.endpoint"
+                )
+            namespace, component, endpoint = parts
+
+            # create minimal runtime for client access only
+            etcd_endpoints = self.connection_info.get(
+                "etcd_endpoints", "localhost:2379"
+            )
+            os.environ.setdefault("ETCD_ENDPOINTS", etcd_endpoints)
+            loop = asyncio.get_running_loop()
+            self.runtime = DistributedRuntime(loop, False)
+
+            self.client = (
+                await self.runtime.namespace(namespace)
+                .component(component)
+                .endpoint(endpoint)
+                .client()
+            )
+        except Exception as e:
+            logging.error(
+                f"Failed to create client for Dynamo endpoint: {self.endpoint} with error: {e}"
+            )
+            raise
+
+    async def get_draft_tokens(
+        self,
+        prefix: list[int],
+        request_id: int,
+        end_id: int,
+        max_sequence_length: int,
+    ) -> List[int]:
+        """
+        Fetch up to `self.max_draft_len` draft token ids for `prefix` from the Dynamo
+        endpoint (client is created on first use). Raises ValueError if not dyn://.
+        """
+        print(f"VERIFIER:  {prefix}\n")
+        if not self.endpoint.startswith("dyn://"):
+            raise ValueError(
+                f"Invalid Dynamo endpoint format. Received: {self.endpoint}, but expected: dyn://namespace.component.endpoint"
+            )
+        request_data = {
+            "token_ids": prefix,
+            "sampling_options": {},
+            "stop_conditions": {"max_tokens": self.max_draft_len},
+        }
+        if self.client is None:
+            await self._create_client()
+
+        draft_tokens: List[int] = []
+        try:
+            if self.client is None:
+                logging.error(
+                    f"Failed to create client for Dynamo endpoint: {self.endpoint}"
+                )
+                return []
+            response = await self.client.round_robin(request_data)
+            async for chunk in response:
+                chunk_data = chunk.data()
+                if chunk_data.get("finish_reason"):
+                    break
+                draft_tokens.extend(chunk_data.get("token_ids", []))
+                if len(draft_tokens) >= self.max_draft_len:
+                    break
+            print(f"DRAFTER:   {draft_tokens}\n")
+            return draft_tokens[: self.max_draft_len]
+        except Exception as e:
+            logging.error(
+                f"Failed to get draft tokens for Dynamo endpoint: {self.endpoint} with error: {e}"
+            )
+            raise
diff --git a/components/backends/trtllm/src/dynamo/trtllm/utils/trtllm_utils.py b/components/backends/trtllm/src/dynamo/trtllm/utils/trtllm_utils.py
@@ -99,14 +99,14 @@ def is_drafter(config):
     """
     Check if the current worker is a drafter worker.
     """
-    return config.component == "drafter"
+    return config.spec_dec_mode == "drafter"
 
 
 def is_verifier(config):
     """
     Check if the current worker is a verifier worker.
     """
-    return config.component == "verifier"
+    return config.spec_dec_mode == "verifier"
 
 
 def parse_endpoint(endpoint: str) -> tuple[str, str, str]: