
Commit f0fcd0b

GuanLuo, oandreeva-nv, and rmccorm4 authored and committed
feat: tensor type for generic inference. (#2746)
Signed-off-by: Guan Luo <gluo@nvidia.com>
Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
Co-authored-by: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
Signed-off-by: Jason Zhou <jasonzho@nvidia.com>
1 parent 33c7171 commit f0fcd0b


18 files changed: +2024 additions, −296 deletions


lib/bindings/python/rust/lib.rs

Lines changed: 6 additions & 0 deletions
@@ -165,6 +165,7 @@ fn register_llm<'p>(
     let model_input = match model_input {
         ModelInput::Text => llm_rs::model_type::ModelInput::Text,
         ModelInput::Tokens => llm_rs::model_type::ModelInput::Tokens,
+        ModelInput::Tensor => llm_rs::model_type::ModelInput::Tensor,
     };

     let model_type_obj = model_type.inner;
@@ -298,6 +299,10 @@ impl ModelType {
     const Embedding: Self = ModelType {
         inner: llm_rs::model_type::ModelType::Embedding,
     };
+    #[classattr]
+    const TensorBased: Self = ModelType {
+        inner: llm_rs::model_type::ModelType::TensorBased,
+    };

     fn __or__(&self, other: &Self) -> Self {
         ModelType {
@@ -315,6 +320,7 @@ impl ModelType {
 enum ModelInput {
     Text = 1,
     Tokens = 2,
+    Tensor = 3,
 }

 #[pymethods]
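
These bindings surface the new variants to Python as `ModelInput.Tensor` and `ModelType.TensorBased`. A minimal sketch of a registration call, assuming only what this diff and the test file below show (the endpoint setup is elided and the model names are placeholders):

from dynamo.llm import ModelInput, ModelType, register_llm


async def register_tensor_model(endpoint):
    # The new variants plug into the existing register_llm call. ModelType
    # values still compose via __or__, e.g. ModelType.Chat | ModelType.TensorBased
    # (a hypothetical combination), since TensorBased is just another flag.
    await register_llm(
        ModelInput.Tensor,      # requests arrive as raw tensors, not text/tokens
        ModelType.TensorBased,  # engine speaks the generic tensor protocol
        endpoint,
        "Qwen/Qwen3-0.6B",      # placeholder model card (see FIXME in the test)
        "tensor",               # name the model is served under
    )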

lib/bindings/python/rust/llm/local_model.rs

Lines changed: 21 additions & 0 deletions
@@ -52,6 +52,27 @@ impl ModelRuntimeConfig {
         Ok(())
     }

+    fn set_tensor_model_config(
+        &mut self,
+        _py: Python<'_>,
+        tensor_model_config: &Bound<'_, PyDict>,
+    ) -> PyResult<()> {
+        let tensor_model_config = pythonize::depythonize(tensor_model_config).map_err(|err| {
+            PyErr::new::<PyException, _>(format!("Failed to convert tensor_model_config: {}", err))
+        })?;
+        self.inner.tensor_model_config = Some(tensor_model_config);
+        Ok(())
+    }
+
+    fn get_tensor_model_config(&self, _py: Python<'_>) -> PyResult<Option<PyObject>> {
+        if let Some(tensor_model_config) = &self.inner.tensor_model_config {
+            let py_obj = pythonize::pythonize(_py, tensor_model_config).map_err(to_pyerr)?;
+            Ok(Some(py_obj.unbind()))
+        } else {
+            Ok(None)
+        }
+    }
+
     #[getter]
     fn total_kv_blocks(&self) -> Option<u64> {
         self.inner.total_kv_blocks
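
Because the setter goes through `pythonize::depythonize` and the getter back through `pythonize`, an arbitrary Python dict round-trips unchanged — the assert in the test file below relies on exactly this. A minimal sketch, reusing the config shape from that test:

from dynamo.llm import ModelRuntimeConfig

runtime_config = ModelRuntimeConfig()
runtime_config.set_tensor_model_config({
    "name": "tensor",
    "inputs": [{"name": "input_text", "data_type": "Bytes", "shape": [-1]}],
    "outputs": [{"name": "output_text", "data_type": "Bytes", "shape": [-1]}],
})

# The dict structure is preserved through the Rust round trip.
assert runtime_config.get_tensor_model_config()["name"] == "tensor"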

lib/bindings/python/src/dynamo/_core.pyi

Lines changed: 2 additions & 2 deletions
@@ -849,11 +849,11 @@ class HttpAsyncEngine:
     ...

 class ModelInput:
-    """What type of request this model needs: Text or Tokens"""
+    """What type of request this model needs: Text, Tokens or Tensor"""
     ...

 class ModelType:
-    """What type of request this model needs: Chat, Completions or Embedding"""
+    """What type of request this model needs: Chat, Completions, Embedding or Tensor"""
     ...

 class RouterMode:
test_tensor.py (new file)

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Usage: `TEST_END_TO_END=1 python test_tensor.py` to run this worker as tensor based echo worker.
+
+import os
+
+import uvloop
+
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
+from dynamo.runtime import DistributedRuntime, dynamo_worker
+
+TEST_END_TO_END = os.environ.get("TEST_END_TO_END", 0)
+
+
+@dynamo_worker(static=False)
+async def test_register(runtime: DistributedRuntime):
+    component = runtime.namespace("test").component("tensor")
+    await component.create_service()
+
+    endpoint = component.endpoint("generate")
+
+    model_config = {
+        "name": "tensor",
+        "inputs": [
+            {"name": "input_text", "data_type": "Bytes", "shape": [-1]},
+            {"name": "custom", "data_type": "Bytes", "shape": [-1]},
+            {"name": "streaming", "data_type": "Bool", "shape": [1]},
+        ],
+        "outputs": [{"name": "output_text", "data_type": "Bytes", "shape": [-1]}],
+    }
+    runtime_config = ModelRuntimeConfig()
+    runtime_config.set_tensor_model_config(model_config)
+
+    assert model_config == runtime_config.get_tensor_model_config()
+
+    # [gluo FIXME] register_llm will attempt to load a LLM model,
+    # which is not well-defined for Tensor yet. Currently provide
+    # a valid model name to pass the registration.
+    await register_llm(
+        ModelInput.Tensor,
+        ModelType.TensorBased,
+        endpoint,
+        "Qwen/Qwen3-0.6B",
+        "tensor",
+        runtime_config=runtime_config,
+    )
+
+    if TEST_END_TO_END:
+        await endpoint.serve_endpoint(generate)
+
+
+async def generate(request, context):
+    print(f"Received request: {request}")
+    # Echo input_text in output_text
+    output_text = None
+    streaming = False
+    for tensor in request["tensors"]:
+        if tensor["metadata"]["name"] == "input_text":
+            input_text_str = "".join(map(chr, tensor["data"]["values"][0]))
+            print(f"Input text: {input_text_str}")
+            output_text = tensor
+            output_text["metadata"]["name"] = "output_text"
+        if tensor["metadata"]["name"] == "streaming":
+            streaming = tensor["data"]["values"][0]
+    if output_text is None:
+        raise ValueError("input_text tensor not found in request")
+    if streaming:
+        for i in range(len(output_text["data"]["values"][0])):
+            chunk = {
+                "model": request["model"],
+                "tensors": [
+                    {
+                        "metadata": output_text["metadata"],
+                        "data": {
+                            "data_type": output_text["data"]["data_type"],
+                            "values": [[output_text["data"]["values"][0][i]]],
+                        },
+                    }
+                ],
+            }
+            yield chunk
+    else:
+        yield {"model": request["model"], "tensors": [output_text]}
+
+
+if __name__ == "__main__":
+    uvloop.run(test_register())
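
Reading `generate()` backwards gives the request shape a client must send: a model name plus a list of tensors, each carrying `metadata` (the tensor name) and `data` (a data_type plus nested values). Below is a hypothetical payload matching the echo worker's schema; how it is submitted to the endpoint is outside this diff:

# "Hi" as byte values in input_text, streaming disabled; the worker echoes
# the same tensor back under the name output_text.
request = {
    "model": "tensor",
    "tensors": [
        {
            "metadata": {"name": "input_text"},
            "data": {"data_type": "Bytes", "values": [[ord(c) for c in "Hi"]]},
        },
        {
            "metadata": {"name": "streaming"},
            "data": {"data_type": "Bool", "values": [False]},
        },
    ],
}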

lib/llm/src/discovery/model_manager.rs

Lines changed: 33 additions & 0 deletions
@@ -15,6 +15,7 @@ use crate::discovery::{KV_ROUTERS_ROOT_PATH, ModelEntry};
 use crate::kv_router::{KvRouterConfig, scheduler::DefaultWorkerSelector};
 use crate::{
     kv_router::KvRouter,
+    types::generic::tensor::TensorStreamingEngine,
     types::openai::{
         chat_completions::OpenAIChatCompletionsStreamingEngine,
         completions::OpenAICompletionsStreamingEngine, embeddings::OpenAIEmbeddingsStreamingEngine,
@@ -36,6 +37,7 @@ pub struct ModelManager {
     completion_engines: RwLock<ModelEngines<OpenAICompletionsStreamingEngine>>,
     chat_completion_engines: RwLock<ModelEngines<OpenAIChatCompletionsStreamingEngine>>,
     embeddings_engines: RwLock<ModelEngines<OpenAIEmbeddingsStreamingEngine>>,
+    tensor_engines: RwLock<ModelEngines<TensorStreamingEngine>>,

     // These two are Mutex because we read and write rarely and equally
     entries: Mutex<HashMap<String, ModelEntry>>,
@@ -54,6 +56,7 @@ impl ModelManager {
             completion_engines: RwLock::new(ModelEngines::default()),
             chat_completion_engines: RwLock::new(ModelEngines::default()),
             embeddings_engines: RwLock::new(ModelEngines::default()),
+            tensor_engines: RwLock::new(ModelEngines::default()),
             entries: Mutex::new(HashMap::new()),
             kv_choosers: Mutex::new(HashMap::new()),
         }
@@ -73,6 +76,7 @@ impl ModelManager {
             .into_iter()
             .chain(self.list_completions_models())
             .chain(self.list_embeddings_models())
+            .chain(self.list_tensor_models())
             .collect()
     }

@@ -88,6 +92,10 @@ impl ModelManager {
         self.embeddings_engines.read().list()
     }

+    pub fn list_tensor_models(&self) -> Vec<String> {
+        self.tensor_engines.read().list()
+    }
+
     pub fn add_completions_model(
         &self,
         model: &str,
@@ -115,6 +123,15 @@ impl ModelManager {
         clients.add(model, engine)
     }

+    pub fn add_tensor_model(
+        &self,
+        model: &str,
+        engine: TensorStreamingEngine,
+    ) -> Result<(), ModelManagerError> {
+        let mut clients = self.tensor_engines.write();
+        clients.add(model, engine)
+    }
+
     pub fn remove_completions_model(&self, model: &str) -> Result<(), ModelManagerError> {
         let mut clients = self.completion_engines.write();
         clients.remove(model)
@@ -130,6 +147,11 @@ impl ModelManager {
         clients.remove(model)
     }

+    pub fn remove_tensor_model(&self, model: &str) -> Result<(), ModelManagerError> {
+        let mut clients = self.tensor_engines.write();
+        clients.remove(model)
+    }
+
     pub fn get_embeddings_engine(
         &self,
         model: &str,
@@ -163,6 +185,17 @@ impl ModelManager {
             .ok_or(ModelManagerError::ModelNotFound(model.to_string()))
     }

+    pub fn get_tensor_engine(
+        &self,
+        model: &str,
+    ) -> Result<TensorStreamingEngine, ModelManagerError> {
+        self.tensor_engines
+            .read()
+            .get(model)
+            .cloned()
+            .ok_or(ModelManagerError::ModelNotFound(model.to_string()))
+    }
+
     /// Save a ModelEntry under an instance's etcd `models/` key so we can fetch it later when the key is
     /// deleted from etcd.
     pub fn save_model_entry(&self, key: &str, entry: ModelEntry) {

lib/llm/src/discovery/watcher.rs

Lines changed: 31 additions & 5 deletions
@@ -33,6 +33,7 @@ use crate::{
             completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
             embeddings::{NvCreateEmbeddingRequest, NvCreateEmbeddingResponse},
         },
+        tensor::{NvCreateTensorRequest, NvCreateTensorResponse},
     },
 };

@@ -59,6 +60,7 @@ const ALL_MODEL_TYPES: &[ModelType] = &[
     ModelType::Chat,
     ModelType::Completions,
     ModelType::Embedding,
+    ModelType::TensorBased,
 ];

 impl ModelWatcher {
@@ -213,10 +215,12 @@ impl ModelWatcher {
         let chat_model_remove_err = self.manager.remove_chat_completions_model(&model_name);
         let completions_model_remove_err = self.manager.remove_completions_model(&model_name);
         let embeddings_model_remove_err = self.manager.remove_embeddings_model(&model_name);
+        let tensor_model_remove_err = self.manager.remove_tensor_model(&model_name);

         let mut chat_model_removed = false;
         let mut completions_model_removed = false;
         let mut embeddings_model_removed = false;
+        let mut tensor_model_removed = false;

         if chat_model_remove_err.is_ok() && self.manager.list_chat_completions_models().is_empty() {
             chat_model_removed = true;
@@ -228,20 +232,29 @@ impl ModelWatcher {
         if embeddings_model_remove_err.is_ok() && self.manager.list_embeddings_models().is_empty() {
             embeddings_model_removed = true;
         }
+        if tensor_model_remove_err.is_ok() && self.manager.list_tensor_models().is_empty() {
+            tensor_model_removed = true;
+        }

-        if !chat_model_removed && !completions_model_removed && !embeddings_model_removed {
+        if !chat_model_removed
+            && !completions_model_removed
+            && !embeddings_model_removed
+            && !tensor_model_removed
+        {
             tracing::debug!(
-                "No updates to send for model {}: chat_model_removed: {}, completions_model_removed: {}, embeddings_model_removed: {}",
+                "No updates to send for model {}: chat_model_removed: {}, completions_model_removed: {}, embeddings_model_removed: {}, tensor_model_removed: {}",
                 model_name,
                 chat_model_removed,
                 completions_model_removed,
-                embeddings_model_removed
+                embeddings_model_removed,
+                tensor_model_removed
             );
         } else {
             for model_type in ALL_MODEL_TYPES {
                 if ((chat_model_removed && *model_type == ModelType::Chat)
                     || (completions_model_removed && *model_type == ModelType::Completions)
-                    || (embeddings_model_removed && *model_type == ModelType::Embedding))
+                    || (embeddings_model_removed && *model_type == ModelType::Embedding)
+                    || (tensor_model_removed && *model_type == ModelType::TensorBased))
                     && let Some(tx) = &self.model_update_tx
                 {
                     tx.send(ModelUpdate::Removed(*model_type)).await.ok();
@@ -421,11 +434,24 @@ impl ModelWatcher {

             self.manager
                 .add_embeddings_model(&model_entry.name, embedding_engine)?;
+        } else if model_entry.model_input == ModelInput::Tensor
+            && model_entry.model_type.supports_tensor()
+        {
+            // Case 5: Tensor + Tensor (non-LLM)
+            let push_router = PushRouter::<
+                NvCreateTensorRequest,
+                Annotated<NvCreateTensorResponse>,
+            >::from_client_with_threshold(
+                client, self.router_mode, self.busy_threshold
+            )
+            .await?;
+            let engine = Arc::new(push_router);
+            self.manager.add_tensor_model(&model_entry.name, engine)?;
         } else {
             // Reject unsupported combinations
             anyhow::bail!(
                 "Unsupported model configuration: {} with {} input. Supported combinations: \
-                Tokens+(Chat|Completions), Text+Chat, Text+Completions, Tokens+Embeddings",
+                Tokens+(Chat|Completions), Text+Chat, Text+Completions, Tokens+Embeddings, Tensor+TensorBased",
                 model_entry.model_type,
                 model_entry.model_input.as_str()
             );
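
The new branch (Case 5) only fires when a discovered ModelEntry pairs ModelInput::Tensor with a model type whose supports_tensor() returns true; every other pairing now falls through to the extended bail! message. Seen from the Python bindings, a sketch of what passes and what the frontend's watcher rejects (endpoint setup elided; the rejection happens at discovery time, not at the register_llm call itself):

from dynamo.llm import ModelInput, ModelType, register_llm


async def examples(endpoint):
    # Accepted: matches Case 5 (Tensor input + TensorBased type).
    await register_llm(ModelInput.Tensor, ModelType.TensorBased, endpoint,
                       "Qwen/Qwen3-0.6B", "tensor")

    # Registers locally, but the frontend's watcher rejects the entry on
    # discovery: Text input has no tensor engine path, so it bails with
    # "Unsupported model configuration: ...".
    await register_llm(ModelInput.Text, ModelType.TensorBased, endpoint,
                       "Qwen/Qwen3-0.6B", "tensor")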

lib/llm/src/grpc/service.rs

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@

 pub mod kserve;
 pub mod openai;
+pub mod tensor;
