
Commit 4d54532

oandreeva-nv and GuanLuo authored and committed
refactor: Split ModelType to ModelInput for request and response type; ModelType for the supported workloads (#2714)
Signed-off-by: Guan Luo <gluo@nvidia.com>
Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
Co-authored-by: Guan Luo <gluo@nvidia.com>
Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
1 parent 4b8ae85 commit 4d54532
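
In practice the split touches every `register_llm` call site, as the per-file diffs below show: the single `ModelType` argument becomes a `ModelInput` (how the engine receives requests) plus a `ModelType` (which OpenAI-style workloads it serves, combinable with `|`). A minimal before/after sketch of the call-site change; `endpoint` and `model_path` are placeholders for whatever the worker already holds:

```python
from dynamo.llm import ModelInput, ModelType, register_llm

# Old API (before this commit): one enum value covered both the request
# format and the workload.
#   await register_llm(ModelType.Backend, endpoint, model_path)

# New API (this commit): request format and supported workloads are separate
# arguments. This call belongs inside an async worker; `endpoint` and
# `model_path` are placeholders.
await register_llm(
    ModelInput.Tokens,                       # engine consumes pre-tokenized requests
    ModelType.Chat | ModelType.Completions,  # serve both chat and completions
    endpoint,
    model_path,
)
```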

File tree

29 files changed: +522 −208 lines changed


Cargo.lock

Lines changed: 4 additions & 0 deletions
Generated lockfile; diff not rendered.

components/backends/llama_cpp/src/dynamo/llama_cpp/main.py

Lines changed: 3 additions & 3 deletions

@@ -11,7 +11,7 @@
 import uvloop
 from llama_cpp import Llama

-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -41,10 +41,10 @@ async def worker(runtime: DistributedRuntime):
     component = runtime.namespace(config.namespace).component(config.component)
     await component.create_service()

-    model_type = ModelType.Chat # llama.cpp does the pre-processing
     endpoint = component.endpoint(config.endpoint)
     await register_llm(
-        model_type,
+        ModelInput.Tokens,
+        ModelType.Chat,
         endpoint,
         config.model_path,
         config.model_name,

components/backends/sglang/src/dynamo/sglang/register.py

Lines changed: 3 additions & 2 deletions

@@ -8,7 +8,7 @@
 from sglang.srt.server_args import ServerArgs

 from dynamo._core import Endpoint
-from dynamo.llm import ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.sglang.args import DynamoArgs


@@ -26,7 +26,8 @@ async def register_llm_with_runtime_config(
     runtime_config = await _get_runtime_config(engine, dynamo_args)
     try:
         await register_llm(
-            ModelType.Backend,
+            ModelInput.Tokens,
+            ModelType.Chat | ModelType.Completions,
             endpoint,
             server_args.model_path,
             server_args.served_model_name,

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 6 additions & 4 deletions

@@ -22,7 +22,7 @@
 from transformers import AutoConfig

 import dynamo.nixl_connect as nixl_connect
-from dynamo.llm import ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine
@@ -223,7 +223,8 @@ async def init(runtime: DistributedRuntime, config: Config):
     default_sampling_params = SamplingParams()
     default_sampling_params._setup(tokenizer)
     default_sampling_params.stop = None
-    modelType = ModelType.Backend
+    model_input = ModelInput.Tokens
+    model_type = ModelType.Chat | ModelType.Completions
     multimodal_processor = None

     if os.getenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR") == "1":
@@ -234,7 +235,7 @@

     if modality == "multimodal":
         engine_args["skip_tokenizer_init"] = False
-        modelType = ModelType.Chat
+        model_input = ModelInput.Text
         model_config = AutoConfig.from_pretrained(
             config.model_path, trust_remote_code=True
         )
@@ -292,7 +293,8 @@
     if is_first_worker(config):
        # Register the model with runtime config
        await register_llm(
-            modelType,
+            model_input,
+            model_type,
            endpoint,
            config.model_path,
            config.served_model_name,

components/backends/vllm/src/dynamo/vllm/main.py

Lines changed: 3 additions & 1 deletion

@@ -12,6 +12,7 @@
 from vllm.v1.engine.async_llm import AsyncLLM

 from dynamo.llm import (
+    ModelInput,
     ModelRuntimeConfig,
     ModelType,
     ZmqKvEventPublisher,
@@ -251,7 +252,8 @@
     runtime_config.reasoning_parser = config.reasoning_parser

     await register_llm(
-        ModelType.Backend,
+        ModelInput.Tokens,
+        ModelType.Chat | ModelType.Completions,
         generate_endpoint,
         config.model,
         config.served_model_name,

docs/guides/backend.md

Lines changed: 10 additions & 6 deletions

@@ -16,7 +16,7 @@ The Python file must do three things:
 3. Attach a request handler

 ```
-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 # 1. Decorate a function to get the runtime
@@ -29,10 +29,11 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
     component = runtime.namespace("namespace").component("component")
     await component.create_service()
     model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
-    model_type = ModelType.Backend
+    model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
+    model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
     endpoint = component.endpoint("endpoint")
     # Optional last param to register_llm is model_name. If not present derives it from model_path
-    await register_llm(model_type, endpoint, model_path)
+    await register_llm(model_input, model_type, endpoint, model_path)

     # Initialize your engine here
     # engine = ...
@@ -62,10 +63,13 @@ The `model_path` can be:
 - The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
 - The path to a GGUF file, if your engine supports that.

+The `model_input` can be:
+- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing.
+- ModelInput.Text. Your engine expects raw text input and handles its own tokenization and pre-processing.
+
 The `model_type` can be:
-- ModelType.Backend. Dynamo handles pre-processing. Your `generate` method receives a `request` dict containing a `token_ids` array of int. It must return a dict also containing a `token_ids` array and an optional `finish_reason` string.
-- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). Your engine handles pre-processing.
-- ModelType.Completion. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). Your engine handles pre-processing.
+- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
+- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).

 `register_llm` can also take the following kwargs:
 - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, the folder name, or the GGUF file name.
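
The guide above describes the `ModelInput.Tokens` contract in prose; a hedged sketch of the handler shape it implies follows. This is not part of the commit: the class name is invented, the async-generator style mirrors the repo's hello_world examples, and the body merely echoes the prompt where a real engine would generate new tokens.

```python
# Illustrative only: a token-in/token-out handler for a ModelInput.Tokens
# registration. Dynamo delivers pre-processed token IDs; each response dict
# carries token IDs back plus an optional finish_reason.
class RequestHandler:
    async def generate(self, request):
        token_ids = request["token_ids"]  # tokenized by Dynamo's pre-processor
        # A real engine would run inference here and stream generated tokens.
        yield {"token_ids": token_ids, "finish_reason": "stop"}
```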

docs/guides/dynamo_run.md

Lines changed: 10 additions & 6 deletions

@@ -389,7 +389,7 @@ The Python file must do three things:
 3. Attach a request handler

 ```
-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 # 1. Decorate a function to get the runtime
@@ -402,10 +402,11 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
     component = runtime.namespace("namespace").component("component")
     await component.create_service()
     model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
-    model_type = ModelType.Backend
+    model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
+    model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
     endpoint = component.endpoint("endpoint")
     # Optional last param to register_llm is model_name. If not present derives it from model_path
-    await register_llm(model_type, endpoint, model_path)
+    await register_llm(model_input, model_type, endpoint, model_path)

     # Initialize your engine here
     # engine = ...
@@ -435,10 +436,13 @@ The `model_path` can be:
 - The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
 - The path to a GGUF file, if your engine supports that.

+The `model_input` can be:
+- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing.
+- ModelInput.Text. Your engine expects raw text input and handles its own tokenization and pre-processing.
+
 The `model_type` can be:
-- ModelType.Backend. Dynamo handles pre-processing. Your `generate` method receives a `request` dict containing a `token_ids` array of int. It must return a dict also containing a `token_ids` array and an optional `finish_reason` string.
-- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). Your engine handles pre-processing.
-- ModelType.Completion. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). Your engine handles pre-processing.
+- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
+- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).

 `register_llm` can also take the following kwargs:
 - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, the folder name, or the GGUF file name.
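
For the opposite case (`ModelInput.Text` with `ModelType.Chat`, the combination the multimodal processor below switches to), the engine does its own pre-processing and must answer with an OpenAI-style chat completion dict. A hedged sketch of that response shape; the handler name and field values are illustrative, and the exact streaming/chunking expectations depend on how the runtime is wired:

```python
# Illustrative only: a text-in handler for a ModelInput.Text + ModelType.Chat
# registration, answering with a minimal OpenAI-style chat completion dict.
class ChatRequestHandler:
    async def generate(self, request):
        # `request` follows the OpenAI chat request shape; a real engine would
        # apply its own chat template and tokenizer here.
        last_user_message = request["messages"][-1]["content"]
        yield {
            "id": "chatcmpl-sketch",
            "object": "chat.completion",
            "model": request.get("model", "placeholder-model"),
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": f"Echo: {last_user_message}"},
                    "finish_reason": "stop",
                }
            ],
        }
```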

examples/multimodal/components/processor.py

Lines changed: 3 additions & 2 deletions

@@ -32,7 +32,7 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import FlexibleArgumentParser

-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -321,7 +321,8 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co

     # Register the endpoint as entrypoint to a model
     await register_llm(
-        ModelType.Chat, # Custom processor is used and this type bypasses SDK processor
+        ModelInput.Text, # Custom processor is used and this type bypasses SDK processor
+        ModelType.Chat,
         generate_endpoint,
         config.model,
         config.served_model_name,

lib/bindings/python/Cargo.lock

Lines changed: 4 additions & 0 deletions
Generated lockfile; diff not rendered.

lib/bindings/python/examples/hello_world/server_sglang.py

Lines changed: 8 additions & 3 deletions

@@ -8,7 +8,7 @@
 # request via NATS to this python script, which runs sglang.
 #
 # The key differences between this and `server_sglang_tok.py` are:
-# - The `register_llm` function registers us a `Backend` model
+# - The `register_llm` function registers us a `Chat` and `Completions` model that accepts `Tokens` input
 # - The `generate` function receives a pre-tokenized request and must return token_ids in the response.
 #
 # Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
@@ -27,7 +27,7 @@
 import uvloop
 from sglang.srt.server_args import ServerArgs

-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
@@ -91,7 +91,12 @@ async def init(runtime: DistributedRuntime, config: Config):
     await component.create_service()

     endpoint = component.endpoint(config.endpoint)
-    await register_llm(ModelType.Backend, endpoint, config.model)
+    await register_llm(
+        ModelInput.Tokens,
+        ModelType.Chat | ModelType.Completions,
+        endpoint,
+        config.model,
+    )

     engine_args = ServerArgs(
         model_path=config.model,
