
Commit 4d54532

oandreeva-nv and GuanLuo authored and committed
refactor: Split ModelType to ModelInput for request and response type; ModelType for the supported workloads (#2714)
Signed-off-by: Guan Luo <gluo@nvidia.com>
Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
Co-authored-by: Guan Luo <gluo@nvidia.com>
Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
1 parent 4b8ae85 commit 4d54532
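
In practice the split touches every `register_llm` call site, as the per-file diffs below show: the single `ModelType` argument becomes a `ModelInput` (how the engine receives requests) plus a `ModelType` (which OpenAI-style workloads it serves, combinable with `|`). A minimal before/after sketch of the call-site change; `endpoint` and `model_path` are placeholders for whatever the worker already holds:

```python
from dynamo.llm import ModelInput, ModelType, register_llm

# Old API (before this commit): one enum value covered both the request
# format and the workload.
#   await register_llm(ModelType.Backend, endpoint, model_path)

# New API (this commit): request format and supported workloads are separate
# arguments. This call belongs inside an async worker; `endpoint` and
# `model_path` are placeholders.
await register_llm(
    ModelInput.Tokens,                       # engine consumes pre-tokenized requests
    ModelType.Chat | ModelType.Completions,  # serve both chat and completions
    endpoint,
    model_path,
)
```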

File tree

29 files changed: +522 −208 lines changed


Cargo.lock

Lines changed: 4 additions & 0 deletions
Generated lockfile; diff not rendered.

components/backends/llama_cpp/src/dynamo/llama_cpp/main.py

Lines changed: 3 additions & 3 deletions

@@ -11,7 +11,7 @@
 import uvloop
 from llama_cpp import Llama

-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -41,10 +41,10 @@ async def worker(runtime: DistributedRuntime):
     component = runtime.namespace(config.namespace).component(config.component)
     await component.create_service()

-    model_type = ModelType.Chat # llama.cpp does the pre-processing
     endpoint = component.endpoint(config.endpoint)
     await register_llm(
-        model_type,
+        ModelInput.Tokens,
+        ModelType.Chat,
         endpoint,
         config.model_path,
         config.model_name,

components/backends/sglang/src/dynamo/sglang/register.py

Lines changed: 3 additions & 2 deletions

@@ -8,7 +8,7 @@
 from sglang.srt.server_args import ServerArgs

 from dynamo._core import Endpoint
-from dynamo.llm import ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.sglang.args import DynamoArgs


@@ -26,7 +26,8 @@ async def register_llm_with_runtime_config(
     runtime_config = await _get_runtime_config(engine, dynamo_args)
     try:
         await register_llm(
-            ModelType.Backend,
+            ModelInput.Tokens,
+            ModelType.Chat | ModelType.Completions,
             endpoint,
             server_args.model_path,
             server_args.served_model_name,

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 6 additions & 4 deletions

@@ -22,7 +22,7 @@
 from transformers import AutoConfig

 import dynamo.nixl_connect as nixl_connect
-from dynamo.llm import ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine
@@ -223,7 +223,8 @@ async def init(runtime: DistributedRuntime, config: Config):
     default_sampling_params = SamplingParams()
     default_sampling_params._setup(tokenizer)
     default_sampling_params.stop = None
-    modelType = ModelType.Backend
+    model_input = ModelInput.Tokens
+    model_type = ModelType.Chat | ModelType.Completions
     multimodal_processor = None

     if os.getenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR") == "1":
@@ -234,7 +235,7 @@

     if modality == "multimodal":
         engine_args["skip_tokenizer_init"] = False
-        modelType = ModelType.Chat
+        model_input = ModelInput.Text
         model_config = AutoConfig.from_pretrained(
             config.model_path, trust_remote_code=True
         )
@@ -292,7 +293,8 @@
     if is_first_worker(config):
        # Register the model with runtime config
        await register_llm(
-            modelType,
+            model_input,
+            model_type,
            endpoint,
            config.model_path,
            config.served_model_name,

components/backends/vllm/src/dynamo/vllm/main.py

Lines changed: 3 additions & 1 deletion

@@ -12,6 +12,7 @@
 from vllm.v1.engine.async_llm import AsyncLLM

 from dynamo.llm import (
+    ModelInput,
     ModelRuntimeConfig,
     ModelType,
     ZmqKvEventPublisher,
@@ -251,7 +252,8 @@
     runtime_config.reasoning_parser = config.reasoning_parser

     await register_llm(
-        ModelType.Backend,
+        ModelInput.Tokens,
+        ModelType.Chat | ModelType.Completions,
         generate_endpoint,
         config.model,
         config.served_model_name,

docs/guides/backend.md

Lines changed: 10 additions & 6 deletions

@@ -16,7 +16,7 @@ The Python file must do three things:
 3. Attach a request handler

 ```
-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 # 1. Decorate a function to get the runtime
@@ -29,10 +29,11 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
     component = runtime.namespace("namespace").component("component")
     await component.create_service()
     model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
-    model_type = ModelType.Backend
+    model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
+    model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
     endpoint = component.endpoint("endpoint")
     # Optional last param to register_llm is model_name. If not present derives it from model_path
-    await register_llm(model_type, endpoint, model_path)
+    await register_llm(model_input, model_type, endpoint, model_path)

     # Initialize your engine here
     # engine = ...
@@ -62,10 +63,13 @@ The `model_path` can be:
 - The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
 - The path to a GGUF file, if your engine supports that.

+The `model_input` can be:
+- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing.
+- ModelInput.Text. Your engine expects raw text input and handles its own tokenization and pre-processing.
+
 The `model_type` can be:
-- ModelType.Backend. Dynamo handles pre-processing. Your `generate` method receives a `request` dict containing a `token_ids` array of int. It must return a dict also containing a `token_ids` array and an optional `finish_reason` string.
-- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). Your engine handles pre-processing.
-- ModelType.Completion. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). Your engine handles pre-processing.
+- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
+- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).

 `register_llm` can also take the following kwargs:
 - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, the folder name, or the GGUF file name.
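
The guide above describes the `ModelInput.Tokens` contract in prose; a hedged sketch of the handler shape it implies follows. This is not part of the commit: the class name is invented, the async-generator style mirrors the repo's hello_world examples, and the body merely echoes the prompt where a real engine would generate new tokens.

```python
# Illustrative only: a token-in/token-out handler for a ModelInput.Tokens
# registration. Dynamo delivers pre-processed token IDs; each response dict
# carries token IDs back plus an optional finish_reason.
class RequestHandler:
    async def generate(self, request):
        token_ids = request["token_ids"]  # tokenized by Dynamo's pre-processor
        # A real engine would run inference here and stream generated tokens.
        yield {"token_ids": token_ids, "finish_reason": "stop"}
```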

docs/guides/dynamo_run.md

Lines changed: 10 additions & 6 deletions

@@ -389,7 +389,7 @@ The Python file must do three things:
 3. Attach a request handler

 ```
-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 # 1. Decorate a function to get the runtime
@@ -402,10 +402,11 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
     component = runtime.namespace("namespace").component("component")
     await component.create_service()
     model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
-    model_type = ModelType.Backend
+    model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
+    model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
     endpoint = component.endpoint("endpoint")
     # Optional last param to register_llm is model_name. If not present derives it from model_path
-    await register_llm(model_type, endpoint, model_path)
+    await register_llm(model_input, model_type, endpoint, model_path)

     # Initialize your engine here
     # engine = ...
@@ -435,10 +436,13 @@ The `model_path` can be:
 - The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
 - The path to a GGUF file, if your engine supports that.

+The `model_input` can be:
+- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing.
+- ModelInput.Text. Your engine expects raw text input and handles its own tokenization and pre-processing.
+
 The `model_type` can be:
-- ModelType.Backend. Dynamo handles pre-processing. Your `generate` method receives a `request` dict containing a `token_ids` array of int. It must return a dict also containing a `token_ids` array and an optional `finish_reason` string.
-- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). Your engine handles pre-processing.
-- ModelType.Completion. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). Your engine handles pre-processing.
+- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
+- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).

 `register_llm` can also take the following kwargs:
 - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, the folder name, or the GGUF file name.
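
For the opposite case (`ModelInput.Text` with `ModelType.Chat`, the combination the multimodal processor below switches to), the engine does its own pre-processing and must answer with an OpenAI-style chat completion dict. A hedged sketch of that response shape; the handler name and field values are illustrative, and the exact streaming/chunking expectations depend on how the runtime is wired:

```python
# Illustrative only: a text-in handler for a ModelInput.Text + ModelType.Chat
# registration, answering with a minimal OpenAI-style chat completion dict.
class ChatRequestHandler:
    async def generate(self, request):
        # `request` follows the OpenAI chat request shape; a real engine would
        # apply its own chat template and tokenizer here.
        last_user_message = request["messages"][-1]["content"]
        yield {
            "id": "chatcmpl-sketch",
            "object": "chat.completion",
            "model": request.get("model", "placeholder-model"),
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": f"Echo: {last_user_message}"},
                    "finish_reason": "stop",
                }
            ],
        }
```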

examples/multimodal/components/processor.py

Lines changed: 3 additions & 2 deletions

@@ -32,7 +32,7 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import FlexibleArgumentParser

-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -321,7 +321,8 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co

     # Register the endpoint as entrypoint to a model
     await register_llm(
-        ModelType.Chat, # Custom processor is used and this type bypasses SDK processor
+        ModelInput.Text, # Custom processor is used and this type bypasses SDK processor
+        ModelType.Chat,
         generate_endpoint,
         config.model,
         config.served_model_name,

lib/bindings/python/Cargo.lock

Lines changed: 4 additions & 0 deletions
Generated lockfile; diff not rendered.

lib/bindings/python/examples/hello_world/server_sglang.py

Lines changed: 8 additions & 3 deletions

@@ -8,7 +8,7 @@
 # request via NATS to this python script, which runs sglang.
 #
 # The key differences between this and `server_sglang_tok.py` are:
-# - The `register_llm` function registers us a `Backend` model
+# - The `register_llm` function registers us a `Chat` and `Completions` model that accepts `Tokens` input
 # - The `generate` function receives a pre-tokenized request and must return token_ids in the response.
 #
 # Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
@@ -27,7 +27,7 @@
 import uvloop
 from sglang.srt.server_args import ServerArgs

-from dynamo.llm import ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
@@ -91,7 +91,12 @@ async def init(runtime: DistributedRuntime, config: Config):
     await component.create_service()

     endpoint = component.endpoint(config.endpoint)
-    await register_llm(ModelType.Backend, endpoint, config.model)
+    await register_llm(
+        ModelInput.Tokens,
+        ModelType.Chat | ModelType.Completions,
+        endpoint,
+        config.model,
+    )

     engine_args = ServerArgs(
         model_path=config.model,
