25 changes: 25 additions & 0 deletions components/backends/mocker/README.md
@@ -0,0 +1,25 @@
# Mocker engine

The mocker engine is a mock vLLM implementation designed for testing and development. It simulates realistic token-generation timing without running actual model inference, making it useful for:

- Testing distributed system components without GPU resources
- Benchmarking infrastructure and networking overhead
- Developing and debugging Dynamo components
- Load testing and performance analysis

**Basic usage:**

The `--model-path` is required but can point to any valid model path; the mocker never loads the model weights, though the pre-processor still needs the tokenizer. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are shared with the real vLLM engine.

The following arguments are mocker-specific:
- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulated engines run faster.
- `dp_size`: Number of data parallel workers to simulate (default: 1).
- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists in the real vLLM engine but cannot be passed as an engine arg.
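
For reference, a fuller `mocker_args.json` covering these knobs might look like the sketch below. The values are purely illustrative, and the snake_case keys are assumed to mirror the argument names above:

```json
{
  "speedup_ratio": 10.0,
  "dp_size": 2,
  "watermark": 0.01,
  "block_size": 16,
  "num_gpu_blocks": 8192,
  "max_num_seqs": 256,
  "max_num_batched_tokens": 8192,
  "enable_prefix_caching": true
}
```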

>[!NOTE]
>Currently, `enable_chunked_prefill` is always assumed to be false, mirroring vLLM v0 behavior. This is also the current behavior in `examples/llm`, and it will be updated in the near future as we move to support vLLM v1 (and deprecate support for vLLM v0).

```bash
echo '{"speedup_ratio": 10.0}' > mocker_args.json
python -m dynamo.mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
python -m dynamo.frontend --http-port 8080
```
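
With both processes running, a quick smoke test against the frontend might look like this (a sketch assuming the frontend exposes the standard OpenAI-compatible `/v1/chat/completions` route on the port above):

```bash
# Illustrative request; adjust the model name to match what the frontend registers.
curl -s localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 32
      }'
```
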
Empty file.
7 changes: 7 additions & 0 deletions components/backends/mocker/src/dynamo/mocker/__main__.py
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from dynamo.mocker.main import main

if __name__ == "__main__":
main()
76 changes: 76 additions & 0 deletions components/backends/mocker/src/dynamo/mocker/main.py
@@ -0,0 +1,76 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Usage: `python -m dynamo.mocker --model-path /data/models/Qwen3-0.6B-Q8_0.gguf --extra-engine-args args.json`

import argparse
from pathlib import Path

import uvloop

from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging

DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"

configure_dynamo_logging()


@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
args = cmd_line_args()

# Create engine configuration
entrypoint_args = EntrypointArgs(
engine_type=EngineType.Mocker,
model_path=args.model_path,
model_name=args.model_name,
endpoint_id=args.endpoint,
extra_engine_args=args.extra_engine_args,
)

# Create and run the engine
# NOTE: only supports dyn endpoint for now
engine_config = await make_engine(runtime, entrypoint_args)
await run_input(runtime, args.endpoint, engine_config)


def cmd_line_args():
parser = argparse.ArgumentParser(
description="Mocker engine for testing Dynamo LLM infrastructure.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--model-path",
type=str,
help="Path to model directory or HuggingFace model ID for tokenizer",
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string (default: {DEFAULT_ENDPOINT})",
)
parser.add_argument(
"--model-name",
type=str,
default=None,
help="Model name for API responses (default: mocker-engine)",
)
parser.add_argument(
"--extra-engine-args",
type=Path,
help="Path to JSON file with mocker configuration "
"(num_gpu_blocks, speedup_ratio, etc.)",
)

return parser.parse_args()


def main():
uvloop.run(worker())


if __name__ == "__main__":
main()
26 changes: 26 additions & 0 deletions docs/guides/dynamo_run.md
@@ -538,6 +538,32 @@ The output looks like this:
{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
```

#### Mocker engine

The mocker engine is a mock vLLM implementation designed for testing and development. It simulates realistic token-generation timing without running actual model inference, making it useful for:

- Testing distributed system components without GPU resources
- Benchmarking infrastructure and networking overhead
- Developing and debugging Dynamo components
- Load testing and performance analysis

**Basic usage:**

The `--model-path` is required but can point to any valid model path; the mocker never loads the model weights, though the pre-processor still needs the tokenizer. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are shared with the real vLLM engine.

The following arguments are mocker-specific:
- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulated engines run faster.
- `dp_size`: Number of data parallel workers to simulate (default: 1).
- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists in the real vLLM engine but cannot be passed as an engine arg.
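
As with the mocker README, a fuller `mocker_args.json` might look like this sketch (illustrative values; snake_case keys assumed to mirror the argument names above):

```json
{
  "speedup_ratio": 10.0,
  "dp_size": 2,
  "watermark": 0.01,
  "num_gpu_blocks": 8192,
  "enable_prefix_caching": true
}
```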

>[!NOTE]
>Currently, `enable_chunked_prefill` is always assumed to be false, mirroring vLLM v0 behavior. This is also the current behavior in `examples/llm`, and it will be updated in the near future as we move to support vLLM v1 (and deprecate support for vLLM v0).

```bash
echo '{"speedup_ratio": 10.0}' > mocker_args.json
dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
dynamo-run in=http out=dyn --router-mode kv
```
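
Once both processes are up, you can sanity-check that the mocker registered with the frontend. Assuming the HTTP frontend serves the usual OpenAI-compatible routes on its default port (8080 unless overridden), something like:

```bash
# Hypothetical smoke test: list the models the frontend knows about.
curl -s localhost:8080/v1/models
```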

### Extra engine arguments
The vllm and sglang backends support passing any argument the engine accepts.
Put the arguments in a JSON file:
13 changes: 13 additions & 0 deletions launch/dynamo-run/src/flags.rs
@@ -20,6 +20,7 @@ use clap::ValueEnum;
use dynamo_llm::entrypoint::RouterConfig;
use dynamo_llm::kv_router::KvRouterConfig;
use dynamo_llm::local_model::LocalModel;
use dynamo_llm::mocker::protocols::MockEngineArgs;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;

use crate::Output;
@@ -212,6 +213,9 @@ impl Flags {
anyhow::bail!("--model-path should refer to a GGUF file. llama_cpp does not support safetensors.");
}
}
Output::Mocker => {
// nothing to check here
}
}
Ok(())
}
@@ -241,6 +245,15 @@ impl Flags {
Ok(None)
}
}

pub fn mocker_config(&self) -> MockEngineArgs {
let Some(path) = &self.extra_engine_args else {
tracing::warn!("Did not specify extra engine args. Using default mocker args.");
return MockEngineArgs::default();
};
MockEngineArgs::from_json_file(path)
.unwrap_or_else(|e| panic!("Failed to build mocker engine args from {path:?}: {e}"))
}
}

#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
36 changes: 31 additions & 5 deletions launch/dynamo-run/src/lib.rs
@@ -9,6 +9,7 @@ use dynamo_llm::entrypoint::input::Input;
use dynamo_llm::entrypoint::EngineConfig;
use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
use dynamo_runtime::CancellationToken;
use dynamo_runtime::{DistributedRuntime, Runtime};

mod flags;
use either::Either;
@@ -21,7 +22,7 @@ mod subprocess;
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);

pub async fn run(
runtime: dynamo_runtime::Runtime,
runtime: Runtime,
in_opt: Input,
out_opt: Option<Output>,
flags: Flags,
@@ -52,8 +53,7 @@
if let Input::Endpoint(path) = &in_opt {
builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));

let distributed_runtime =
dynamo_runtime::DistributedRuntime::from_settings(runtime.clone()).await?;
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
rt = Either::Right(distributed_runtime);
};

@@ -70,8 +70,14 @@
flags.validate(&local_model, &out_opt)?;

// Make an engine from the local_model, flags and output.
let (engine_config, extra) =
engine_for(runtime.primary_token(), out_opt, flags.clone(), local_model).await?;
let (engine_config, extra) = engine_for(
runtime.primary_token(),
out_opt,
flags.clone(),
local_model,
rt.clone(),
)
.await?;

//
// Run in from an input
@@ -96,6 +102,7 @@ async fn engine_for(
out_opt: Output,
flags: Flags,
local_model: LocalModel,
rt: Either<Runtime, DistributedRuntime>,
) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
match out_opt {
Output::Dynamic => Ok((EngineConfig::Dynamic(Box::new(local_model)), None)),
@@ -161,6 +168,25 @@
)
.await
}
Output::Mocker => {
let Either::Right(drt) = rt else {
panic!("Mocker requires a distributed runtime to run.");
};

let args = flags.mocker_config();
let endpoint = local_model.endpoint_id().clone();

let engine =
dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?;

Ok((
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
},
None,
))
}
}
}

5 changes: 5 additions & 0 deletions launch/dynamo-run/src/opt.rs
@@ -31,6 +31,8 @@ pub enum Output {
// Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm,

Mocker,
}

impl TryFrom<&str> for Output {
@@ -47,6 +49,7 @@ impl TryFrom<&str> for Output {
"sglang" => Ok(Output::SgLang),
"trtllm" => Ok(Output::Trtllm),
"vllm" => Ok(Output::Vllm),
"mocker" => Ok(Output::Mocker),

"echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore),
@@ -79,6 +82,7 @@ impl fmt::Display for Output {
Output::SgLang => "sglang",
Output::Trtllm => "trtllm",
Output::Vllm => "vllm",
Output::Mocker => "mocker",

Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core",
@@ -106,6 +110,7 @@ impl Output {
out.push(Output::SgLang.to_string());
out.push(Output::Trtllm.to_string());
out.push(Output::Vllm.to_string());
out.push(Output::Mocker.to_string());

out
}