25 changes: 25 additions & 0 deletions components/backends/mocker/README.md
@@ -0,0 +1,25 @@
# Mocker engine

The mocker engine is a mock vLLM implementation designed for testing and development. It simulates realistic token-generation timing without running actual model inference, making it useful for:

- Testing distributed system components without GPU resources
- Benchmarking infrastructure and networking overhead
- Developing and debugging Dynamo components
- Load testing and performance analysis

**Basic usage:**

The `--model-path` is required but can point to any valid model path; the mocker never loads the model weights, though the pre-processor still needs the tokenizer. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are shared with the real vLLM engine.

The following arguments are mocker-specific:
- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulated engines run faster.
- `dp_size`: Number of data parallel workers to simulate (default: 1).
- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists in the real vLLM engine but cannot be passed as an engine arg.
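
For reference, a fuller `mocker_args.json` covering these knobs might look like the sketch below. The values are purely illustrative, and the snake_case keys are assumed to mirror the argument names above:

```json
{
  "speedup_ratio": 10.0,
  "dp_size": 2,
  "watermark": 0.01,
  "block_size": 16,
  "num_gpu_blocks": 8192,
  "max_num_seqs": 256,
  "max_num_batched_tokens": 8192,
  "enable_prefix_caching": true
}
```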

>[!NOTE]
>Currently, `enable_chunked_prefill` is always assumed to be false, mirroring vLLM v0 behavior. This is also the current behavior in `examples/llm`, and it will be updated in the near future as we move to support vLLM v1 (and deprecate support for vLLM v0).

```bash
echo '{"speedup_ratio": 10.0}' > mocker_args.json
python -m dynamo.mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
python -m dynamo.frontend --http-port 8080
```
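
With both processes running, a quick smoke test against the frontend might look like this (a sketch assuming the frontend exposes the standard OpenAI-compatible `/v1/chat/completions` route on the port above):

```bash
# Illustrative request; adjust the model name to match what the frontend registers.
curl -s localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 32
      }'
```
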
Empty file.
7 changes: 7 additions & 0 deletions components/backends/mocker/src/dynamo/mocker/__main__.py
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from dynamo.mocker.main import main

if __name__ == "__main__":
main()
76 changes: 76 additions & 0 deletions components/backends/mocker/src/dynamo/mocker/main.py
@@ -0,0 +1,76 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Usage: `python -m dynamo.mocker --model-path /data/models/Qwen3-0.6B-Q8_0.gguf --extra-engine-args args.json`

import argparse
from pathlib import Path

import uvloop

from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging

DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"

configure_dynamo_logging()


@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
args = cmd_line_args()

# Create engine configuration
entrypoint_args = EntrypointArgs(
engine_type=EngineType.Mocker,
model_path=args.model_path,
model_name=args.model_name,
endpoint_id=args.endpoint,
extra_engine_args=args.extra_engine_args,
)

# Create and run the engine
# NOTE: only supports dyn endpoint for now
engine_config = await make_engine(runtime, entrypoint_args)
await run_input(runtime, args.endpoint, engine_config)


def cmd_line_args():
parser = argparse.ArgumentParser(
description="Mocker engine for testing Dynamo LLM infrastructure.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--model-path",
type=str,
help="Path to model directory or HuggingFace model ID for tokenizer",
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string (default: {DEFAULT_ENDPOINT})",
)
parser.add_argument(
"--model-name",
type=str,
default=None,
help="Model name for API responses (default: mocker-engine)",
)
parser.add_argument(
"--extra-engine-args",
type=Path,
help="Path to JSON file with mocker configuration "
"(num_gpu_blocks, speedup_ratio, etc.)",
)

return parser.parse_args()


def main():
uvloop.run(worker())


if __name__ == "__main__":
main()
26 changes: 26 additions & 0 deletions docs/guides/dynamo_run.md
@@ -538,6 +538,32 @@ The output looks like this:
{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
```

#### Mocker engine

The mocker engine is a mock vLLM implementation designed for testing and development. It simulates realistic token-generation timing without running actual model inference, making it useful for:

- Testing distributed system components without GPU resources
- Benchmarking infrastructure and networking overhead
- Developing and debugging Dynamo components
- Load testing and performance analysis

**Basic usage:**

The `--model-path` is required but can point to any valid model path; the mocker never loads the model weights, though the pre-processor still needs the tokenizer. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are shared with the real vLLM engine.

The following arguments are mocker-specific:
- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulated engines run faster.
- `dp_size`: Number of data parallel workers to simulate (default: 1).
- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists in the real vLLM engine but cannot be passed as an engine arg.
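
As with the mocker README, a fuller `mocker_args.json` might look like this sketch (illustrative values; snake_case keys assumed to mirror the argument names above):

```json
{
  "speedup_ratio": 10.0,
  "dp_size": 2,
  "watermark": 0.01,
  "num_gpu_blocks": 8192,
  "enable_prefix_caching": true
}
```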

>[!NOTE]
>Currently, `enable_chunked_prefill` is always assumed to be false, mirroring vLLM v0 behavior. This is also the current behavior in `examples/llm`, and it will be updated in the near future as we move to support vLLM v1 (and deprecate support for vLLM v0).

```bash
echo '{"speedup_ratio": 10.0}' > mocker_args.json
dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
dynamo-run in=http out=dyn --router-mode kv
```
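
Once both processes are up, you can sanity-check that the mocker registered with the frontend. Assuming the HTTP frontend serves the usual OpenAI-compatible routes on its default port (8080 unless overridden), something like:

```bash
# Hypothetical smoke test: list the models the frontend knows about.
curl -s localhost:8080/v1/models
```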

### Extra engine arguments
The vllm and sglang backends support passing any argument the engine accepts.
Put the arguments in a JSON file:
13 changes: 13 additions & 0 deletions launch/dynamo-run/src/flags.rs
@@ -20,6 +20,7 @@ use clap::ValueEnum;
use dynamo_llm::entrypoint::RouterConfig;
use dynamo_llm::kv_router::KvRouterConfig;
use dynamo_llm::local_model::LocalModel;
use dynamo_llm::mocker::protocols::MockEngineArgs;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;

use crate::Output;
@@ -212,6 +213,9 @@ impl Flags {
anyhow::bail!("--model-path should refer to a GGUF file. llama_cpp does not support safetensors.");
}
}
Output::Mocker => {
// nothing to check here
}
}
Ok(())
}
@@ -241,6 +245,15 @@ impl Flags {
Ok(None)
}
}

pub fn mocker_config(&self) -> MockEngineArgs {
let Some(path) = &self.extra_engine_args else {
tracing::warn!("Did not specify extra engine args. Using default mocker args.");
return MockEngineArgs::default();
};
MockEngineArgs::from_json_file(path)
.unwrap_or_else(|e| panic!("Failed to build mocker engine args from {path:?}: {e}"))
}
}

#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
36 changes: 31 additions & 5 deletions launch/dynamo-run/src/lib.rs
@@ -9,6 +9,7 @@ use dynamo_llm::entrypoint::input::Input;
use dynamo_llm::entrypoint::EngineConfig;
use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
use dynamo_runtime::CancellationToken;
use dynamo_runtime::{DistributedRuntime, Runtime};

mod flags;
use either::Either;
@@ -21,7 +22,7 @@ mod subprocess;
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);

pub async fn run(
runtime: dynamo_runtime::Runtime,
runtime: Runtime,
in_opt: Input,
out_opt: Option<Output>,
flags: Flags,
@@ -52,8 +53,7 @@
if let Input::Endpoint(path) = &in_opt {
builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));

let distributed_runtime =
dynamo_runtime::DistributedRuntime::from_settings(runtime.clone()).await?;
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
rt = Either::Right(distributed_runtime);
};

@@ -70,8 +70,14 @@
flags.validate(&local_model, &out_opt)?;

// Make an engine from the local_model, flags and output.
let (engine_config, extra) =
engine_for(runtime.primary_token(), out_opt, flags.clone(), local_model).await?;
let (engine_config, extra) = engine_for(
runtime.primary_token(),
out_opt,
flags.clone(),
local_model,
rt.clone(),
)
.await?;

//
// Run in from an input
@@ -96,6 +102,7 @@ async fn engine_for(
out_opt: Output,
flags: Flags,
local_model: LocalModel,
rt: Either<Runtime, DistributedRuntime>,
) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
match out_opt {
Output::Dynamic => Ok((EngineConfig::Dynamic(Box::new(local_model)), None)),
@@ -161,6 +168,25 @@
)
.await
}
Output::Mocker => {
let Either::Right(drt) = rt else {
panic!("Mocker requires a distributed runtime to run.");
};

let args = flags.mocker_config();
let endpoint = local_model.endpoint_id().clone();

let engine =
dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?;

Ok((
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
},
None,
))
}
}
}

5 changes: 5 additions & 0 deletions launch/dynamo-run/src/opt.rs
@@ -31,6 +31,8 @@ pub enum Output {
// Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm,

Mocker,
}

impl TryFrom<&str> for Output {
@@ -47,6 +49,7 @@ impl TryFrom<&str> for Output {
"sglang" => Ok(Output::SgLang),
"trtllm" => Ok(Output::Trtllm),
"vllm" => Ok(Output::Vllm),
"mocker" => Ok(Output::Mocker),

"echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore),
@@ -79,6 +82,7 @@ impl fmt::Display for Output {
Output::SgLang => "sglang",
Output::Trtllm => "trtllm",
Output::Vllm => "vllm",
Output::Mocker => "mocker",

Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core",
@@ -106,6 +110,7 @@ impl Output {
out.push(Output::SgLang.to_string());
out.push(Output::Trtllm.to_string());
out.push(Output::Vllm.to_string());
out.push(Output::Mocker.to_string());

out
}