feat: enable / disable chunked prefill for mockers (#2015)

PeaBrane · coderabbitai[bot] · web-flow · commit e330d969b959 · 2025-07-18T20:40:54.000Z
Signed-off-by: Yan Ru Pei <yanrpei@gmail.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
diff --git a/components/backends/mocker/README.md b/components/backends/mocker/README.md
@@ -9,15 +9,13 @@ The mocker engine is a mock vLLM implementation designed for testing and develop
 
 **Basic usage:**
 
-The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights (but the pre-processor needs the tokenizer). The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are common arguments shared with the real VLLM engine.
+The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights (but the pre-processor needs the tokenizer). The arguments `block_size`, `num_gpu_blocks`, `max_num_seqs`, `max_num_batched_tokens`, `enable_prefix_caching`, and `enable_chunked_prefill` are common arguments shared with the real VLLM engine.
 
 And below are arguments that are mocker-specific:
 - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
 - `dp_size`: Number of data parallel workers to simulate (default: 1)
 - `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg.
 
->[!NOTE]
->Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0).
 ```bash
 echo '{"speedup_ratio": 10.0}' > mocker_args.json
 python -m dynamo.mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
@@ -549,15 +549,13 @@ The mocker engine is a mock vLLM implementation designed for testing and develop
 
 **Basic usage:**
 
-The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are common arguments shared with the real VLLM engine.
+The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights (but the pre-processor needs the tokenizer). The arguments `block_size`, `num_gpu_blocks`, `max_num_seqs`, `max_num_batched_tokens`, `enable_prefix_caching`, and `enable_chunked_prefill` are common arguments shared with the real VLLM engine.
 
 And below are arguments that are mocker-specific:
 - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
 - `dp_size`: Number of data parallel workers to simulate (default: 1)
 - `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg.
 
->[!NOTE]
->Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0).
 ```bash
 echo '{"speedup_ratio": 10.0}' > mocker_args.json
 dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
@@ -293,14 +293,9 @@ impl KvManager {
         let overlap_blocks = seq_blocks.len() - new_blocks;
         let new_tokens = sequence.num_input_tokens() - overlap_blocks * self.block_size;
 
-        // Calculate prefill compute
-        let prefill_compute =
-            1.25e-6 * (new_tokens as f64).powi(2) + 7.41e-2 * (new_tokens as f64) + 2.62e1;
-
         PrefillCost {
             new_blocks,
             new_tokens,
-            prefill_compute,
         }
     }
 }
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
@@ -58,7 +58,13 @@ pub struct DirectRequest {
 pub struct PrefillCost {
     pub new_blocks: usize,
     pub new_tokens: usize,
-    pub prefill_compute: f64,
+}
+
+impl PrefillCost {
+    pub fn predict_prefill_compute(&self, new_tokens: Option<usize>) -> f64 {
+        let tokens = new_tokens.unwrap_or(self.new_tokens);
+        1.25e-6 * (tokens as f64).powi(2) + 7.41e-2 * (tokens as f64) + 2.62e1
+    }
 }
 
 /// Signal for output token generation with completion status
@@ -89,6 +95,9 @@ pub struct MockEngineArgs {
     #[builder(default = true)]
     pub enable_prefix_caching: bool,
 
+    #[builder(default = true)]
+    pub enable_chunked_prefill: bool,
+
     #[builder(default = "0.01")]
     pub watermark: f64,
 
@@ -127,6 +136,7 @@ impl MockEngineArgs {
             "max_num_seqs",
             "max_num_batched_tokens",
             "enable_prefix_caching",
+            "enable_chunked_prefill",
             "watermark",
             "speedup_ratio",
             "dp_size",
@@ -181,6 +191,12 @@ impl MockEngineArgs {
             }
         }
 
+        if let Some(value) = extra_args.get("enable_chunked_prefill") {
+            if let Some(enabled) = value.as_bool() {
+                builder = builder.enable_chunked_prefill(enabled);
+            }
+        }
+
         if let Some(value) = extra_args.get("watermark") {
             if let Some(num) = value.as_f64() {
                 builder = builder.watermark(num);
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs

Original file line number	Diff line number	Diff line change
`@@ -293,14 +293,9 @@ impl KvManager {`
`293`	`293`	`let overlap_blocks = seq_blocks.len() - new_blocks;`
`294`	`294`	`let new_tokens = sequence.num_input_tokens() - overlap_blocks * self.block_size;`
`295`	`295`
`296`		`- // Calculate prefill compute`
`297`		`- let prefill_compute =`
`298`		`- 1.25e-6 * (new_tokens as f64).powi(2) + 7.41e-2 * (new_tokens as f64) + 2.62e1;`
`299`		`-`
`300`	`296`	`PrefillCost {`
`301`	`297`	`new_blocks,`
`302`	`298`	`new_tokens,`
`303`		`- prefill_compute,`
`304`	`299`	`}`
`305`	`300`	`}`
`306`	`301`	`}`