feat: add max-model-len configuration and validation for context window (#82) (#85)
* feat: add max-model-len configuration and validation for context window (#82)
* refactor: remove redundant check for max model length in validateContextWindow
* fix: correct indentation for test entry in simulator configuration tests
* test: add additional test case for simulator configuration
* fix: static lint check errors
* fix: update error message capitalization in validateContextWindow
* fix: update error message capitalization in validateContextWindow
* fix: refactored context window validation func with detailed error messages and update README
---------
Signed-off-by: Mohit Pal Singh <mohit.pal.singh@outlook.com>
README.md: 1 addition & 0 deletions

@@ -92,6 +92,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `lora-modules`: a list of LoRA adapters (a list of space-separated JSON strings): '{"name": "name", "path": "lora_path", "base_model_name": "id"}', optional, empty by default
 - `max-loras`: maximum number of LoRAs in a single batch, optional, default is one
 - `max-cpu-loras`: maximum number of LoRAs to store in CPU memory, optional, must be >= than max-loras, default is max-loras
+- `max-model-len`: model's context window, maximum number of tokens in a single request including input and output, optional, default is 1024
 - `max-num-seqs`: maximum number of sequences per iteration (maximum number of inference requests that could be processed at the same time), default is 5
 - `mode`: the simulator mode, optional, by default `random`
 - `echo`: returns the same text that was sent in the request
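For orientation, a minimal sketch of where the new option could live in the simulator's configuration. Only the field name `MaxModelLen` (from the flag registration below) and the default of 1024 (from the README entry above) come from the PR; the struct name, tags, and constructor are assumptions for illustration:

```go
package config

// Configuration is a hypothetical sketch of the simulator config struct;
// only MaxModelLen and its documented default are taken from the PR.
type Configuration struct {
	// MaxModelLen is the model's context window: the maximum number of
	// tokens in a single request, input and output combined.
	MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
}

// newDefaultConfiguration returns a config with the documented default.
func newDefaultConfiguration() *Configuration {
	return &Configuration{MaxModelLen: 1024}
}
```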
pkg/llm-d-inference-sim/simulator.go: 12 additions & 1 deletion

@@ -150,6 +150,7 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
 	f.IntVar(&config.MaxNumSeqs, "max-num-seqs", config.MaxNumSeqs, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")
 	f.IntVar(&config.MaxLoras, "max-loras", config.MaxLoras, "Maximum number of LoRAs in a single batch")
 	f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
+	f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")

 	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
@@ -372,6 +373,16 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
+	s.sendCompletionError(ctx, fmt.Sprintf("This model's maximum context length is %d tokens. However, you requested %d tokens (%d in the messages, %d in the completion). Please reduce the length of the messages or completion",
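To make the behavior concrete, here is a small, self-contained sketch of the context-window check and the error it produces. The function name `validateContextWindow` and the message text come from the commit messages and the diff above; the signature and everything else are assumptions, not the PR's exact code:

```go
package main

import "fmt"

// validateContextWindow reports whether prompt plus completion tokens fit
// within the model's context window and, if not, builds the detailed error
// message. The signature is an assumption; only the function name and the
// message text appear in the PR.
func validateContextWindow(promptTokens, completionTokens, maxModelLen int) (bool, string) {
	total := promptTokens + completionTokens
	if total <= maxModelLen {
		return true, ""
	}
	msg := fmt.Sprintf("This model's maximum context length is %d tokens. However, you requested %d tokens (%d in the messages, %d in the completion). Please reduce the length of the messages or completion",
		maxModelLen, total, promptTokens, completionTokens)
	return false, msg
}

func main() {
	// With the default max-model-len of 1024, a 900-token prompt plus a
	// 200-token completion request is rejected.
	ok, msg := validateContextWindow(900, 200, 1024)
	fmt.Println(ok) // false
	fmt.Println(msg)
}
```

In the handler itself, a failed check would presumably be routed through s.sendCompletionError, as the visible line of the second hunk above suggests.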