@@ -23,7 +23,7 @@ async def register_llm_with_runtime_config(
2323 Returns:
2424 bool: True if registration succeeded, False if it failed
2525 """
26- runtime_config = await _get_runtime_config (engine , dynamo_args )
26+ runtime_config = await _get_runtime_config (engine , server_args , dynamo_args )
2727 input_type = ModelInput .Tokens
2828 output_type = ModelType .Chat | ModelType .Completions
2929 if not server_args .skip_tokenizer_init :
@@ -51,13 +51,25 @@ async def register_llm_with_runtime_config(
5151
5252
5353async def _get_runtime_config (
54- engine : sgl .Engine , dynamo_args : DynamoArgs
54+ engine : sgl .Engine , server_args : ServerArgs , dynamo_args : DynamoArgs
5555) -> Optional [ModelRuntimeConfig ]:
5656 """Get runtime config from SGLang engine"""
5757 runtime_config = ModelRuntimeConfig ()
5858 # set reasoning parser and tool call parser
5959 runtime_config .reasoning_parser = dynamo_args .reasoning_parser
6060 runtime_config .tool_call_parser = dynamo_args .tool_call_parser
61+
62+ # In SGLang, these are server_args, not scheduler_info (unlike vLLM)
63+ # Note: If --max-running-requests is not specified, SGLang falls back to an internal,
64+ # undocumented default. The value here will be None if not explicitly set by the user.
65+ max_running_requests = getattr (server_args , "max_running_requests" , None )
66+ if max_running_requests :
67+ runtime_config .max_num_seqs = max_running_requests
68+
69+ max_prefill_tokens = getattr (server_args , "max_prefill_tokens" , None )
70+ if max_prefill_tokens :
71+ runtime_config .max_num_batched_tokens = max_prefill_tokens
72+
6173 try :
6274 # Try to check if the engine has a scheduler attribute with the computed values
6375 if hasattr (engine , "scheduler_info" ) and engine .scheduler_info is not None :
@@ -77,7 +89,10 @@ async def _get_runtime_config(
7789 f"(max_total_tokens={ max_total_tokens } , page_size={ page_size } )"
7890 )
7991
80- # Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info
92+ # Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
93+ # SGLang separates configuration (server_args) from runtime stats (scheduler_info).
94+ # In contrast, vLLM exposes both config and runtime values through engine config.
95+ # These are config parameters, so they must be retrieved from server_args only.
8196
8297 return runtime_config
8398
0 commit comments