@@ -322,9 +322,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
322322 parser .add_argument ('--download-dir' ,
323323 type = nullable_str ,
324324 default = EngineArgs .download_dir ,
325- help = 'Directory to download and load the weights, '
326- 'default to the default cache dir of '
327- 'huggingface.' )
325+ help = 'Directory to download and load the weights.' )
328326 parser .add_argument (
329327 '--load-format' ,
330328 type = str ,
@@ -399,8 +397,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
399397 'Valid backend values are "xgrammar", "guidance", and "auto". '
400398 'With "auto", we will make opinionated choices based on request'
401399 'contents and what the backend libraries currently support, so '
402- 'the behavior is subject to change in each release. '
403- 'The default is xgrammar.' )
400+ 'the behavior is subject to change in each release.' )
404401 parser .add_argument (
405402 '--logits-processor-pattern' ,
406403 type = nullable_str ,
@@ -493,8 +490,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
493490 default = EngineArgs .prefix_caching_hash_algo ,
494491 help = "Set the hash algorithm for prefix caching. "
495492 "Options are 'builtin' (Python's built-in hash) or 'sha256' "
496- "(collision resistant but with certain overheads). Defaults "
497- "to 'builtin'." ,
493+ "(collision resistant but with certain overheads)." ,
498494 )
499495 parser .add_argument ('--disable-sliding-window' ,
500496 action = 'store_true' ,
@@ -568,9 +564,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
568564 type = int ,
569565 default = EngineArgs .max_num_partial_prefills ,
570566 help = "For chunked prefill, the max number of concurrent \
571- partial prefills."
572- "Defaults to 1" ,
573- )
567+ partial prefills." )
574568 parser .add_argument (
575569 "--max-long-partial-prefills" ,
576570 type = int ,
@@ -579,15 +573,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
579573 "than --long-prefill-token-threshold that will be prefilled "
580574 "concurrently. Setting this less than --max-num-partial-prefills "
581575 "will allow shorter prompts to jump the queue in front of longer "
582- "prompts in some cases, improving latency. Defaults to 1. " )
576+ "prompts in some cases, improving latency." )
583577 parser .add_argument (
584578 "--long-prefill-token-threshold" ,
585579 type = float ,
586580 default = EngineArgs .long_prefill_token_threshold ,
587581 help = "For chunked prefill, a request is considered long if the "
588- "prompt is longer than this number of tokens. Defaults to 4%% of "
589- "the model's context length." ,
590- )
582+ "prompt is longer than this number of tokens." )
591583 parser .add_argument ('--max-num-seqs' ,
592584 type = int ,
593585 default = EngineArgs .max_num_seqs ,
@@ -739,8 +731,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
739731 type = int ,
740732 default = EngineArgs .max_cpu_loras ,
741733 help = ('Maximum number of LoRAs to store in CPU memory. '
742- 'Must be >= than max_loras. '
743- 'Defaults to max_loras.' ))
734+ 'Must be >= than max_loras.' ))
744735 parser .add_argument (
745736 '--fully-sharded-loras' ,
746737 action = 'store_true' ,
@@ -894,7 +885,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
894885 help = 'Set the lower bound threshold for the posterior '
895886 'probability of a token to be accepted. This threshold is '
896887 'used by the TypicalAcceptanceSampler to make sampling decisions '
897- 'during speculative decoding. Defaults to 0.09 ' )
888+ 'during speculative decoding.' )
898889
899890 parser .add_argument (
900891 '--typical-acceptance-sampler-posterior-alpha' ,
0 commit comments