diff --git a/vllm/config.py b/vllm/config.py
index 2e8d58411181c..2a165edfd70ee 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -22,6 +22,7 @@ class ModelConfig:
             available, and "slow" will always use the slow tokenizer.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
+        keep_special_tokens: Keep special tokens in the generated text.
         download_dir: Directory to download and load the weights, default to the
             default cache directory of huggingface.
         use_np_weights: Save a numpy copy of model weights for faster loading.
@@ -39,6 +40,7 @@ def __init__(
         tokenizer: str,
         tokenizer_mode: str,
         trust_remote_code: bool,
+        keep_special_tokens: bool,
         download_dir: Optional[str],
         use_np_weights: bool,
         use_dummy_weights: bool,
@@ -49,6 +51,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.tokenizer_mode = tokenizer_mode
         self.trust_remote_code = trust_remote_code
+        self.keep_special_tokens = keep_special_tokens
         self.download_dir = download_dir
         self.use_np_weights = use_np_weights
         self.use_dummy_weights = use_dummy_weights
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 99fe593b4cb01..3c97b0fe90a81 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -14,6 +14,7 @@ class EngineArgs:
     tokenizer: Optional[str] = None
     tokenizer_mode: str = 'auto'
     trust_remote_code: bool = False
+    keep_special_tokens: bool = False
     download_dir: Optional[str] = None
     use_np_weights: bool = False
     use_dummy_weights: bool = False
@@ -130,6 +131,9 @@ def add_cli_args(
         parser.add_argument('--disable-log-stats',
                             action='store_true',
                             help='disable logging statistics')
+        parser.add_argument('--keep-special-tokens',
+                            action='store_true',
+                            help='keep special tokens in the output')
         return parser
 
     @classmethod
@@ -146,9 +150,9 @@ def create_engine_configs(
         # Initialize the configs.
         model_config = ModelConfig(self.model, self.tokenizer,
                                    self.tokenizer_mode, self.trust_remote_code,
-                                   self.download_dir, self.use_np_weights,
-                                   self.use_dummy_weights, self.dtype,
-                                   self.seed)
+                                   self.keep_special_tokens, self.download_dir,
+                                   self.use_np_weights, self.use_dummy_weights,
+                                   self.dtype, self.seed)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 4ea443d8451d1..cad1deea1e232 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -628,7 +628,7 @@ def _decode_sequence(self, seq: Sequence) -> None:
             self.tokenizer,
             seq.output_tokens,
             seq.get_last_token_id(),
-            skip_special_tokens=True,
+            skip_special_tokens=not self.model_config.keep_special_tokens,
         )
         if new_token is not None:
            seq.output_tokens.append(new_token)
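
Below is a minimal usage sketch (not part of the diff) showing how the new `keep_special_tokens` option flows from `EngineArgs` into `ModelConfig`. The model name is arbitrary, and the assumption that `ModelConfig` is the first element of the tuple returned by `create_engine_configs()` is inferred from the construction order above.

```python
# Hypothetical usage sketch for the new flag, end to end.
# Assumes the EngineArgs dataclass as shown in the diff; the model name
# is arbitrary, and indexing configs[0] assumes ModelConfig is returned
# first (it is built first in create_engine_configs above).
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="facebook/opt-125m", keep_special_tokens=True)
configs = engine_args.create_engine_configs()
model_config = configs[0]

# With keep_special_tokens=True, _decode_sequence passes
# skip_special_tokens=False to the detokenizer, so special tokens such
# as </s> remain in the generated text.
assert model_config.keep_special_tokens

# Equivalent CLI usage for entrypoints that call add_cli_args:
#   --keep-special-tokens
```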