Add /get_tokenizer to api_server for easier lm-evaluation-harness integration #2643

Open · wants to merge 5 commits into main
4 changes: 2 additions & 2 deletions benchmarks/benchmark_serving.py
@@ -346,8 +346,8 @@ def main(args: argparse.Namespace):
     else:
         api_url = f"http://{args.host}:{args.port}{args.endpoint}"
 
-    tokenizer = get_tokenizer(tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer, _ = get_tokenizer(tokenizer_id,
+                                 trust_remote_code=args.trust_remote_code)
 
     if args.dataset is not None:
         warnings.warn(
2 changes: 1 addition & 1 deletion tests/async_engine/test_chat_template.py
@@ -108,7 +108,7 @@ def test_no_load_chat_template_literallike():
 async def test_get_gen_prompt(model, template, add_generation_prompt,
                               expected_output):
     # Initialize the tokenizer
-    tokenizer = get_tokenizer(tokenizer_name=model)
+    tokenizer, _ = get_tokenizer(tokenizer_name=model)
     mock_serving_chat = MockServingChat(tokenizer)
     OpenAIServingChat._load_chat_template(mock_serving_chat,
                                           chat_template=template)
3 changes: 2 additions & 1 deletion tests/conftest.py
@@ -164,7 +164,8 @@ def __init__(
         )
         if tokenizer_name is None:
             tokenizer_name = model_name
-        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
+        self.tokenizer, _ = get_tokenizer(tokenizer_name,
+                                          trust_remote_code=True)
 
     def generate(
         self,
4 changes: 2 additions & 2 deletions tests/entrypoints/test_openai_server.py
@@ -463,7 +463,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
 async def test_logits_bias(server, client: openai.AsyncOpenAI):
     prompt = "Hello, my name is"
     max_tokens = 5
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    tokenizer, _ = get_tokenizer(tokenizer_name=MODEL_NAME)
 
     # Test exclusive selection
     token_id = 1000
@@ -827,7 +827,7 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI):
 )
 async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
                                        model_name: str):
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    tokenizer, _ = get_tokenizer(tokenizer_name=MODEL_NAME)
     # test using text and token IDs
     for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
         completion = await client.completions.create(model=model_name,
2 changes: 1 addition & 1 deletion tests/tokenization/test_tokenizer.py
@@ -12,7 +12,7 @@
 @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
 def test_tokenizer_revision(tokenizer_name: str):
     # Assume that "main" branch always exists
-    tokenizer = get_tokenizer(tokenizer_name, revision="main")
+    tokenizer, _ = get_tokenizer(tokenizer_name, revision="main")
     assert isinstance(tokenizer, PreTrainedTokenizerBase)
 
     # Assume that "never" branch always does not exist
12 changes: 11 additions & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -72,6 +72,15 @@ async def health() -> Response:
     return Response(status_code=200)
 
 
+@app.get("/v1/get_tokenizer")
+async def get_tokenizer():
+    if openai_serving_chat.shared_tokenizer:
+        return JSONResponse(content=openai_serving_chat.tokenizer_jsons)
+    else:
+        return JSONResponse(content={"error": "No shared tokenizer"},
+                            status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.get("/v1/models")
 async def show_available_models():
     models = await openai_serving_chat.show_available_models()
@@ -161,9 +170,10 @@ async def authentication(request: Request, call_next):
     openai_serving_chat = OpenAIServingChat(engine, served_model_names,
                                             args.response_role,
                                             args.lora_modules,
+                                            args.shared_tokenizer,
                                             args.chat_template)
     openai_serving_completion = OpenAIServingCompletion(
-        engine, served_model_names, args.lora_modules)
+        engine, served_model_names, args.lora_modules, args.shared_tokenizer)
 
     app.root_path = args.root_path
     uvicorn.run(app,
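Note: the sketch below is a rough illustration of how a client such as lm-evaluation-harness could consume the new endpoint, assuming the server was launched with `python -m vllm.entrypoints.openai.api_server --model <model> --shared-tokenizer`. The base URL, output directory, and use of `requests`/`AutoTokenizer` are illustrative assumptions, not part of this PR.

```python
# Hypothetical client-side usage of /v1/get_tokenizer; base URL and paths are
# placeholders. The endpoint returns a dict of tokenizer JSON files keyed by
# filename stem (e.g. "tokenizer", "tokenizer_config", "special_tokens_map").
import json
from pathlib import Path

import requests
from transformers import AutoTokenizer

BASE_URL = "http://localhost:8000"  # assumed server address

resp = requests.get(f"{BASE_URL}/v1/get_tokenizer")
resp.raise_for_status()
tokenizer_jsons = resp.json()

# Write the JSON files back to disk so a local tokenizer can be built from them.
out_dir = Path("downloaded_tokenizer")
out_dir.mkdir(exist_ok=True)
for name, content in tokenizer_jsons.items():
    (out_dir / f"{name}.json").write_text(json.dumps(content))

# For fast tokenizers, tokenizer.json + tokenizer_config.json is typically
# enough to reload locally without pulling the whole model repo.
tokenizer = AutoTokenizer.from_pretrained(out_dir)
print(tokenizer("Hello, world!").input_ids)
```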
4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/cli_args.py
@@ -63,6 +63,10 @@ def make_arg_parser():
                         "field of a response will be the first name in this "
                         "list. If not specified, the model name will be the "
                         "same as the `--model` argument.")
+    parser.add_argument(
+        "--shared-tokenizer",
+        action='store_true',
+        help="Allow sharing the tokenizer via the /v1/get_tokenizer API path")
     parser.add_argument(
         "--lora-modules",
         type=str,
4 changes: 3 additions & 1 deletion vllm/entrypoints/openai/serving_chat.py
@@ -37,10 +37,12 @@ def __init__(self,
                  served_model_names: List[str],
                  response_role: str,
                  lora_modules: Optional[List[LoRAModulePath]] = None,
+                 shared_tokenizer: Optional[bool] = True,
                  chat_template: Optional[str] = None):
         super().__init__(engine=engine,
                          served_model_names=served_model_names,
-                         lora_modules=lora_modules)
+                         lora_modules=lora_modules,
+                         shared_tokenizer=shared_tokenizer)
         self.response_role = response_role
         self._load_chat_template(chat_template)

6 changes: 4 additions & 2 deletions vllm/entrypoints/openai/serving_completion.py
@@ -55,10 +55,12 @@ class OpenAIServingCompletion(OpenAIServing):
     def __init__(self,
                  engine: AsyncLLMEngine,
                  served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]] = None):
+                 lora_modules: Optional[List[LoRAModulePath]] = None,
+                 shared_tokenizer: Optional[bool] = True):
         super().__init__(engine=engine,
                          served_model_names=served_model_names,
-                         lora_modules=lora_modules)
+                         lora_modules=lora_modules,
+                         shared_tokenizer=shared_tokenizer)
 
     async def create_completion(self, request: CompletionRequest,
                                 raw_request: Request):
43 changes: 40 additions & 3 deletions vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,10 @@
 import asyncio
+import json
 import os
+import shutil
 from dataclasses import dataclass
 from http import HTTPStatus
+from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
@@ -29,8 +32,11 @@ class LoRAModulePath:
 
 class OpenAIServing:
 
-    def __init__(self, engine: AsyncLLMEngine, served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+    def __init__(self,
+                 engine: AsyncLLMEngine,
+                 served_model_names: List[str],
+                 lora_modules: Optional[List[LoRAModulePath]],
+                 shared_tokenizer: Optional[bool] = True):
         self.engine = engine
         self.served_model_names = served_model_names
         if lora_modules is None:
@@ -47,6 +53,7 @@ def __init__(self, engine: AsyncLLMEngine, served_model_names: List[str],
         self.max_model_len = 0
         # Lazy initialized
         self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+        self.shared_tokenizer = shared_tokenizer
 
         try:
             event_loop = asyncio.get_running_loop()
@@ -66,13 +73,43 @@ async def _post_init(self):
         self.max_model_len = engine_model_config.max_model_len
 
         # A separate tokenizer to map token IDs to strings.
-        self.tokenizer = get_tokenizer(
+        self.tokenizer, hf_tokenizer = get_tokenizer(
             engine_model_config.tokenizer,
             tokenizer_mode=engine_model_config.tokenizer_mode,
             tokenizer_revision=engine_model_config.tokenizer_revision,
             trust_remote_code=engine_model_config.trust_remote_code,
             truncation_side="left")
 
+        # Keep the tokenizer JSONs so that /v1/get_tokenizer can serve them.
+        if self.shared_tokenizer:
+            self.tokenizer_jsons = self._get_tokenizer_jsons(hf_tokenizer)
+
+    def _get_tokenizer_jsons(self, hf_tokenizer) -> Dict[str, Dict]:
+        """Collect the JSON files of the tokenizer currently in use."""
+        CURRENT_DIR = os.path.dirname(__file__)
+        EPHEMERAL_FOLDER_NAME = "tmp_tokenizer"
+        TOKENIZER_EPHEMERAL_PATH = Path(
+            os.path.join(CURRENT_DIR, EPHEMERAL_FOLDER_NAME))
+
+        # Save the tokenizer files in an ephemeral folder.
+        hf_tokenizer.save_pretrained(TOKENIZER_EPHEMERAL_PATH.absolute())
+        tmp_list = list(TOKENIZER_EPHEMERAL_PATH.glob("*.json"))
+
+        # Populate the tokenizer JSONs.
+        tokenizer_jsons = {}
+        for json_path in tmp_list:
+            with open(json_path) as json_file:
+                filename = json_path.stem
+                tokenizer_jsons[filename] = json.load(json_file)
+        try:
+            shutil.rmtree(TOKENIZER_EPHEMERAL_PATH)
+        except OSError as e:
+            raise RuntimeError(
+                f"Error removing '{TOKENIZER_EPHEMERAL_PATH.name}' dir: {e}"
+            ) from e
+
+        return tokenizer_jsons
+
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
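For reference, the save-then-collect round trip that `_get_tokenizer_jsons` performs can be reproduced outside the server. A minimal standalone sketch follows; the model name and the use of a temp directory are arbitrary choices for illustration.

```python
# Standalone sketch of the save_pretrained -> collect *.json round trip used by
# _get_tokenizer_jsons; "gpt2" is only an example model.
import json
import shutil
import tempfile
from pathlib import Path

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")

tmp_dir = Path(tempfile.mkdtemp(prefix="tmp_tokenizer_"))
hf_tokenizer.save_pretrained(tmp_dir)

tokenizer_jsons = {}
for json_path in tmp_dir.glob("*.json"):
    with open(json_path) as json_file:
        tokenizer_jsons[json_path.stem] = json.load(json_file)

shutil.rmtree(tmp_dir)

# Fast tokenizers usually yield keys such as "tokenizer", "tokenizer_config"
# and "special_tokens_map".
print(sorted(tokenizer_jsons))
```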
8 changes: 5 additions & 3 deletions vllm/transformers_utils/tokenizer.py
@@ -1,4 +1,5 @@
 import os
+from copy import deepcopy
 from typing import Optional, Union
 
 import huggingface_hub
@@ -126,16 +127,17 @@ def get_tokenizer(
         logger.warning(
             "Using a slow tokenizer. This might cause a significant "
             "slowdown. Consider using a fast tokenizer instead.")
-    return get_cached_tokenizer(tokenizer)
+    hf_tokenizer = deepcopy(tokenizer)
+    return get_cached_tokenizer(tokenizer), hf_tokenizer
 
 
 def get_lora_tokenizer(lora_request: LoRARequest, *args,
                        **kwargs) -> Optional[PreTrainedTokenizer]:
     if lora_request is None:
         return None
     try:
-        tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
-                                  **kwargs)
+        tokenizer, _ = get_tokenizer(lora_request.lora_local_path, *args,
+                                     **kwargs)
     except OSError as e:
         # No tokenizer was found in the LoRA folder,
         # use base model tokenizer
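With this change, callers of `get_tokenizer` in `vllm/transformers_utils/tokenizer.py` unpack two values: the cached tokenizer vLLM uses for decoding and a plain Hugging Face copy taken before caching, which is what `save_pretrained` later serializes for `/v1/get_tokenizer`. A minimal usage sketch follows; the model name is only an example.

```python
# Sketch of the new two-value return introduced by this PR;
# "facebook/opt-125m" is just an example model.
from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer, hf_tokenizer = get_tokenizer("facebook/opt-125m")

# `tokenizer` is the cached wrapper used internally for detokenization;
# `hf_tokenizer` is a deepcopy of the underlying HF tokenizer that can be
# saved with save_pretrained() to expose its JSON files.
print(type(tokenizer).__name__, type(hf_tokenizer).__name__)
```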
3 changes: 2 additions & 1 deletion vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -20,7 +20,8 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
         self.tokenizer_config = tokenizer_config
         self.enable_lora = enable_lora
         self.max_input_length = max_input_length
-        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
+        self.tokenizer, _ = get_tokenizer(self.tokenizer_id,
+                                          **tokenizer_config)
         self.lora_tokenizers = LRUCache[PreTrainedTokenizer](
             capacity=max_num_seqs) if enable_lora else None
