From 95149a226f9629b6490e420dff65f8331925a0cc Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 20 Jun 2024 17:30:00 +0000 Subject: [PATCH 1/2] Override FlexibleArgumentParser to support both underscore and dash in names --- benchmarks/benchmark_latency.py | 4 +-- benchmarks/benchmark_prefix_caching.py | 4 +-- benchmarks/benchmark_serving.py | 7 +++- benchmarks/benchmark_throughput.py | 4 +-- .../cutlass_benchmarks/w8a8_benchmarks.py | 3 +- benchmarks/kernels/benchmark_aqlm.py | 4 +-- benchmarks/kernels/benchmark_moe.py | 3 +- .../kernels/benchmark_paged_attention.py | 4 +-- benchmarks/kernels/benchmark_rope.py | 4 +-- benchmarks/overheads/benchmark_hashing.py | 4 +-- examples/aqlm_example.py | 5 ++- examples/llm_engine_example.py | 3 +- examples/save_sharded_state.py | 4 +-- examples/tensorize_vllm_model.py | 4 +-- tests/async_engine/api_server_async_engine.py | 5 ++- vllm/engine/arg_utils.py | 34 ++++++++++++++----- vllm/entrypoints/api_server.py | 5 ++- vllm/entrypoints/openai/cli_args.py | 5 +-- vllm/entrypoints/openai/run_batch.py | 6 ++-- .../model_executor/model_loader/tensorizer.py | 5 ++- 20 files changed, 70 insertions(+), 47 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e9d1048c89b64..f6ccfd19292e5 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -10,7 +10,7 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -120,7 +120,7 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') parser.add_argument('--model', type=str, default='facebook/opt-125m') diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 089966986984f..5d5ef213abb34 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ -import argparse import time from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. 
Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 @@ -44,7 +44,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance with or without automatic ' 'prefix caching.') parser.add_argument('--model', diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index eef03e7d81c39..92c7c3221e3d6 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -44,6 +44,11 @@ except ImportError: from backend_request_func import get_tokenizer +try: + from vllm.engine.arg_utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + @dataclass class BenchmarkMetrics: @@ -511,7 +516,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the online serving throughput.") parser.add_argument( "--backend", diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ed65002bc7d3c..ffd7398a453ba 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -10,7 +10,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -261,7 +261,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 5cc0fbbd49b8e..6b91dbd307350 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,6 +11,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.engine.arg_utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -293,7 +294,7 @@ def to_torch_dtype(dt): return torch.float8_e4m3fn raise ValueError("unsupported dtype") - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description=""" Benchmark Cutlass GEMM. 
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index ac6a9f297f95a..6efb206802252 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,3 @@ -import argparse import os import sys from typing import Optional @@ -7,6 +6,7 @@ import torch.nn.functional as F from vllm import _custom_ops as ops +from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) @@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: def main(): - parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments parser.add_argument("--nbooks", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 62347aaf8ed6d..ca6501f59ce35 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -9,6 +9,7 @@ from ray.experimental.tqdm_ray import tqdm from transformers import AutoConfig +from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.model_executor.layers.fused_moe.fused_moe import * @@ -315,7 +316,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 687e2369b758c..c0124966fbc06 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,3 @@ -import argparse import random import time from typing import List, Optional @@ -6,6 +5,7 @@ import torch from vllm import _custom_ops as ops +from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random NUM_BLOCKS = 1024 @@ -161,7 +161,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", type=str, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index a53c6c77a5828..47eeb41a1a088 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,10 +1,10 @@ -import argparse from itertools import accumulate from typing import List, Optional import nvtx import torch +from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) @@ -86,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the rotary embedding kernels.") parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index c846e47de1fcf..5d24e0af62320 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,8 +1,8 @@ -import argparse import 
cProfile import pstats from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" @@ -47,7 +47,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance of hashing function in' 'automatic prefix caching.') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 3a63003ab4ba9..02acc14f43696 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,11 +1,10 @@ -import argparse - from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import FlexibleArgumentParser def main(): - parser = argparse.ArgumentParser(description='AQLM examples') + parser = FlexibleArgumentParser(description='AQLM examples') parser.add_argument('--model', '-m', diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index a81c4b3e399c3..de3cf2a562de5 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -2,6 +2,7 @@ from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.engine.arg_utils import FlexibleArgumentParser def create_test_prompts() -> List[Tuple[str, SamplingParams]]: @@ -55,7 +56,7 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Demo on using the LLMEngine class directly') parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/examples/save_sharded_state.py b/examples/save_sharded_state.py index c595d98ba2750..e3dd00aa24f3c 100644 --- a/examples/save_sharded_state.py +++ b/examples/save_sharded_state.py @@ -20,15 +20,15 @@ tensor_parallel_size=8, ) """ -import argparse import dataclasses import os import shutil from pathlib import Path from vllm import LLM, EngineArgs +from vllm.engine.arg_utils import FlexibleArgumentParser -parser = argparse.ArgumentParser() +parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) parser.add_argument("--output", "-o", diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index f9ed5fe08988e..3364e58364a75 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -5,7 +5,7 @@ import uuid from vllm import LLM -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, TensorizerConfig, tensorize_vllm_model) @@ -96,7 +96,7 @@ def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="An example script that can be used to serialize and " "deserialize vLLM models. 
These models " "can be loaded using tensorizer directly to the GPU " diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1be76fdc8d868..778b20aca655a 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,12 +1,11 @@ """vllm.entrypoints.api_server with some extra logging for testing.""" -import argparse from typing import Any, Dict import uvicorn from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server -from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, FlexibleArgumentParser from vllm.engine.async_llm_engine import AsyncLLMEngine app = vllm.entrypoints.api_server.app @@ -33,7 +32,7 @@ def stats() -> Response: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7f760c2775f3d..f43a38b6658c7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,6 +1,7 @@ import argparse import dataclasses import json +import sys import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -20,6 +21,24 @@ def nullable_str(val: str): return val +class FlexibleArgumentParser(argparse.ArgumentParser): + """ArgumentParser that allows both underscore and dash in names.""" + + def parse_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + + # Convert underscores to dashes in argument names + processed_args = [] + for arg in args: + if arg.startswith('--'): + processed_args.append('--' + arg[2:].replace('_', '-')) + else: + processed_args.append(arg) + + return super().parse_args(processed_args, namespace) + + @dataclass class EngineArgs: """Arguments for vLLM engine.""" @@ -110,7 +129,7 @@ def __post_init__(self): @staticmethod def add_cli_args_for_vlm( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--image-input-type', type=nullable_str, default=None, @@ -156,8 +175,7 @@ def add_cli_args_for_vlm( return parser @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Shared CLI arguments for vLLM engine.""" # Model arguments @@ -800,8 +818,8 @@ class AsyncEngineArgs(EngineArgs): max_log_len: Optional[int] = None @staticmethod - def add_cli_args(parser: argparse.ArgumentParser, - async_args_only: bool = False) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser, + async_args_only: bool = False) -> FlexibleArgumentParser: if not async_args_only: parser = EngineArgs.add_cli_args(parser) parser.add_argument('--engine-use-ray', @@ -822,13 +840,13 @@ def add_cli_args(parser: argparse.ArgumentParser, # These functions are used by sphinx to build the documentation def _engine_args_parser(): - return EngineArgs.add_cli_args(argparse.ArgumentParser()) + return EngineArgs.add_cli_args(FlexibleArgumentParser()) def _async_engine_args_parser(): - return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(), + return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(), async_args_only=True) def _vlm_engine_args_parser(): 
- return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser()) + return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser()) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 075de0b4efb2d..310bb15f7d44b 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -6,7 +6,6 @@ change `vllm/entrypoints/openai/api_server.py` instead. """ -import argparse import json import ssl from typing import AsyncGenerator @@ -15,7 +14,7 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse -from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, FlexibleArgumentParser from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext @@ -80,7 +79,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 4c0cb1e4f3e49..6616931c8a2ed 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -8,7 +8,8 @@ import json import ssl -from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str +from vllm.engine.arg_utils import (AsyncEngineArgs, FlexibleArgumentParser, + nullable_str) from vllm.entrypoints.openai.serving_engine import LoRAModulePath @@ -23,7 +24,7 @@ def __call__(self, parser, namespace, values, option_string=None): def make_arg_parser(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser.add_argument("--host", type=nullable_str, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 2f18701870a6f..e59fc8879484d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,4 +1,3 @@ -import argparse import asyncio import sys from io import StringIO @@ -6,7 +5,8 @@ import aiohttp -from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str +from vllm.engine.arg_utils import (AsyncEngineArgs, FlexibleArgumentParser, + nullable_str) from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, @@ -23,7 +23,7 @@ def parse_args(): - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible batch runner.") parser.add_argument( "-i", diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index d79fedaea428e..24c3739780838 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -14,7 +14,7 @@ import vllm.envs as envs from vllm.config import ModelConfig, ParallelConfig -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( @@ -177,8 +177,7 @@ def __post_init__(self): self.deserializer_params['encryption'] = decryption_params @staticmethod - def 
add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Tensorizer CLI arguments""" # Tensorizer options arg group From 34c62cb4d383ee7d0c9310b6a7cd4ba656642fe7 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 20 Jun 2024 21:01:24 +0000 Subject: [PATCH 2/2] Move to vllm/utils --- benchmarks/benchmark_latency.py | 3 ++- benchmarks/benchmark_prefix_caching.py | 2 +- benchmarks/benchmark_serving.py | 2 +- benchmarks/benchmark_throughput.py | 3 ++- .../cutlass_benchmarks/w8a8_benchmarks.py | 2 +- benchmarks/kernels/benchmark_aqlm.py | 2 +- benchmarks/kernels/benchmark_marlin.py | 4 ++-- benchmarks/kernels/benchmark_moe.py | 2 +- .../kernels/benchmark_paged_attention.py | 4 ++-- benchmarks/kernels/benchmark_rope.py | 2 +- benchmarks/overheads/benchmark_hashing.py | 2 +- examples/aqlm_example.py | 2 +- examples/llm_engine_example.py | 2 +- examples/save_sharded_state.py | 2 +- examples/tensorize_vllm_model.py | 3 ++- tests/async_engine/api_server_async_engine.py | 3 ++- vllm/engine/arg_utils.py | 21 +------------------ vllm/entrypoints/api_server.py | 4 ++-- vllm/entrypoints/openai/cli_args.py | 4 ++-- vllm/entrypoints/openai/run_batch.py | 5 ++--- .../model_executor/model_loader/tensorizer.py | 3 ++- vllm/utils.py | 19 +++++++++++++++++ 22 files changed, 50 insertions(+), 46 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f6ccfd19292e5..a4cf0632b7790 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -10,9 +10,10 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 5d5ef213abb34..395107a5ec747 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ import time from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. 
Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 92c7c3221e3d6..42867fc40edd2 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -45,7 +45,7 @@ from backend_request_func import get_tokenizer try: - from vllm.engine.arg_utils import FlexibleArgumentParser + from vllm.utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ffd7398a453ba..2c6beb4e89672 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -10,8 +10,9 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser +from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def sample_requests( diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 6b91dbd307350..377f8683c021f 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,7 +11,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops -from vllm.engine.arg_utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 6efb206802252..601c4ea439aea 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -6,10 +6,10 @@ import torch.nn.functional as F from vllm import _custom_ops as ops -from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) +from vllm.utils import FlexibleArgumentParser os.environ['CUDA_VISIBLE_DEVICES'] = '0' diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 96f01967b351e..261f5829631ee 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,3 @@ -import argparse from typing import List import torch @@ -16,6 +15,7 @@ MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -211,7 +211,7 @@ def main(args): # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( 
description="Benchmark Marlin across specified models/shapes/batches") parser.add_argument( "--models", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index ca6501f59ce35..e00696d6d43cb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -9,8 +9,8 @@ from ray.experimental.tqdm_ray import tqdm from transformers import AutoConfig -from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index c0124966fbc06..16de60477c305 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -5,8 +5,8 @@ import torch from vllm import _custom_ops as ops -from vllm.engine.arg_utils import FlexibleArgumentParser -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 47eeb41a1a088..78736c7a7ba6f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -4,9 +4,9 @@ import nvtx import torch -from vllm.engine.arg_utils import FlexibleArgumentParser from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index 5d24e0af62320..203699e9a8d06 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -2,7 +2,7 @@ import pstats from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" 
diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py index 02acc14f43696..40f9a21ec9e51 100644 --- a/examples/aqlm_example.py +++ b/examples/aqlm_example.py @@ -1,5 +1,5 @@ from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser def main(): diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index de3cf2a562de5..ca41f32b12b31 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -2,7 +2,7 @@ from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams -from vllm.engine.arg_utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser def create_test_prompts() -> List[Tuple[str, SamplingParams]]: diff --git a/examples/save_sharded_state.py b/examples/save_sharded_state.py index e3dd00aa24f3c..4207f8922403b 100644 --- a/examples/save_sharded_state.py +++ b/examples/save_sharded_state.py @@ -26,7 +26,7 @@ from pathlib import Path from vllm import LLM, EngineArgs -from vllm.engine.arg_utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index 3364e58364a75..dd77a4ad0c6b7 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -5,10 +5,11 @@ import uuid from vllm import LLM -from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser +from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, TensorizerConfig, tensorize_vllm_model) +from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring # yapf: disable diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 778b20aca655a..495a123c351d7 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -5,8 +5,9 @@ from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server -from vllm.engine.arg_utils import AsyncEngineArgs, FlexibleArgumentParser +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.utils import FlexibleArgumentParser app = vllm.entrypoints.api_server.app diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f43a38b6658c7..ef31612420c94 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,7 +1,6 @@ import argparse import dataclasses import json -import sys import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -12,7 +11,7 @@ SpeculativeConfig, TokenizerPoolConfig, VisionLanguageConfig) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import str_to_int_tuple +from vllm.utils import FlexibleArgumentParser, str_to_int_tuple def nullable_str(val: str): @@ -21,24 +20,6 @@ def nullable_str(val: str): return val -class FlexibleArgumentParser(argparse.ArgumentParser): - """ArgumentParser that allows both underscore and dash in names.""" - - def parse_args(self, args=None, namespace=None): - if args is None: - args = sys.argv[1:] - - # Convert underscores to dashes in argument names - processed_args = [] - for arg in args: - if arg.startswith('--'): - processed_args.append('--' + arg[2:].replace('_', 
'-')) - else: - processed_args.append(arg) - - return super().parse_args(processed_args, namespace) - - @dataclass class EngineArgs: """Arguments for vLLM engine.""" diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 310bb15f7d44b..feb904c5a13c9 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -14,11 +14,11 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse -from vllm.engine.arg_utils import AsyncEngineArgs, FlexibleArgumentParser +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. app = FastAPI() diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 6616931c8a2ed..59ad73bf097c8 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -8,9 +8,9 @@ import json import ssl -from vllm.engine.arg_utils import (AsyncEngineArgs, FlexibleArgumentParser, - nullable_str) +from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.utils import FlexibleArgumentParser class LoRAParserAction(argparse.Action): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index e59fc8879484d..488ac89710b6d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -5,8 +5,7 @@ import aiohttp -from vllm.engine.arg_utils import (AsyncEngineArgs, FlexibleArgumentParser, - nullable_str) +from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, @@ -16,7 +15,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 24c3739780838..b009ad8c882d4 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -14,13 +14,14 @@ import vllm.envs as envs from vllm.config import ModelConfig, ParallelConfig -from vllm.engine.arg_utils import EngineArgs, FlexibleArgumentParser +from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None diff --git a/vllm/utils.py b/vllm/utils.py index ffe921e650aaa..27a7b1042d88f 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,3 +1,4 @@ +import argparse import asyncio import datetime import enum @@ -775,3 +776,21 @@ def wrapper(*args, **kwargs) -> Any: wrapper.has_run = False # type: ignore[attr-defined] return wrapper + + +class 
FlexibleArgumentParser(argparse.ArgumentParser):
+    """ArgumentParser that allows both underscore and dash in names."""
+
+    def parse_args(self, args=None, namespace=None):
+        if args is None:
+            args = sys.argv[1:]
+
+        # Convert underscores to dashes in argument names
+        processed_args = []
+        for arg in args:
+            if arg.startswith('--'):
+                processed_args.append('--' +
+                                      arg[len('--'):].replace('_', '-'))
+            else:
+                processed_args.append(arg)
+
+        return super().parse_args(processed_args, namespace)
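
A quick standalone check of the new parser's behavior (the class body is copied from the final vllm/utils.py hunk above; the option names and values below are invented for illustration, not taken from vLLM):

    import argparse
    import sys

    class FlexibleArgumentParser(argparse.ArgumentParser):
        """ArgumentParser that allows both underscore and dash in names."""

        def parse_args(self, args=None, namespace=None):
            if args is None:
                args = sys.argv[1:]
            # Rewrites everything after the leading '--', including any
            # '=value' suffix; see the note below the asserts.
            processed_args = []
            for arg in args:
                if arg.startswith('--'):
                    processed_args.append('--' +
                                          arg[len('--'):].replace('_', '-'))
                else:
                    processed_args.append(arg)
            return super().parse_args(processed_args, namespace)

    parser = FlexibleArgumentParser()
    parser.add_argument('--max-model-len', type=int)
    parser.add_argument('--model', type=str)

    # Both spellings resolve to the same destination attribute.
    assert parser.parse_args(['--max_model_len', '1024']).max_model_len == 1024
    assert parser.parse_args(['--max-model-len', '1024']).max_model_len == 1024

    # Space-separated values pass through untouched...
    assert parser.parse_args(['--model', 'my_org/m_7b']).model == 'my_org/m_7b'
    # ...but values joined with '=' are rewritten along with the option name:
    assert parser.parse_args(['--model=my_org/m_7b']).model == 'my-org/m-7b'

The last assert shows the one edge case the normalization does not special-case: for the '--opt=value' form, replace() runs over the value as well as the option name, so underscores inside values (model paths, file names) get rewritten too. A possible refinement, sketched here as a suggestion rather than taken from either commit, is to split on '=' before normalizing:

    def _normalize(arg: str) -> str:
        """Rewrite only the option name, leaving any '=value' part alone."""
        if not arg.startswith('--'):
            return arg
        key, sep, value = arg.partition('=')
        return '--' + key[len('--'):].replace('_', '-') + sep + value

Normalizing once in parse_args(), rather than registering dash and underscore aliases for every option, keeps the change to a single small override and automatically covers arguments added later by EngineArgs.add_cli_args and the standalone benchmark scripts.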