From 6542f32ed109c0a35617ec2fb1b1f609c66a74f4 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 1 Aug 2025 19:53:29 -0700
Subject: [PATCH 1/3] docs: remove deprecated disable-log-requests flag

---
 benchmarks/README.md | 10 +++++-----
 benchmarks/auto_tune/auto_tune.sh | 1 -
 benchmarks/benchmark_serving.py | 3 +--
 benchmarks/benchmark_serving_structured_output.py | 2 +-
 docs/models/supported_models.md | 2 +-
 examples/online_serving/prometheus_grafana/README.md | 3 +--
 .../disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 2 --
 7 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 644517235b12..d6442a4fc387 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -91,7 +91,7 @@ become available.
 First start serving your model

 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```

 Then run the benchmarking script
@@ -146,7 +146,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you

 ```bash
 # start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
 ```

 ```bash
@@ -171,7 +171,7 @@ You can skip applying chat template if your data already has it by using `--cust

 ```bash
 # need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```

 ```bash
@@ -205,7 +205,7 @@ vllm bench serve \
 ### Other HuggingFaceDataset Examples

 ```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```

 `lmms-lab/LLaVA-OneVision-Data`:
@@ -430,7 +430,7 @@ Benchmark the performance of structured output generation (JSON, grammar, regex)
 ### Server Setup

 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```

 ### JSON Schema Benchmark
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 3cd8580e065d..df26376504b9 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -60,7 +60,6 @@ start_server() {
   pkill -f vllm

   VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
-    --disable-log-requests \
     --port 8004 \
     --gpu-memory-utilization $gpu_memory_utilization \
     --max-num-seqs $max_num_seqs \
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 3affa18ae3a4..93b72211eb33 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -5,8 +5,7 @@
 On the server side, run one of the following commands:
     vLLM OpenAI API server
     vllm serve \
-        --swap-space 16 \
-        --disable-log-requests
+        --swap-space 16

 On the client side, run:
     python benchmarks/benchmark_serving.py \
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 2a22f122c78e..ca6843a72aa3 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -4,7 +4,7 @@

 On the server side, run one of the following commands:
     (vLLM OpenAI API server)
-    vllm serve --disable-log-requests
+    vllm serve

 On the client side, run:
     python benchmarks/benchmark_serving_structured_output.py \
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 56c77a1e5f11..bd7a57b43621 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -255,7 +255,7 @@ export https_proxy=http://your.proxy.server:port
 https_proxy=http://your.proxy.server:port huggingface-cli download

 # or use vllm cmd directly
-https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests
+https_proxy=http://your.proxy.server:port vllm serve
 ```

 - Set the proxy in Python interpreter:
diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md
index 7c4e649e6d02..5cd4dab5a8fa 100644
--- a/examples/online_serving/prometheus_grafana/README.md
+++ b/examples/online_serving/prometheus_grafana/README.md
@@ -13,8 +13,7 @@ Prometheus metric logging is enabled by default in the OpenAI-compatible server.

 ```bash
 vllm serve mistralai/Mistral-7B-v0.1 \
-    --max-model-len 2048 \
-    --disable-log-requests
+    --max-model-len 2048
 ```

 Launch Prometheus and Grafana servers with `docker compose`:
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
index 5719fa821292..1284466a4558 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
@@ -28,7 +28,6 @@ if [[ $1 == "prefiller" ]]; then
     CUDA_VISIBLE_DEVICES=0 \
         vllm serve $MODEL \
         --port 8100 \
-        --disable-log-requests \
         --enforce-eager \
         --kv-transfer-config \
         '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
@@ -46,7 +45,6 @@ elif [[ $1 == "decoder" ]]; then
     CUDA_VISIBLE_DEVICES=1 \
         vllm serve $MODEL \
         --port 8200 \
-        --disable-log-requests \
         --enforce-eager \
         --kv-transfer-config \
         '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'

From 30f4ea4a79148590aa59e38638591773ab927d09 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 1 Aug 2025 20:02:44 -0700
Subject: [PATCH 2/3] update

Signed-off-by: Roger Wang
---
 .buildkite/scripts/tpu/run_bm.sh | 1 -
 tests/entrypoints/openai/correctness/test_lmeval.py | 2 +-
 tests/entrypoints/openai/test_chunked_prompt.py | 2 --
 tests/models/quantization/test_bitsandbytes.py | 1 -
 tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh | 2 --
 tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh | 2 --
 .../nixl_integration/run_tpu_disagg_accuracy_test.sh | 3 ---
 .../kv_connector/nixl_integration/run_tpu_edge_case_test.sh | 2 --
 tests/v1/sample/test_logprobs_e2e.py | 2 +-
 vllm/utils/__init__.py | 5 +++--
 10 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index beecaf7a740a..b1e17b438578 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -44,7 +44,6 @@ echo

 VLLM_USE_V1=1 vllm serve $MODEL \
   --seed 42 \
-  --disable-log-requests \
   --max-num-seqs $MAX_NUM_SEQS \
   --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
   --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py
index a07a147cdc2b..d75731637d28 100644
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -22,7 +22,7 @@
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.54
-DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] +DEFAULT_ARGS = ["--max-model-len", "4096"] MORE_ARGS_LIST = [ [], # Default ["--enable-chunked-prefill"], # Chunked diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 3c8ed955a65a..c8160c5f2d0e 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -26,8 +26,6 @@ def server(): "--enable-chunked-prefill", "--max-num-batched-tokens", "1000", - # large prompts create a lot of output - "--disable-log-requests", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index 8cb269d7e949..e0e919b62b21 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -102,7 +102,6 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_pp_4bit_bnb_model(model_name, description) -> None: common_args = [ "--disable-log-stats", - "--disable-log-requests", "--dtype", "bfloat16", "--enable-prefix-caching", diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index b48655d80eef..9322410ec99e 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -88,7 +88,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" @@ -121,7 +120,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --tensor-parallel-size $DECODER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh index 98903a176e28..b64461292910 100644 --- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh @@ -57,7 +57,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \ --port $PREFILL_PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" @@ -76,7 +75,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \ --port $DECODE_PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh index 45779d16914f..ea125f99fc42 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh @@ 
@@ -63,7 +63,6 @@ launch_baseline() {
     --seed 42 \
     --block-size ${BLOCK_SIZE} \
     --gpu-memory-utilization 0.5 \
-    --disable-log-requests \
     --enforce-eager"
   echo ${BASELINE_BASE_CMD}
   ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" &
@@ -87,7 +86,6 @@ launch_pd() {
     --block-size ${BLOCK_SIZE} \
     --enforce-eager \
     --gpu-memory-utilization 0.5 \
-    --disable-log-requests \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"


@@ -106,7 +104,6 @@
     --block-size ${BLOCK_SIZE} \
     --enforce-eager \
     --gpu-memory-utilization 0.5 \
-    --disable-log-requests \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

   echo ${PREFILL_BASE_CMD}
diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
index c37c92fdf5d3..8ba653770c4f 100644
--- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
@@ -68,7 +68,6 @@ launch_pd() {
     --block-size ${BLOCK_SIZE} \
     --enforce-eager \
     --gpu-memory-utilization 0.5 \
-    --disable-log-requests \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"


@@ -87,7 +86,6 @@
     --block-size ${BLOCK_SIZE} \
     --enforce-eager \
     --gpu-memory-utilization 0.5 \
-    --disable-log-requests \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

   echo ${PREFILL_BASE_CMD}
diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py
index 50b14a15dc16..7f41355ff7ce 100644
--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
@@ -15,7 +15,7 @@
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8"  # noqa: E501
 SERVER_ARGS = [
-    "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests",
+    "--enforce_eager", "--no_enable_prefix_caching",
     "--gpu-memory-utilization=0.8"
 ]
 NUM_CONCURRENT = 100
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 0d3fa6b059be..b14b03a3a2d1 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1672,8 +1672,9 @@ def parse_known_args(self, args=None, namespace=None):
             # Special case warning because the warning below won't trigger
             # if –-disable-log-requests because its value is default.
             logger.warning_once(
-                "argument '--disable-log-requests' is deprecated. This "
-                "will be removed in v0.12.0.")
+                "argument '--disable-log-requests' is deprecated and "
+                "replaced with '--enable-log-requests'. This will be "
This will be " + "removed in v0.12.0.") namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if (hasattr(namespace, dest := action.dest) From 354244c1fae64642b0536b01825961fee7affcaf Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 1 Aug 2025 20:05:03 -0700 Subject: [PATCH 3/3] remove Signed-off-by: Roger Wang --- docs/design/p2p_nccl_connector.md | 8 -------- .../disagg_example_p2p_nccl_xpyd.sh | 2 -- 2 files changed, 10 deletions(-) diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md index 94af8bedd24d..adf838306bc7 100644 --- a/docs/design/p2p_nccl_connector.md +++ b/docs/design/p2p_nccl_connector.md @@ -109,7 +109,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` @@ -131,7 +130,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` @@ -153,7 +151,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` @@ -175,7 +172,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` @@ -206,7 +202,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` @@ -228,7 +223,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` @@ -250,7 +244,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` @@ -272,7 
@@ -272,7 +265,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.7 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
 ```
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
index 568f7a43b496..7b0b12bb34d2 100644
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -178,7 +178,6 @@ main() {
       --max-num-seqs 256 \
       --trust-remote-code \
       --gpu-memory-utilization 0.9 \
-      --disable-log-request \
       --kv-transfer-config \
       "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
     PIDS+=($!)
@@ -207,7 +206,6 @@ main() {
       --max-num-seqs 256 \
       --trust-remote-code \
       --gpu-memory-utilization 0.7 \
-      --disable-log-request \
       --kv-transfer-config \
       "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
     PIDS+=($!)