vllm-project
diff --git a/‎.buildkite/check-wheel-size.py‎
Lines changed: 4 additions & 4 deletions b/‎.buildkite/check-wheel-size.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎.buildkite/nightly-benchmarks/nightly-descriptions.md‎
Lines changed: 1 addition & 1 deletion b/‎.buildkite/nightly-benchmarks/nightly-descriptions.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py‎
Lines changed: 1 addition & 1 deletion b/‎.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.buildkite/nightly-benchmarks/scripts/launch-server.sh‎
Lines changed: 2 additions & 6 deletions b/‎.buildkite/nightly-benchmarks/scripts/launch-server.sh‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh‎
Lines changed: 1 addition & 7 deletions b/‎.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh‎
Lines changed: 1 addition & 7 deletions
@@ -5,11 +5,11 @@
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 
 def print_top_10_largest_files(zip_file):
 
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
 
@@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]:
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
         )
 
     # get markdown tables
 
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   fi
 
@@ -365,8 +365,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
       $server_args"
 
     # run the server
@@ -455,11 +454,6 @@ main() {
   fi
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
Original file line number	Diff line number	Diff line change
`@@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]:`
`368`	`368`	`# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",`
`369`	`369`	`# we want to turn it into "8xGPUTYPE"`
`370`	`370`	`df["GPU"] = df["GPU"].apply(`
`371`		`- lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"`
	`371`	`+ lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"`
`372`	`372`	`)`
`373`	`373`
`374`	`374`	`# get markdown tables`