
Commit

code clean
Signed-off-by: Ning Wang <n.wang.chn@hotmail.com>
nwangfw committed Feb 25, 2025
1 parent df0d3e1 commit 5791f68
Showing 3 changed files with 76 additions and 113 deletions.
49 changes: 11 additions & 38 deletions python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -18,8 +18,9 @@
PATH_PREFIX=`dirname "$0"`
FILE_NAME="result"
MODEL="llama2-7b"
TEMPERATURE=0.0

TOTAL=100
TOTAL=100
# TODO: Set your preferred request sizes and rates here.
input_start=4
input_limit=$((2**11)) # 2K
@@ -30,10 +31,6 @@ rate_limit=$((2**6)) # 64
workload=
dry_run=0

debug_print() {
echo "[DEBUG] $1"
}


# Function to generate workload for specific input/output lengths
generate_workload() {
@@ -48,19 +45,8 @@ generate_workload() {
echo " api_key: $api_key"
echo " num_prompts: $num_prompts"
echo " model: $model"
echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model"

echo "python $PATH_PREFIX/gen_benchmark_prompt.py \
$workload \
--input-tokens \"$input_len\" \
--min-output-tokens \"$output_len\" \
--tolerance \"0.2\" \
--qps \"2.0\" \
--host \"localhost\" \
--port \"8010\" \
--api-key \"$api_key\" \
--total-prompts \"$num_prompts\" \
--model \"$model\""
echo " temperature: $TEMPERATURE"
echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE"

python $PATH_PREFIX/gen_benchmark_prompt.py \
$workload \
@@ -72,15 +58,14 @@ generate_workload() {
--port "8010" \
--api-key "$api_key" \
--total-prompts "$num_prompts" \
--model "$model"
--model "$model" \
--temperature "$TEMPERATURE"
}

while [[ $# -gt 0 ]]; do
debug_print "Processing argument: $1"
case "$1" in
-m|--model)
MODEL=$2
debug_print "Set MODEL to: $MODEL"
shift 2
;;
-o|--output)
@@ -89,32 +74,26 @@ while [[ $# -gt 0 ]]; do
;;
--input-start)
input_start=$2
debug_print "Set input_start to: $input_start"
shift 2
;;
--input-limit)
input_limit=$2
debug_print "Set input_limit to: $input_limit"
shift 2
;;
--output-start)
output_start=$2
debug_print "Set output_start to: $output_start"
shift 2
;;
--output-limit)
output_limit=$2
debug_print "Set output_limit to: $output_limit"
shift 2
;;
--rate-start)
rate_start=$2
debug_print "Set rate_start to: $rate_start"
shift 2
;;
--rate-limit)
rate_limit=$2
debug_print "Set rate_limit to: $rate_limit"
shift 2
;;
--dry-run)
@@ -123,12 +102,14 @@ while [[ $# -gt 0 ]]; do
;;
--api-key)
LLM_API_KEY=$2
debug_print "Set LLM_API_KEY to: $LLM_API_KEY"
shift 2
;;
--temperature)
TEMPERATURE=$2
shift 2
;;
--workload)
workload="--workload_dataset_file $2"
debug_print "Set workload to: $workload"
shift 2
;;
# *)
@@ -138,14 +119,6 @@ while [[ $# -gt 0 ]]; do
esac
done

# Add debug prints for initial parameters
debug_print "Initial parameters:"
debug_print "MODEL: $MODEL"
debug_print "Input range: $input_start to $input_limit"
debug_print "Output range: $output_start to $output_limit"
debug_print "Rate range: $rate_start to $rate_limit"
debug_print "Workload file: $workload"


# Make sure the directory exists and clear output file
OUTPUT_FILE="${PATH_PREFIX}/result/${FILE_NAME}.jsonl"
@@ -186,7 +159,7 @@ while [[ $input_len -le $input_limit ]]; do

WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
if [[ -f "$WORKLOAD_FILE" ]]; then
python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --workload_dataset_file $WORKLOAD_FILE >> "$OUTPUT_FILE"
python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
fi
req_rate=$((req_rate * 2))
done
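
For orientation, here is a minimal Python sketch of the sweep this script drives: input length, output length, and request rate each start at a configured value and double until they exceed their limit (the visible `req_rate=$((req_rate * 2))` step). Only input_start=4, input_limit=2**11, and rate_limit=2**6 appear in this diff; the output range and rate start below are placeholders, not the script's actual defaults.

def doubling_range(start: int, limit: int):
    """Yield start, 2*start, 4*start, ... up to and including limit."""
    value = start
    while value <= limit:
        yield value
        value *= 2

for input_len in doubling_range(4, 2**11):        # --input-start / --input-limit
    for output_len in doubling_range(4, 2**9):    # placeholder output range
        for req_rate in doubling_range(1, 2**6):  # placeholder start, --rate-limit
            # At each point benchmark.sh generates a prompt file and calls
            # gpu_benchmark.py, appending the JSONL trace to result/result.jsonl.
            pass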
python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
@@ -10,7 +10,7 @@

def get_tokenizer(
pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Any: # Changed return type to Any since we're returning a different tokenizer type
) -> Any:
"""Get tiktoken tokenizer."""
try:
# Use cl100k_base for ChatGPT-style models
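Only the signature and the cl100k_base comment are visible in this hunk; as context, a minimal sketch of what a tiktoken-based get_tokenizer could look like under that assumption (the body here is illustrative, not the repository's implementation):

from typing import Any

import tiktoken  # assumed dependency, implied by the "tiktoken tokenizer" docstring


def get_tokenizer(pretrained_model_name_or_path: str, trust_remote_code: bool) -> Any:
    """Get tiktoken tokenizer (trust_remote_code is unused for tiktoken)."""
    try:
        # Use cl100k_base for ChatGPT-style models.
        return tiktoken.get_encoding("cl100k_base")
    except Exception:
        # Fall back to the encoding registered for the named model, if any.
        return tiktoken.encoding_for_model(pretrained_model_name_or_path)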
@@ -115,7 +115,7 @@ def find_matching_prompts(self, target_input_tokens: int, min_output_tokens: int
# Sort candidates by input difference
candidates.sort(key=lambda x: x[2])

# If max_candidates is not specified, use all candidates
# If max_candidates is not specified, use all candidates; otherwise keep only the first max_candidates candidates
if max_candidates is not None:
candidates = candidates[:max_candidates]

@@ -131,7 +131,7 @@ def find_matching_prompts(self, target_input_tokens: int, min_output_tokens: int

if output_tokens and output_tokens >= min_output_tokens:
matching_prompts.append((prompt, input_tokens, output_tokens, response_data))
break # We found our match, we can stop
break  # Match found; stop searching

print("-" * 80)

@@ -148,7 +148,6 @@ def save_results(self, matching_prompts: List[Tuple[str, int, int, Dict]],

# Get the directory where the script is located
script_dir = os.path.dirname(os.path.abspath(__file__))

# Create the prompts directory relative to the script location
prompts_dir = os.path.join(script_dir, "result", "prompts")
os.makedirs(prompts_dir, exist_ok=True)
@@ -162,7 +161,7 @@ def save_results(self, matching_prompts: List[Tuple[str, int, int, Dict]],
for prompt, input_tokens, output_tokens, response_data in matching_prompts:
for i in range(self.total_prompts):
benchmark_format.append({
"Timestamp": base_timestamp + (i * 1000), # Increment by 1000 for each prompt
"Timestamp": base_timestamp + (i * 1000),
"Requests": [{
"Prompt": prompt,
"Prompt Length": input_tokens,
python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
@@ -195,70 +195,53 @@ async def send_request(
ts = datetime.now(timezone.utc)
timeout = aiohttp.ClientTimeout(total=3 * 3600)
status_code = None
error_msg = None
token_latencies = [] # Initialize here
time_to_first = 0.0 # Initialize here

try:
async with aiohttp.ClientSession(timeout=timeout) as session:
while True:
async with session.post(api_url, headers=headers, json=pload) as response:
status_code = response.status
response_status = "success" if status_code == 200 else "failed"

# Capture error response for non-200 status codes
if status_code != 200:
error_response = await response.text()
try:
error_json = json.loads(error_response)
error_msg = error_json.get('error', error_response)
except:
error_msg = error_response
print_err(f"Request {idx} failed with status {status_code}: {error_msg}")
break

chunks = []
previous_token_time = time.perf_counter()
first = True

try:
if streaming:
async for chunk, _ in response.content.iter_chunks():
chunks = [chunk]
now_time = time.perf_counter()
if first:
time_to_first = now_time - previous_token_time
first = False
else:
token_latencies.append(now_time - previous_token_time)
previous_token_time = now_time
# Stream off: Chunks are full response.
# chunks.append(chunk)

output = b"".join(chunks).decode("utf-8")
santicized = output.rstrip("\n\t ")
else:
time_to_first = time.perf_counter() - previous_token_time
output = await response.text()
santicized = output
except Exception as e:
error_msg = f"Failed to read response: {str(e)}"
print_err(f"Failed to read response for request {idx}: {e}")
break

async with aiohttp.ClientSession(timeout=timeout) as session:
while True:
# print(f"Sending request: {api_url}:{pload}")
async with session.post(api_url, headers=headers, json=pload) as response:
status_code = response.status
chunks = []
token_latencies = []
previous_token_time = time.perf_counter()
first = True
try:
ret = load_response(santicized)
if "error" not in ret:
break
error_msg = f"API error: {ret.get('error', 'Unknown error')}"
if streaming:
async for chunk, _ in response.content.iter_chunks():
# Stream on: Each chunk in the response is the full response so far
chunks = [chunk]

now_time = time.perf_counter()
if first:
time_to_first = now_time - previous_token_time
first = False
else:
token_latencies.append(now_time - previous_token_time)
previous_token_time = now_time

# Stream off: Chunks are full response.
# chunks.append(chunk)

output = b"".join(chunks).decode("utf-8")
santicized = output.rstrip(
"\n\t "
) # Remove trailing whitespace characters including EOF, and "[DONE]"
else:
time_to_first = time.perf_counter() - previous_token_time
output = await response.text()
santicized = output
except Exception as e:
error_msg = f"Failed to parse response: {str(e)}"
print_err(f"Invalid response for request {idx}: {santicized}: {e}")
print_err(f"Failed to read response for request {idx}: {e}")
break
try:
ret = load_response(santicized)

# Re-send the request if it failed.
if "error" not in ret:
break
except Exception as e:
# It's OK if parsing fails; the santicized output could be JSONL, another format, or an internal error.
print_err(f"Invalid response for request {idx}: {santicized}: {e}")
return
except Exception as e:
# It's OK if parsing fails; the santicized output could be JSONL, another format, or an internal error.
print_err(f"Invalid response for request {idx}: {santicized}: {e}")
break

request_end_time = time.perf_counter()
request_latency = request_end_time - request_start_time
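
Because removed and retained lines are interleaved above, here is a hedged, consolidated sketch of the retained streaming timing path: the gap before the first chunk is recorded as time-to-first-token, later inter-chunk gaps become token latencies. The payload, URL, and header names are placeholders, not the script's actual call site.

import time
import aiohttp

async def timed_streaming_post(session: aiohttp.ClientSession, api_url: str,
                               headers: dict, pload: dict):
    token_latencies: list = []
    time_to_first = 0.0
    async with session.post(api_url, headers=headers, json=pload) as response:
        chunks = []
        previous_token_time = time.perf_counter()
        first = True
        async for chunk, _ in response.content.iter_chunks():
            # Stream on: each chunk is the full response so far.
            chunks = [chunk]
            now_time = time.perf_counter()
            if first:
                time_to_first = now_time - previous_token_time
                first = False
            else:
                token_latencies.append(now_time - previous_token_time)
            previous_token_time = now_time
        output = b"".join(chunks).decode("utf-8")
    # Strip trailing whitespace/EOF markers before JSON parsing.
    return output.rstrip("\n\t "), time_to_first, token_latencies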
@@ -272,18 +255,15 @@
else len(token_latencies) + 1,
"timestamp": ts.strftime("%Y-%m-%d %H:%M:%S %Z%z"),
"E2E": request_latency,
"status_code": status_code,
"success": status_code == 200 if status_code else False,
# "request_payload": pload
"status_code": status_code, # Add status code to trace
"success": status_code == 200 if status_code else False # Add success flag
}
if error_msg:
request_trace["error"] = error_msg
if len(token_latencies) > 0:
request_trace["TTFT"] = time_to_first
request_trace["TPOT_mean"] = np.mean(token_latencies)
request_trace["TPOT_P50"] = np.percentile(token_latencies, 50)
request_trace["TPOT_P90"] = np.percentile(token_latencies, 90)
request_trace["TPOT_P99"] = np.percentile(token_latencies, 99)
request_trace["TPOT_mean"] = np.mean(token_latencies) # type: ignore
request_trace["TPOT_P50"] = np.percentile(token_latencies, 50) # type: ignore
request_trace["TPOT_P90"] = np.percentile(token_latencies, 90) # type: ignore
request_trace["TPOT_P99"] = np.percentile(token_latencies, 99) # type: ignore
print(json.dumps(request_trace))
REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
if len(token_latencies) > 0:
Expand Down Expand Up @@ -336,6 +316,10 @@ async def benchmark(


def main(args: argparse.Namespace):
# Set global temperature from args
global TEMPERATURE
TEMPERATURE = args.temperature

result = {}
if args.verbose:
print(args)
@@ -345,6 +329,7 @@ def main(args: argparse.Namespace):
result["request_rate"] = args.request_rate
result["seed"] = args.seed
result["model"] = args.model
result["temperature"] = args.temperature
result["samples"] = args.num_prompts

random.seed(args.seed)
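
The payload construction itself is not part of this diff, so the following is only a hedged sketch of how the module-level TEMPERATURE set in main() would typically be wired into each request; the field names follow OpenAI-style completion payloads and are assumptions here.

TEMPERATURE = 0.0  # overwritten in main(): TEMPERATURE = args.temperature

def build_payload(model: str, prompt: str, output_len: int, stream: bool) -> dict:
    return {
        "model": model,
        "prompt": prompt,
        "max_tokens": output_len,
        "temperature": TEMPERATURE,  # 0.0 keeps profiling runs deterministic
        "stream": stream,            # benchmark.sh passes --stream
    }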
@@ -524,5 +509,11 @@ def main(args: argparse.Namespace):
help="Path to a JSON file containing prompts",
)
parser.add_argument("--use-workload-interval", action="store_true")
parser.add_argument(
"--temperature",
type=float,
default=0.0,
help="Temperature for text generation (default: 0.0)",
)
args = parser.parse_args()
main(args)
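
Each request_trace above is printed as one JSON line, and benchmark.sh appends stdout to result/result.jsonl. A hedged post-processing sketch under those assumptions (the path and the skipping of non-trace lines are mine, not the repository's):

import json
import numpy as np

e2e, ttft = [], []
with open("result/result.jsonl") as f:
    for line in f:
        try:
            trace = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip non-JSON output mixed into the file
        if isinstance(trace, dict) and trace.get("success") and "E2E" in trace:
            e2e.append(trace["E2E"])
            if "TTFT" in trace:
                ttft.append(trace["TTFT"])

print("successful requests:", len(e2e))
if e2e:
    print("E2E p50/p99:", np.percentile(e2e, [50, 99]))
if ttft:
    print("TTFT mean:", np.mean(ttft))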
