
Commit

code clean
Signed-off-by: Ning Wang <n.wang.chn@hotmail.com>
nwangfw committed Feb 25, 2025
1 parent df0d3e1 commit 5791f68
Showing 3 changed files with 76 additions and 113 deletions.
49 changes: 11 additions & 38 deletions python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/benchmark.sh
@@ -18,8 +18,9 @@
PATH_PREFIX=`dirname "$0"`
FILE_NAME="result"
MODEL="llama2-7b"
TEMPERATURE=0.0

TOTAL=100
TOTAL=100
# TODO: Set your preferred request sizes and rates here.
input_start=4
input_limit=$((2**11)) # 2K
@@ -30,10 +31,6 @@ rate_limit=$((2**6)) # 64
workload=
dry_run=0

debug_print() {
echo "[DEBUG] $1"
}


# Function to generate workload for specific input/output lengths
generate_workload() {
@@ -48,19 +45,8 @@ generate_workload() {
echo " api_key: $api_key"
echo " num_prompts: $num_prompts"
echo " model: $model"
echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model"

echo "python $PATH_PREFIX/gen_benchmark_prompt.py \
$workload \
--input-tokens \"$input_len\" \
--min-output-tokens \"$output_len\" \
--tolerance \"0.2\" \
--qps \"2.0\" \
--host \"localhost\" \
--port \"8010\" \
--api-key \"$api_key\" \
--total-prompts \"$num_prompts\" \
--model \"$model\""
echo " temperature: $TEMPERATURE"
echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE"

python $PATH_PREFIX/gen_benchmark_prompt.py \
$workload \
@@ -72,15 +58,14 @@ generate_workload() {
--port "8010" \
--api-key "$api_key" \
--total-prompts "$num_prompts" \
--model "$model"
--model "$model" \
--temperature "$TEMPERATURE"
}

while [[ $# -gt 0 ]]; do
debug_print "Processing argument: $1"
case "$1" in
-m|--model)
MODEL=$2
debug_print "Set MODEL to: $MODEL"
shift 2
;;
-o|--output)
@@ -89,32 +74,26 @@ while [[ $# -gt 0 ]]; do
;;
--input-start)
input_start=$2
debug_print "Set input_start to: $input_start"
shift 2
;;
--input-limit)
input_limit=$2
debug_print "Set input_limit to: $input_limit"
shift 2
;;
--output-start)
output_start=$2
debug_print "Set output_start to: $output_start"
shift 2
;;
--output-limit)
output_limit=$2
debug_print "Set output_limit to: $output_limit"
shift 2
;;
--rate-start)
rate_start=$2
debug_print "Set rate_start to: $rate_start"
shift 2
;;
--rate-limit)
rate_limit=$2
debug_print "Set rate_limit to: $rate_limit"
shift 2
;;
--dry-run)
@@ -123,12 +102,14 @@ while [[ $# -gt 0 ]]; do
;;
--api-key)
LLM_API_KEY=$2
debug_print "Set LLM_API_KEY to: $LLM_API_KEY"
shift 2
;;
--temperature)
TEMPERATURE=$2
shift 2
;;
--workload)
workload="--workload_dataset_file $2"
debug_print "Set workload to: $workload"
shift 2
;;
# *)
@@ -138,14 +119,6 @@ while [[ $# -gt 0 ]]; do
esac
done

# Add debug prints for initial parameters
debug_print "Initial parameters:"
debug_print "MODEL: $MODEL"
debug_print "Input range: $input_start to $input_limit"
debug_print "Output range: $output_start to $output_limit"
debug_print "Rate range: $rate_start to $rate_limit"
debug_print "Workload file: $workload"


# Make sure the directory exists and clear output file
OUTPUT_FILE="${PATH_PREFIX}/result/${FILE_NAME}.jsonl"
@@ -186,7 +159,7 @@ while [[ $input_len -le $input_limit ]]; do

WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
if [[ -f "$WORKLOAD_FILE" ]]; then
python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --workload_dataset_file $WORKLOAD_FILE >> "$OUTPUT_FILE"
python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
fi
req_rate=$((req_rate * 2))
done
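
For orientation, here is a minimal Python sketch of the sweep this script drives: input length, output length, and request rate each start at a configured value and double until they exceed their limit (the visible `req_rate=$((req_rate * 2))` step). Only input_start=4, input_limit=2**11, and rate_limit=2**6 appear in this diff; the output range and rate start below are placeholders, not the script's actual defaults.

def doubling_range(start: int, limit: int):
    """Yield start, 2*start, 4*start, ... up to and including limit."""
    value = start
    while value <= limit:
        yield value
        value *= 2

for input_len in doubling_range(4, 2**11):        # --input-start / --input-limit
    for output_len in doubling_range(4, 2**9):    # placeholder output range
        for req_rate in doubling_range(1, 2**6):  # placeholder start, --rate-limit
            # At each point benchmark.sh generates a prompt file and calls
            # gpu_benchmark.py, appending the JSONL trace to result/result.jsonl.
            pass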
python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gen_benchmark_prompt.py
@@ -10,7 +10,7 @@

def get_tokenizer(
pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Any: # Changed return type to Any since we're returning a different tokenizer type
) -> Any:
"""Get tiktoken tokenizer."""
try:
# Use cl100k_base for ChatGPT-style models
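Only the signature and the cl100k_base comment are visible in this hunk; as context, a minimal sketch of what a tiktoken-based get_tokenizer could look like under that assumption (the body here is illustrative, not the repository's implementation):

from typing import Any

import tiktoken  # assumed dependency, implied by the "tiktoken tokenizer" docstring


def get_tokenizer(pretrained_model_name_or_path: str, trust_remote_code: bool) -> Any:
    """Get tiktoken tokenizer (trust_remote_code is unused for tiktoken)."""
    try:
        # Use cl100k_base for ChatGPT-style models.
        return tiktoken.get_encoding("cl100k_base")
    except Exception:
        # Fall back to the encoding registered for the named model, if any.
        return tiktoken.encoding_for_model(pretrained_model_name_or_path)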
@@ -115,7 +115,7 @@ def find_matching_prompts(self, target_input_tokens: int, min_output_tokens: int
# Sort candidates by input difference
candidates.sort(key=lambda x: x[2])

# If max_candidates is not specified, use all candidates
# If max_candidates is not specified, use all candidates; otherwise keep only the first max_candidates candidates
if max_candidates is not None:
candidates = candidates[:max_candidates]

@@ -131,7 +131,7 @@ def find_matching_prompts(self, target_input_tokens: int, min_output_tokens: int

if output_tokens and output_tokens >= min_output_tokens:
matching_prompts.append((prompt, input_tokens, output_tokens, response_data))
break # We found our match, we can stop
break  # Match found; stop searching

print("-" * 80)

@@ -148,7 +148,6 @@ def save_results(self, matching_prompts: List[Tuple[str, int, int, Dict]],

# Get the directory where the script is located
script_dir = os.path.dirname(os.path.abspath(__file__))

# Create the prompts directory relative to the script location
prompts_dir = os.path.join(script_dir, "result", "prompts")
os.makedirs(prompts_dir, exist_ok=True)
@@ -162,7 +161,7 @@ def save_results(self, matching_prompts: List[Tuple[str, int, int, Dict]],
for prompt, input_tokens, output_tokens, response_data in matching_prompts:
for i in range(self.total_prompts):
benchmark_format.append({
"Timestamp": base_timestamp + (i * 1000), # Increment by 1000 for each prompt
"Timestamp": base_timestamp + (i * 1000),
"Requests": [{
"Prompt": prompt,
"Prompt Length": input_tokens,
python/aibrix/aibrix/gpu_optimizer/optimizer/profiling/gpu_benchmark.py
@@ -195,70 +195,53 @@ async def send_request(
ts = datetime.now(timezone.utc)
timeout = aiohttp.ClientTimeout(total=3 * 3600)
status_code = None
error_msg = None
token_latencies = [] # Initialize here
time_to_first = 0.0 # Initialize here

try:
async with aiohttp.ClientSession(timeout=timeout) as session:
while True:
async with session.post(api_url, headers=headers, json=pload) as response:
status_code = response.status
response_status = "success" if status_code == 200 else "failed"

# Capture error response for non-200 status codes
if status_code != 200:
error_response = await response.text()
try:
error_json = json.loads(error_response)
error_msg = error_json.get('error', error_response)
except:
error_msg = error_response
print_err(f"Request {idx} failed with status {status_code}: {error_msg}")
break

chunks = []
previous_token_time = time.perf_counter()
first = True

try:
if streaming:
async for chunk, _ in response.content.iter_chunks():
chunks = [chunk]
now_time = time.perf_counter()
if first:
time_to_first = now_time - previous_token_time
first = False
else:
token_latencies.append(now_time - previous_token_time)
previous_token_time = now_time
# Stream off: Chunks are full response.
# chunks.append(chunk)

output = b"".join(chunks).decode("utf-8")
santicized = output.rstrip("\n\t ")
else:
time_to_first = time.perf_counter() - previous_token_time
output = await response.text()
santicized = output
except Exception as e:
error_msg = f"Failed to read response: {str(e)}"
print_err(f"Failed to read response for request {idx}: {e}")
break

async with aiohttp.ClientSession(timeout=timeout) as session:
while True:
# print(f"Sending request: {api_url}:{pload}")
async with session.post(api_url, headers=headers, json=pload) as response:
status_code = response.status
chunks = []
token_latencies = []
previous_token_time = time.perf_counter()
first = True
try:
ret = load_response(santicized)
if "error" not in ret:
break
error_msg = f"API error: {ret.get('error', 'Unknown error')}"
if streaming:
async for chunk, _ in response.content.iter_chunks():
# Stream on: Each chunk in the response is the full response so far
chunks = [chunk]

now_time = time.perf_counter()
if first:
time_to_first = now_time - previous_token_time
first = False
else:
token_latencies.append(now_time - previous_token_time)
previous_token_time = now_time

# Stream off: Chunks are full response.
# chunks.append(chunk)

output = b"".join(chunks).decode("utf-8")
santicized = output.rstrip(
"\n\t "
) # Remove trailing whitespace characters including EOF, and "[DONE]"
else:
time_to_first = time.perf_counter() - previous_token_time
output = await response.text()
santicized = output
except Exception as e:
error_msg = f"Failed to parse response: {str(e)}"
print_err(f"Invalid response for request {idx}: {santicized}: {e}")
print_err(f"Failed to read response for request {idx}: {e}")
break
try:
ret = load_response(santicized)

# Re-send the request if it failed.
if "error" not in ret:
break
except Exception as e:
# It's OK if parsing fails; the santicized output could be JSONL, another format, or an internal error.
print_err(f"Invalid response for request {idx}: {santicized}: {e}")
return
except Exception as e:
# It's OK if parsing fails; the santicized output could be JSONL, another format, or an internal error.
print_err(f"Invalid response for request {idx}: {santicized}: {e}")
break

request_end_time = time.perf_counter()
request_latency = request_end_time - request_start_time
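
Because removed and retained lines are interleaved above, here is a hedged, consolidated sketch of the retained streaming timing path: the gap before the first chunk is recorded as time-to-first-token, later inter-chunk gaps become token latencies. The payload, URL, and header names are placeholders, not the script's actual call site.

import time
import aiohttp

async def timed_streaming_post(session: aiohttp.ClientSession, api_url: str,
                               headers: dict, pload: dict):
    token_latencies: list = []
    time_to_first = 0.0
    async with session.post(api_url, headers=headers, json=pload) as response:
        chunks = []
        previous_token_time = time.perf_counter()
        first = True
        async for chunk, _ in response.content.iter_chunks():
            # Stream on: each chunk is the full response so far.
            chunks = [chunk]
            now_time = time.perf_counter()
            if first:
                time_to_first = now_time - previous_token_time
                first = False
            else:
                token_latencies.append(now_time - previous_token_time)
            previous_token_time = now_time
        output = b"".join(chunks).decode("utf-8")
    # Strip trailing whitespace/EOF markers before JSON parsing.
    return output.rstrip("\n\t "), time_to_first, token_latencies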
@@ -272,18 +255,15 @@
else len(token_latencies) + 1,
"timestamp": ts.strftime("%Y-%m-%d %H:%M:%S %Z%z"),
"E2E": request_latency,
"status_code": status_code,
"success": status_code == 200 if status_code else False,
# "request_payload": pload
"status_code": status_code, # Add status code to trace
"success": status_code == 200 if status_code else False # Add success flag
}
if error_msg:
request_trace["error"] = error_msg
if len(token_latencies) > 0:
request_trace["TTFT"] = time_to_first
request_trace["TPOT_mean"] = np.mean(token_latencies)
request_trace["TPOT_P50"] = np.percentile(token_latencies, 50)
request_trace["TPOT_P90"] = np.percentile(token_latencies, 90)
request_trace["TPOT_P99"] = np.percentile(token_latencies, 99)
request_trace["TPOT_mean"] = np.mean(token_latencies) # type: ignore
request_trace["TPOT_P50"] = np.percentile(token_latencies, 50) # type: ignore
request_trace["TPOT_P90"] = np.percentile(token_latencies, 90) # type: ignore
request_trace["TPOT_P99"] = np.percentile(token_latencies, 99) # type: ignore
print(json.dumps(request_trace))
REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
if len(token_latencies) > 0:
Expand Down Expand Up @@ -336,6 +316,10 @@ async def benchmark(


def main(args: argparse.Namespace):
# Set global temperature from args
global TEMPERATURE
TEMPERATURE = args.temperature

result = {}
if args.verbose:
print(args)
@@ -345,6 +329,7 @@ def main(args: argparse.Namespace):
result["request_rate"] = args.request_rate
result["seed"] = args.seed
result["model"] = args.model
result["temperature"] = args.temperature
result["samples"] = args.num_prompts

random.seed(args.seed)
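
The payload construction itself is not part of this diff, so the following is only a hedged sketch of how the module-level TEMPERATURE set in main() would typically be wired into each request; the field names follow OpenAI-style completion payloads and are assumptions here.

TEMPERATURE = 0.0  # overwritten in main(): TEMPERATURE = args.temperature

def build_payload(model: str, prompt: str, output_len: int, stream: bool) -> dict:
    return {
        "model": model,
        "prompt": prompt,
        "max_tokens": output_len,
        "temperature": TEMPERATURE,  # 0.0 keeps profiling runs deterministic
        "stream": stream,            # benchmark.sh passes --stream
    }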
@@ -524,5 +509,11 @@ def main(args: argparse.Namespace):
help="Path to a JSON file containing prompts",
)
parser.add_argument("--use-workload-interval", action="store_true")
parser.add_argument(
"--temperature",
type=float,
default=0.0,
help="Temperature for text generation (default: 0.0)",
)
args = parser.parse_args()
main(args)
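
Each request_trace above is printed as one JSON line, and benchmark.sh appends stdout to result/result.jsonl. A hedged post-processing sketch under those assumptions (the path and the skipping of non-trace lines are mine, not the repository's):

import json
import numpy as np

e2e, ttft = [], []
with open("result/result.jsonl") as f:
    for line in f:
        try:
            trace = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip non-JSON output mixed into the file
        if isinstance(trace, dict) and trace.get("success") and "E2E" in trace:
            e2e.append(trace["E2E"])
            if "TTFT" in trace:
                ttft.append(trace["TTFT"])

print("successful requests:", len(e2e))
if e2e:
    print("E2E p50/p99:", np.percentile(e2e, [50, 99]))
if ttft:
    print("TTFT mean:", np.mean(ttft))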
