Skip to content

Commit

Permalink
record profiling data
Browse files Browse the repository at this point in the history
  • Loading branch information
mchen644 committed Dec 18, 2024
1 parent 3ea96cb commit 8ded0e4
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 22 deletions.
4 changes: 2 additions & 2 deletions benchmarks/1_serving_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ result_dir="${pwd}/result"
# scheduler_policy=(infer)
# swap_policies=(partial)
declare -a scheduler_swap_policies
scheduler_swap_policies[0]="tfittradeoff partial"
# scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[0]="tfittradeoff partial"
scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[2]="las full"
# scheduler_swap_policies[3]="tfittradeoff full"
# scheduler_swap_policies[4]="sjf full"
Expand Down
12 changes: 12 additions & 0 deletions vllm/core/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,9 @@ def __init__(
self.schedule_waiting_time = 0.0
self.schedule_swapped_time = 0.0

self.prefill_token_num = 0
self.decode_token_num = 0

# Motivation:
self.gpu_memory_iter = 0
self.gpu_computation_iter = 0
Expand Down Expand Up @@ -2031,6 +2034,15 @@ def _schedule_chunked_prefill(self):
self.swapped = remaining_swapped
self.swapped.extend(running_scheduled.swapped_out)
self.iter_nums += 1

self.prefill_token_num = 0
for s in (prefills.seq_groups+running_scheduled.prefill_seq_groups+swapped_in.prefill_seq_groups):
self.prefill_token_num += len(s.seq_group.prompt_token_ids)

self.decode_token_num = 0
for s in (running_scheduled.decode_seq_groups +
swapped_in.decode_seq_groups):
self.decode_token_num += len(s.seq_group.prompt_token_ids)

# Motivation:
# 1) GPU Memory:
Expand Down
30 changes: 23 additions & 7 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ async def step_async(
output = []

et = time.time()
execution_time_per_step = et - st
self.execution_time += et - st
throughput_iter = self.scheduler.gpu_computation_iter / (et - st)
self.total_iteration_time = self.execution_time - self.swap_time
Expand Down Expand Up @@ -288,17 +289,20 @@ async def step_async(
# print(f"Total schedule time: {self.schedule_time}, execution time: {self.execution_time}, handle output time: {self.handle_output_time}, swap time: {self.swap_time}, total iteration number is: {self.total_count}")
if self.scheduler.total_swap_in_seqs != 0:
logger.info(
"Total time: %.5f s, Total schedule time: %.5f s, execution time: %.5f s, "
"Total time: %.5f s, Total schedule time: %.5f s, Total execution time: %.5f s, per execution time: %.5f s "
"handle output time: %.5f s, swap time: %.5f s, "
"total iteration number: %d, "
"swap out block num: %d, swap out seq num: %d, "
"swap in block num: %d, swap in seq num: %d, low efficient swap out ratio: %.5f, "
"mean low efficient swap out extent: %.5f, mean swap-out seq waiting time: %.5f, "
"gpu memory iter: %.5f, gpu computation iter: %.5f, sort time: %.5f, "
"schedule running time: %.5f, schedule swapped time: %.5f, schedule prefill time: %.5f,"
"swap while time: %.5f, prefill while time: %.5f",self.schedule_time+self.execution_time+self.handle_output_time,
"swap while time: %.5f, prefill while time: %.5f, prefill token num: %d, decode token num: %d, total prepare input time: %.5f,"
"total compute logits time: %.5f, total sample time: %.5f"
,self.schedule_time+self.execution_time+self.handle_output_time,
self.schedule_time,
self.execution_time,
execution_time_per_step,
self.handle_output_time,
self.swap_time,
self.total_count,
Expand All @@ -316,20 +320,27 @@ async def step_async(
self.scheduler.schedule_swapped_time,
self.scheduler.schedule_waiting_time,
self.scheduler.swap_while,
self.scheduler.prefill_while)
self.scheduler.prefill_while,
self.scheduler.prefill_token_num,
self.scheduler.decode_token_num,
self.model_executor.driver_worker.model_runner.total_prepare_input_time,
self.model_executor.driver_worker.model_runner.total_compute_logits_time,
self.model_executor.driver_worker.model_runner.total_sample_time)
else:
logger.info(
"Total time: %.5f s, Total schedule time: %.5f s, execution time: %.5f s, "
"Total time: %.5f s, Total schedule time: %.5f s, Total execution time: %.5f s, per execution time: %.5f s"
"handle output time: %.5f s, swap time: %.5f s, "
"total iteration number: %d, "
"swap out block num: %d, swap out seq num: %d, "
"swap in block num: %d, swap in seq num: %d, low efficient swap out ratio: %.5f,"
"mean low efficient swap out extent: %.5f, mean swap-out seq waiting time: %.5f, "
"gpu memory iter: %.5f, gpu computation iter: %.5f, sort time: %.5f, "
"schedule running time: %.5f, schedule swapped time: %.5f, schedule prefill time: %.5f,"
"swap while time: %.5f, prefill while time: %.5f",self.schedule_time+self.execution_time+self.handle_output_time,
"swap while time: %.5f, prefill while time: %.5f, prefill token num: %d, decode token num: %d, total_prepare_input_time: %.5f,"
"total compute logits time: %.5f, total sample time: %.5f",
self.schedule_time+self.execution_time+self.handle_output_time,
self.schedule_time,
self.execution_time, self.handle_output_time, self.swap_time,
self.execution_time, execution_time_per_step, self.handle_output_time, self.swap_time,
self.total_count, self.scheduler.total_swap_out_blocks,
self.scheduler.total_swap_out_seqs,
self.scheduler.total_swap_in_blocks,
Expand All @@ -344,7 +355,12 @@ async def step_async(
self.scheduler.schedule_swapped_time,
self.scheduler.schedule_waiting_time,
self.scheduler.swap_while,
self.scheduler.prefill_while)
self.scheduler.prefill_while,
self.scheduler.prefill_token_num,
self.scheduler.decode_token_num,
self.model_executor.driver_worker.model_runner.total_prepare_input_time,
self.model_executor.driver_worker.model_runner.total_compute_logits_time,
self.model_executor.driver_worker.model_runner.total_sample_time)
return request_outputs

async def process_model_inputs_async(
Expand Down
26 changes: 13 additions & 13 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,8 +722,8 @@ def execute_model(
(input_tokens, input_positions, attn_metadata, sampling_metadata,
lora_requests, lora_mapping, multi_modal_kwargs
) = self.prepare_input_tensors(seq_group_metadata_list)
# et = time.time()
# self.total_prepare_input_time += et - st
et = time.time()
self.total_prepare_input_time += et - st

if self.lora_config:
self.set_active_loras(lora_requests, lora_mapping)
Expand All @@ -737,38 +737,38 @@ def execute_model(
else:
model_executable = self.model

# st = time.time()
st = time.time()
hidden_states = model_executable(
input_ids=input_tokens,
positions=input_positions,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
**multi_modal_kwargs,
)
# et = time.time()
# self.total_execute_time += et - st
et = time.time()
self.total_execute_time += et - st

# st = time.time()
st = time.time()
# Compute the logits.
logits = self.model.compute_logits(hidden_states, sampling_metadata)
# et = time.time()
# self.total_compute_logits_time += et - st
et = time.time()
self.total_compute_logits_time += et - st

# Only perform sampling in the driver worker.
if not self.is_driver_worker:
return None

# Sample the next token.
# sample_st = time.time()
sample_st = time.time()
output = self.model.sample(
logits=logits,
sampling_metadata=sampling_metadata,
)
# et = time.time()
# self.total_sample_time += et - st
et = time.time()
self.total_sample_time += et - sample_st

# print(f"Total time: {et - st:.3f}s, sample time: {et - sample_st:.3f}s, ratio: {100 * (et - sample_st) / (et - st):.2f}%")
# print(f"Total prepare input time: {self.total_prepare_input_time:.3f}s, total execute time: {self.total_execute_time:.3f}s, total compute logits time: {self.total_compute_logits_time:.3f}s, total sample time: {self.total_sample_time:.3f}s")
print(f"Total time: {et - st:.3f}s, sample time: {et - sample_st:.3f}s, ratio: {100 * (et - sample_st) / (et - st):.2f}%")
print(f"Total prepare input time: {self.total_prepare_input_time:.3f}s, total execute time: {self.total_execute_time:.3f}s, total compute logits time: {self.total_compute_logits_time:.3f}s, total sample time: {self.total_sample_time:.3f}s")

return output

Expand Down

0 comments on commit 8ded0e4

Please sign in to comment.