Output median min and avg values to csv (openvinotoolkit#450)
Co-authored-by: Chen Peter <peter.chen@intel.com>
wgzintel and peterchen-intel authored May 23, 2024
1 parent 25909cc commit e6f05c6
Showing 3 changed files with 90 additions and 112 deletions.
3 changes: 2 additions & 1 deletion llm_bench/python/benchmark.py
@@ -394,6 +394,7 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_

# if num_iters == 0, just output warm-up data
proc_id = os.getpid()
prompt_idx_list = [image_id for image_id, image_param in enumerate(images)]
for num in range(num_iters + 1):
image_id = 0
for img in images:
@@ -404,7 +405,7 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_
run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, image_id, tm_list, proc_id)
tm_list.clear()
image_id = image_id + 1
utils.metrics_print.print_average(iter_data_list, [], 0, False)
utils.metrics_print.print_average(iter_data_list, prompt_idx_list, 1, False)

return iter_data_list, pretrain_time

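For context, a minimal standalone sketch of what the benchmark.py change does: it builds one prompt index per input image and passes that list (with batch size 1) to the average report, so the image pipeline gets per-prompt statistics instead of an empty list. The `images` value below is an illustrative placeholder, not part of the benchmark.

images = ['img_0.png', 'img_1.png', 'img_2.png']  # illustrative inputs only
# Equivalent to list(range(len(images))): one index per image/prompt,
# so averages can later be grouped per prompt.
prompt_idx_list = [image_id for image_id, image_param in enumerate(images)]
print(prompt_idx_list)  # [0, 1, 2]
# The benchmark then calls:
# utils.metrics_print.print_average(iter_data_list, prompt_idx_list, 1, False)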
23 changes: 15 additions & 8 deletions llm_bench/python/utils/metrics_print.py
@@ -112,7 +112,7 @@ def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=Fa
f"vqvae decoder step count: 1",)


def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size):
def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen):
for p_idx in prompt_idx_list:
avg_1st_token_latency = 0
avg_2nd_tokens_latency = 0
@@ -134,12 +134,20 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch
avg_input_size = int(avg_input_size / index_num)
if avg_2nd_tokens_latency > 0:
avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000
latency_unit = 'token'
latency_unit = 'token' if is_text_gen is True else 'step'
if batch_size > 1:
latency_unit = '{}tokens'.format(batch_size)
prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {:.2f} ms/{}, ' \
'2nd tokens latency: {:.2f} ms/{}, 2nd tokens throughput: {:.2f} tokens/s' \
.format(p_idx, avg_input_size, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput)
if is_text_gen is True:
latency_unit = '{}tokens'.format(batch_size)
else:
latency_unit = '{}steps'.format(batch_size)
if is_text_gen is True:
prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {:.2f} ms/{}, ' \
'2nd tokens latency: {:.2f} ms/{}, 2nd tokens throughput: {:.2f} tokens/s' \
.format(p_idx, avg_input_size, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput)
else:
prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step of unet latency {:.2f} ms/{}, ' \
'2nd steps of unet latency: {:.2f} ms/{}, 2nd steps throughput: {:.2f} steps/s' \
.format(p_idx, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput)


def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False):
@@ -163,8 +171,7 @@ def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False

if total_iters > 0:
prompt_dict = {}
if is_text_gen is True:
output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size)
output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen)
log.info('<<< Warm-up iteration is excluded. >>>')
out_str = '[Total] Iterations: {}'.format(total_iters)
for prompt_key in prompt_dict:
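As a reading aid, here is a minimal sketch of the unit selection that output_avg_statis_tokens now performs. `pick_latency_unit` is a hypothetical helper that only mirrors the branching in the diff above; it is not a function in metrics_print.py.

def pick_latency_unit(is_text_gen, batch_size):
    # Text generation reports per-token latency; image pipelines report per-UNet-step latency.
    latency_unit = 'token' if is_text_gen else 'step'
    if batch_size > 1:
        # With batching, the unit names the whole batch, e.g. '2tokens' or '2steps'.
        latency_unit = '{}{}s'.format(batch_size, 'token' if is_text_gen else 'step')
    return latency_unit

print(pick_latency_unit(True, 1))   # token
print(pick_latency_unit(True, 2))   # 2tokens
print(pick_latency_unit(False, 2))  # 2steps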
176 changes: 73 additions & 103 deletions llm_bench/python/utils/output_csv.py
@@ -2,6 +2,8 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import csv
import numpy as np
import copy
from pathlib import Path


@@ -40,20 +42,80 @@ def output_comments(result, use_case, writer):
comment_list.append('detokenization_time: Tokenizer decode time')
comment_list.append('pretrain_time: Total time of load model and compile model')
comment_list.append('generation_time: Time for one interaction. (e.g. The duration of answering one question or generating one picture)')
comment_list.append('iteration=0: warm-up; iteration=-1: average (exclude warm-up)')
comment_list.append('iteration=0: warm-up; iteration=avg: average (exclude warm-up);iteration=mini: minimum value (exclude warm-up);'
'iteration=median: median value (exclude warm-up);')
comment_list.append(
'max_rss_mem: max rss memory consumption;' 'the value in -1 iteration row is the maximum value of all available RSS memory numbers in iterations',
'max_rss_mem: max rss memory consumption;'
)
comment_list.append(
'max_shared_mem: max shared memory consumption;'
'the value in -1 iteration row is the maximum value of all available shared memory numbers in iterations',
)

for comments in comment_list:
result['iteration'] = comments
writer.writerow(result)


def output_avg_min_median(iter_data_list):
prompt_idxs = []
for iter_data in iter_data_list:
prompt_idxs.append(iter_data['prompt_idx'])
prompt_idxs = list(set(prompt_idxs))
result = {}
for prompt_idx in prompt_idxs:
same_prompt_datas = []
for iter_data in iter_data_list:
if iter_data['prompt_idx'] == prompt_idx and iter_data['iteration'] > 0:
same_prompt_datas.append(iter_data)
key_word = ['input_size', 'infer_count', 'generation_time', 'output_size', 'latency', 'first_token_latency', 'other_tokens_avg_latency',
'first_token_infer_latency', 'other_tokens_infer_avg_latency', 'tokenization_time', 'detokenization_time']
if len(same_prompt_datas) > 0:
iters_idx = ['avg', 'mini', 'median']
result[prompt_idx] = [copy.deepcopy(same_prompt_datas[0]) for i in range(3)]
for i in range(len(iters_idx)):
result[prompt_idx][i]['iteration'] = iters_idx[i]
for key in key_word:
values = []
for prompt in same_prompt_datas:
if prompt[key] != '':
values.append(prompt[key])
if len(values) > 0:
result[prompt_idx][0][key] = np.mean(values)
result[prompt_idx][1][key] = np.min(values)
result[prompt_idx][2][key] = np.median(values)
return result


def gen_data_to_csv(result, iter_data, pretrain_time):
generation_time = iter_data['generation_time']
latency = iter_data['latency']
first_latency = iter_data['first_token_latency']
other_latency = iter_data['other_tokens_avg_latency']
first_token_infer_latency = iter_data['first_token_infer_latency']
other_token_infer_latency = iter_data['other_tokens_infer_avg_latency']
rss_mem = iter_data['max_rss_mem_consumption']
shared_mem = iter_data['max_shared_mem_consumption']
token_time = iter_data['tokenization_time']
detoken_time = iter_data['detokenization_time']
result['iteration'] = str(iter_data['iteration'])
result['pretrain_time(s)'] = pretrain_time
result['input_size'] = iter_data['input_size']
result['infer_count'] = iter_data['infer_count']
result['generation_time(s)'] = round(generation_time, 5) if generation_time != '' else generation_time
result['output_size'] = iter_data['output_size']
result['latency(ms)'] = round(latency, 5) if latency != '' else latency
result['result_md5'] = iter_data['result_md5']
result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency
result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency
result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency
result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency
result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem
result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem
result['prompt_idx'] = iter_data['prompt_idx']
result['tokenization_time'] = round(token_time, 5) if token_time != '' else token_time
result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time


def write_result(report_file, model, framework, device, model_args, iter_data_list, pretrain_time, model_precision):
header = [
'iteration',
@@ -86,17 +148,6 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li
with open(out_file, 'w+', newline='') as f:
writer = csv.DictWriter(f, header)
writer.writeheader()

total_generation_time = 0
total_num_tokens = 0
total_input_size = 0
total_infer_count = 0
total_first_token_latency = 0
total_other_tokens_avg_latency = 0
total_first_token_infer_latency = 0
total_other_tokens_infer_avg_latency = 0
total_max_rss_mem_consumption = 0
total_max_shared_mem_consumption = 0
result = {}
result['model'] = model
result['framework'] = framework
@@ -105,97 +156,16 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li
result['precision'] = model_precision
result['num_beams'] = model_args['num_beams']
result['batch_size'] = model_args['batch_size']
total_iters = len(iter_data_list)

skip_iter_nums = 0
for i in range(total_iters):
for i in range(len(iter_data_list)):
iter_data = iter_data_list[i]
generation_time = iter_data['generation_time']
latency = iter_data['latency']
first_latency = iter_data['first_token_latency']
other_latency = iter_data['other_tokens_avg_latency']
first_token_infer_latency = iter_data['first_token_infer_latency']
other_token_infer_latency = iter_data['other_tokens_infer_avg_latency']
rss_mem = iter_data['max_rss_mem_consumption']
shared_mem = iter_data['max_shared_mem_consumption']
token_time = iter_data['tokenization_time']
detoken_time = iter_data['detokenization_time']
result['iteration'] = str(iter_data['iteration'])
if i > 0:
result['pretrain_time(s)'] = ''

result['input_size'] = iter_data['input_size']
result['infer_count'] = iter_data['infer_count']
result['generation_time(s)'] = round(generation_time, 5) if generation_time != '' else generation_time
result['output_size'] = iter_data['output_size']
result['latency(ms)'] = round(latency, 5) if latency != '' else latency
result['result_md5'] = iter_data['result_md5']
result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency
result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency
result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency
result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency
result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem
result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem
result['prompt_idx'] = iter_data['prompt_idx']
result['tokenization_time'] = round(token_time, 5) if token_time != '' else token_time
result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time
pre_time = '' if i > 0 else result['pretrain_time(s)']
gen_data_to_csv(result, iter_data, pre_time)
writer.writerow(result)

# Skip the warm-up iteration
if iter_data['iteration'] > 0:
if iter_data['generation_time'] != '':
total_generation_time += iter_data['generation_time']
if iter_data['output_size'] != '':
total_num_tokens += iter_data['output_size']
if iter_data['input_size'] != '':
total_input_size += iter_data['input_size']
if iter_data['first_token_latency'] != '':
total_first_token_latency += iter_data['first_token_latency']
if iter_data['other_tokens_avg_latency'] != '':
total_other_tokens_avg_latency += iter_data['other_tokens_avg_latency']
if iter_data['first_token_infer_latency'] != '':
total_first_token_infer_latency += iter_data['first_token_infer_latency']
if iter_data['other_tokens_infer_avg_latency'] != '':
total_other_tokens_infer_avg_latency += iter_data['other_tokens_infer_avg_latency']
if iter_data['infer_count'] != '':
total_infer_count += iter_data['infer_count']
else:
skip_iter_nums = skip_iter_nums + 1
if iter_data['max_rss_mem_consumption'] != '':
if iter_data['max_rss_mem_consumption'] > total_max_rss_mem_consumption:
total_max_rss_mem_consumption = iter_data['max_rss_mem_consumption']
if iter_data['max_shared_mem_consumption'] != '':
if iter_data['max_shared_mem_consumption'] > total_max_shared_mem_consumption:
total_max_shared_mem_consumption = iter_data['max_shared_mem_consumption']
total_iters -= skip_iter_nums
if total_iters > 0:
result['iteration'] = str('-1')
result['pretrain_time(s)'] = ''
if total_input_size > 0:
result['input_size'] = round(total_input_size / total_iters, 5)
if total_infer_count > 0:
result['infer_count'] = round(total_infer_count / total_iters, 5)
if total_generation_time > 0:
result['generation_time(s)'] = round(total_generation_time / total_iters, 5)
if total_num_tokens > 0:
avg_per_token_time = total_generation_time * 1000 / total_num_tokens
result['output_size'] = round(total_num_tokens / total_iters, 5)
result['latency(ms)'] = round(avg_per_token_time, 5)
else:
result['output_size'] = ''
result['latency(ms)'] = ''
if total_first_token_latency > 0:
result['1st_latency(ms)'] = round(total_first_token_latency / total_iters, 5)
if total_other_tokens_avg_latency > 0:
result['2nd_avg_latency(ms)'] = round(total_other_tokens_avg_latency / total_iters, 5)
if total_first_token_infer_latency > 0:
result['1st_infer_latency(ms)'] = round(total_first_token_infer_latency / total_iters, 5)
if total_other_tokens_infer_avg_latency > 0:
result['2nd_infer_avg_latency(ms)'] = round(total_other_tokens_infer_avg_latency / total_iters, 5)
if total_max_rss_mem_consumption > 0:
result['max_rss_mem(MB)'] = total_max_rss_mem_consumption
if total_max_shared_mem_consumption > 0:
result['max_shared_mem(MB)'] = total_max_shared_mem_consumption
writer.writerow(result)
res_data = output_avg_min_median(iter_data_list)

for key in res_data.keys():
for data in res_data[key]:
gen_data_to_csv(result, data, '')
writer.writerow(result)
output_comments(result, model_args['use_case'], writer)
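To illustrate what the new avg/mini/median rows contain, here is a minimal sketch of the per-prompt aggregation that output_avg_min_median performs, on made-up iteration records. The field values are invented; only the warm-up exclusion and the numpy calls follow the code above.

import numpy as np

# Made-up records shaped like iter_data_list entries; iteration 0 is the warm-up.
iter_data_list = [
    {'iteration': 0, 'prompt_idx': 0, 'latency': 120.0},
    {'iteration': 1, 'prompt_idx': 0, 'latency': 100.0},
    {'iteration': 2, 'prompt_idx': 0, 'latency': 90.0},
    {'iteration': 3, 'prompt_idx': 0, 'latency': 110.0},
]
# Warm-up data is excluded, matching the iter_data['iteration'] > 0 check.
values = [it['latency'] for it in iter_data_list if it['iteration'] > 0]
print(np.mean(values), np.min(values), np.median(values))  # 100.0 90.0 100.0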
