Merge remote-tracking branch 'upstream/master' into validation_mode
iefode committed Sep 27, 2024
2 parents c81ffbd + 2ed9889 commit 5bfa2e7
Showing 65 changed files with 4,245 additions and 608 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/causal_lm_cpp.yml
@@ -665,7 +665,7 @@ jobs:
output.write('question:\n')
chat_history.append(gen_prompt(prompt))
chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(chat_prompt, return_tensors='pt')
tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
answer = model.generate(**tokenized, max_length=1000, do_sample=False)
answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
chat_history.append(gen_answer(answer_str))
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
@@ -189,7 +189,7 @@ jobs:
if: |
always() &&
(needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success')
timeout-minutes: 90
timeout-minutes: 120
defaults:
run:
shell: bash
115 changes: 113 additions & 2 deletions llm_bench/python/benchmark.py
@@ -319,6 +319,110 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
        llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])


def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id):
    set_seed(args['seed'])
    input_text_list = [input_text] * args['batch_size']
    if args["output_dir"] is not None and num == 0:
        for bs_index, in_text in enumerate(input_text_list):
            llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
    pt_inputs = tokenizer(input_text_list, return_tensors="pt")
    input_token_size = pt_inputs.input_ids.shape[1]
    pipe_tokenizer = model.get_tokenizer()
    tok_encode_start = time.perf_counter()
    input_data = pipe_tokenizer.encode(input_text_list)
    tok_encode_end = time.perf_counter()
    tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
    if args['batch_size'] > 1:
        out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
        out_str += " Batch_size={}, ".format(args['batch_size'])
        out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size'])
        if args['infer_count'] is not None:
            out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
        log.info(out_str)
    max_rss_mem_consumption = ''
    max_uss_mem_consumption = ''
    max_shared_mem_consumption = ''
    if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
        mem_consumption.start_collect_memory_consumption()
    max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
    streamer.reset()
    start = time.perf_counter()
    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens
    end = time.perf_counter()
    if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
        mem_consumption.end_collect_momory_consumption()
        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
        mem_consumption.clear_max_memory_consumption()
    generation_time = end - start
    tok_decode_start = time.perf_counter()
    generated_text = pipe_tokenizer.decode(generated_tokens)
    tok_decode_end = time.perf_counter()
    tok_decode_time = (tok_decode_end - tok_decode_start) * 1000
    # Only text generation needs to subtract the input length, because generated_text may include input_text
    num_tokens = 0
    result_md5_list = []
    for bs_idx in range(args['batch_size']):
        generated_text_len = len(generated_tokens[bs_idx])
        num_tokens += generated_text_len
        if generated_text_len > max_gen_tokens:
            log.error('Output token size is over max output token size!')
        result_text = generated_text[bs_idx]
        if args["output_dir"] is not None:
            llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id)
        result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest())
    if len(md5_list[num]) == 0:
        md5_list[num] = {prompt_index : result_md5_list}
    else:
        md5_list[num][prompt_index] = result_md5_list
    per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
    tm_list = streamer.get_time_list()
    log.debug('latency of all tokens:')
    [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
    iter_data = gen_iterate_data(
        num,
        input_token_size * args['batch_size'],
        len(tm_list),
        num_tokens,
        generation_time,
        per_token_time,
        result_md5_list,
        max_rss_mem=max_rss_mem_consumption,
        max_shared_mem=max_shared_mem_consumption,
        max_uss_mem=max_uss_mem_consumption,
        prompt_idx=prompt_index,
        tokenization_time=(tok_encode_time, tok_decode_time)
    )
    iter_data_list.append(iter_data)
    llm_bench_utils.metrics_print.print_metrics(
        num,
        iter_data,
        tm_list,
        [],
        warm_up=(num == 0),
        max_rss_mem=max_rss_mem_consumption,
        max_shared_mem=max_shared_mem_consumption,
        max_uss_mem=max_uss_mem_consumption,
        tokenization_time=(tok_encode_time, tok_decode_time),
        batch_size=args['batch_size']
    )
    if num > 0:
        prev_md5 = md5_list[num - 1][prompt_index]
        if result_md5_list != prev_md5:
            log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
                        f"is different from md5 of the {num - 1} iteration {prev_md5}")
            llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
            if num == 1:
                # if the device is CPU, throw exception
                if args['devices'].lower().startswith('cpu') is True:
                    assert (result_md5_list == prev_md5)
            else:
                # throw exception
                assert (result_md5_list == prev_md5)
    else:
        llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
    streamer.reset()


def run_text_generation_benchmark(model_path, framework, device, args, num_iters):
    model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args)
    model_precision = llm_bench_utils.model_utils.get_model_precision(model_path.parts)
@@ -341,7 +445,12 @@ def run_text_generation_benchmark(model_path, framework, device, args, num_iters
             f'prompt nums: {len(text_list)}, prompt idx: {prompt_idx_list}')

    # if num_iters == 0, just output warm-up data
    text_gen_fn = run_text_generation if not use_genai else run_text_generation_genai
    if not use_genai:
        text_gen_fn = run_text_generation
    elif bench_hook is not None:
        text_gen_fn = run_text_generation_genai_with_stream
    else:
        text_gen_fn = run_text_generation_genai
    proc_id = os.getpid()
    if args['subsequent'] is False:
        for num in range(num_iters + 1):
@@ -703,7 +812,9 @@ def get_argprser():
    )
    parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files')
    llm_bench_utils.model_utils.add_stateful_model_arguments(parser)
    parser.add_argument("--genai", action="store_true")
    parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking")
    parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode")
    parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings")
    parser.add_argument(
        '--end_token_stopping',
        action='store_true',
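
For readers of this diff: a minimal sketch, not part of the commit, of how the per-token timings collected by the streamer's get_time_list() could be summarized into first-token latency and the average latency of the remaining tokens. The helper name and the sample numbers are illustrative assumptions, not code from llm_bench.

def summarize_token_latencies(time_list):
    # time_list holds the wall-clock delay (in seconds) observed before each generated token,
    # as accumulated by the streamer's put() callback.
    if not time_list:
        return None, None
    first_token_ms = time_list[0] * 1000
    rest = time_list[1:]
    next_token_avg_ms = sum(rest) / len(rest) * 1000 if rest else None
    return first_token_ms, next_token_avg_ms

# Example with made-up timings:
first_ms, next_ms = summarize_token_latencies([0.215, 0.041, 0.042, 0.040])
print(f"first token: {first_ms:.1f} ms, 2nd+ tokens: {next_ms:.1f} ms on average")
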
8 changes: 8 additions & 0 deletions llm_bench/python/llm_bench_utils/model_utils.py
@@ -136,6 +136,7 @@ def analyze_args(args):
    model_args['subsequent'] = args.subsequent
    model_args['output_dir'] = args.output_dir
    model_args['genai'] = args.genai
    model_args["use_cb"] = args.use_cb
    model_args['devices'] = args.device
    model_args['prompt_index'] = [] if args.prompt_index is not None else None
    if model_args['prompt_index'] is not None:
@@ -164,6 +165,13 @@ def analyze_args(args):
        log.info(f"PT Config={model_args['config']}")
    model_args['model_type'] = get_model_type(model_name, use_case, model_framework)
    model_args['model_name'] = model_name

    if args.use_cb and not args.genai:
        raise RuntimeError("Continuous batching mode is supported only via OpenVINO GenAI")
    cb_config = None
    if args.cb_config:
        cb_config = get_config(args.cb_config)
    model_args["cb_config"] = cb_config
    return model_path, model_framework, model_args, model_name


41 changes: 38 additions & 3 deletions llm_bench/python/llm_bench_utils/ov_utils.py
@@ -144,7 +144,7 @@ def create_text_gen_model(model_path, device, **kwargs):
        raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
    else:
        if kwargs.get("genai", False) and is_genai_available(log_msg=True):
            if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]:
            if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"], OV_MODEL_CLASSES_MAPPING["chatglm"]]:
                log.warning(f"OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default benchmarking")
            else:
                return create_genai_text_gen_model(model_path, device, ov_config, **kwargs)
@@ -185,13 +185,48 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
        convert_ov_tokenizer(model_path)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    start = time.perf_counter()

    cb = kwargs.get("use_cb", False)
    if cb:
        log.info("Continuous Batching mode activated")
        scheduler_config = openvino_genai.SchedulerConfig()
        scheduler_params = kwargs.get("cb_config") or {"cache_size": 1}
        if scheduler_params:
            log.info(f"Scheduler parameters:\n{scheduler_params}")

            for param, value in scheduler_params.items():
                setattr(scheduler_config, param, value)
        ov_config["scheduler_config"] = scheduler_config
    start = time.perf_counter()
    llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config)
    end = time.perf_counter()
    log.info(f'Pipeline initialization time: {end - start:.2f}s')

    return llm_pipe, tokenizer, end - start, None, True
    class TokenStreamer(openvino_genai.StreamerBase):
        def __init__(self, tokenizer):
            openvino_genai.StreamerBase.__init__(self)
            self.tokenizer = tokenizer
            self.token_generation_time = []
            self.generated_tokens = []
            self.start_time = time.perf_counter()
        def put(self, token_id):
            self.token_generation_time.append(time.perf_counter() - self.start_time)
            self.generated_tokens.append(token_id)
            self.start_time = time.perf_counter()
            return False
        def reset(self):
            self.token_generation_time = []
            self.generated_tokens = []
            self.start_time = time.perf_counter()
        def end(self):
            pass
        def get_tokens(self):
            return self.generated_tokens
        def get_time_list(self):
            return self.token_generation_time
    streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb else None

    return llm_pipe, tokenizer, end - start, streamer, True


def convert_ov_tokenizer(tokenizer_path):
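
As a companion to the changes above, a minimal sketch, under stated assumptions, of how a Continuous Batching scheduler configuration (for example, one loaded from the --cb_config JSON file) could be applied to openvino_genai.SchedulerConfig and passed to LLMPipeline, mirroring the setattr loop in create_genai_text_gen_model. The config values and the model path are placeholders; running it requires an OpenVINO-exported model at that path.

import json
import openvino_genai

# Illustrative config; field names follow openvino_genai.SchedulerConfig attributes.
cb_config = json.loads('{"cache_size": 2, "dynamic_split_fuse": true, "max_num_batched_tokens": 256}')

scheduler_config = openvino_genai.SchedulerConfig()
for param, value in cb_config.items():
    # SchedulerConfig exposes these settings as plain attributes, so a generic loop works.
    setattr(scheduler_config, param, value)

ov_config = {"scheduler_config": scheduler_config}
pipe = openvino_genai.LLMPipeline("/path/to/exported/model", "CPU", ov_config)  # placeholder path
print(pipe.generate("What is OpenVINO?", max_new_tokens=32))
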
@@ -0,0 +1,96 @@
from pathlib import PosixPath
import os
import tempfile

import whowhatbench
from whowhatbench.wwb import load_dataset
from optimum.intel.openvino import OVModelForCausalLM

from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode

from openvino_tokenizers import convert_tokenizer
from openvino import serialize
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MAX_NEW_TOKENS = 128
SEQS_PER_REQUEST = 5
MAX_SEQUENCES = 100


model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_path = PosixPath(tempfile.gettempdir()) / model_id
model.save_pretrained(model_path)

ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True)
serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml")
serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml")

scheduler_config_noopt = SchedulerConfig()
scheduler_config_noopt.num_kv_blocks = 300
scheduler_config_noopt.dynamic_split_fuse = True
scheduler_config_noopt.max_num_batched_tokens = 256
scheduler_config_noopt.max_num_seqs = 256
scheduler_config_noopt.enable_prefix_caching = False

scheduler_config_opt = SchedulerConfig()
scheduler_config_opt.num_kv_blocks = 300
scheduler_config_opt.dynamic_split_fuse = True
scheduler_config_opt.max_num_batched_tokens = 256
scheduler_config_opt.max_num_seqs = 256
scheduler_config_opt.use_cache_eviction = True
scheduler_config_opt.enable_prefix_caching = False
eviction_config = CacheEvictionConfig(32, 32, 128, AggregationMode.NORM_SUM)
scheduler_config_opt.cache_eviction_config = eviction_config

generation_config = GenerationConfig()
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = MAX_NEW_TOKENS

data = load_dataset(path='squad', name=None, split='validation')["context"]
data_dict = {"questions": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]}

model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {})
model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {})


GT_DATA_FILE = 'gt_data.csv'

if os.path.exists(GT_DATA_FILE):
    evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, gt_data=GT_DATA_FILE, tokenizer=tokenizer,
                                       test_data=data_dict, generation_config=generation_config,
                                       max_new_tokens=MAX_NEW_TOKENS, seqs_per_request=3)
else:
    evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict,
                                       generation_config=generation_config, max_new_tokens=MAX_NEW_TOKENS,
                                       seqs_per_request=3)
    evaluator.dump_gt('gt_data.csv')


all_metrics_per_question, all_metrics = evaluator.score(model_cb_opt)


print(all_metrics_per_question)
print(all_metrics)

metrics = ["similarity", "SDT norm"]

for metric in metrics:
    worst_examples = evaluator.worst_examples(top_k=5, metric=metric)
    print("Metric: ", metric)
    for e in worst_examples:
        print("\t=========================")
        print(f"\t{metric}: ", e[metric])
        print("\tPrompt: ", e["prompt"])
        print("\tSource Model:\n ", "\t" + e["source_model"])
        print("\tOptimized Model:\n ", "\t" + e["optimized_model"])

pipeline_opt_metrics = model_cb_opt.get_metrics()
pipeline_noopt_metrics = model_cb_noopt.get_metrics()

print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
5 changes: 2 additions & 3 deletions llm_bench/python/who_what_benchmark/requirements.txt
@@ -1,10 +1,9 @@
transformers>=4.35.2
sentence-transformers>=2.2.2
openvino>=2024.3.0
openvino-telemetry>=2024.3.0
openvino-telemetry
optimum-intel>=1.14
openvino-tokenizers>=2024.3.0
openvino-genai>=2024.3.0
openvino-tokenizers
pandas>=2.0.3
numpy>=1.23.5
tqdm>=4.66.1