+import os
 import time
+import math

 import torch
 import intel_extension_for_pytorch as ipex
-from transformers import AutoModelForCausalLM, AutoTokenizer
+import numpy as np
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    LlamaForCausalLM,
+    LlamaTokenizer,
+)

-from dl_bench.utils import TimerManager, Benchmark, str_to_dtype
+from dl_bench.utils import Benchmark, get_report, get_time, str_to_dtype


 def get_llm(name, dtype):
-    if name != "gptj":
+    if name == "gptj":
+        model_name = "EleutherAI/gpt-j-6B"
+
+        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
+        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+    elif name == "llama2-13b":
+        kwargs = {}
+        if "HF_TOKEN" in os.environ:
+            kwargs["token"] = os.environ.get("HF_TOKEN")
+
+        model_name = "meta-llama/Llama-2-13b-hf"
+        model = LlamaForCausalLM.from_pretrained(
+            model_name, torch_dtype=dtype, **kwargs
+        )
+        tokenizer = LlamaTokenizer.from_pretrained(model_name, **kwargs)
+    else:
         raise ValueError("Unsupported model name")
-
-    model_name = "EleutherAI/gpt-j-6B"
-
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
-    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
     return tokenizer, model


 class LlmBenchmark(Benchmark):
     def __init__(self, params) -> None:
         name = params.get("name", "gptj")
         dtype = params.get("dtype")
+        self.batch_size = params.get("batch_size", 1)
+        self.n_iter = params.get("n_iter", 5)
+        self.warmup_batches = params.get("warmup", 2)
+
         self.tokenizer, self.model = get_llm(name, dtype=str_to_dtype(dtype))
-        self.warmup_prompt = "There are several ways to travel, but my favourite is"
-        self.prompt = "Here is a story about a person that find out he was adopted: one day little Timmy was looking through old"
+        prompt = "Here is a story about a person that find out he was adopted: one day little Timmy was looking through old"
+        self.prompt = [prompt] * self.batch_size
         self.gen_kwargs = {
             "early_stopping": True,
             "max_new_tokens": 128,
             "min_new_tokens": 30,
             "num_beams": 4,
         }

-    def generate(self, prompt, backend):
-        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+    def generate(self, backend):
         backend.sync()
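+        # The timed region below covers tokenization, device transfer,
+        # beam-search generation, and decoding of one sequence.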
         start = time.perf_counter()
-        input_ids = backend.to_device(input_ids)
+        input_tokens = self.tokenizer(self.prompt, return_tensors="pt").input_ids
+        input_tokens = backend.to_device(input_tokens)
         gen_tokens = self.model.generate(
-            input_ids, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
+            input_tokens, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
         )
         backend.sync()
+        text = self.tokenizer.batch_decode(gen_tokens)[0]
         total_time = time.perf_counter() - start

-        # text = self.tokenizer.batch_decode(gen_tokens)[0]
-        return gen_tokens[0], total_time
+        # generate returns the prompt tokens followed by the new tokens; keep only the new ones
+        output_tokens = gen_tokens[:, input_tokens.shape[1] :]
+        return output_tokens, total_time

     def inference(self, backend):
-        tm = TimerManager()
-
-        # Recover MACs computation
+        # TODO: Recover MACs computation
         # generate requires several forward passes, so an additional algorithm is needed to estimate MACs
         # self.flops_per_sample = get_macs(self.model, self.in_shape, backend) * 2
-
         self.model = backend.prepare_eval_transformer(self.model)

-        print("Warmup started")
-        with torch.inference_mode(), tm.timeit("warmup_s"):
-            self.model.eval()
-            self.generate(self.warmup_prompt, backend)
-        print("Warmup done")
-
         self.model.eval()
         enabled = backend.dtype != torch.float32
-        with torch.inference_mode(), torch.autocast(
-            enabled=enabled, device_type=backend.device_name
-        ), tm.timeit("duration_s"):
-            tokens, total_time = self.generate(self.prompt, backend)
-            outputs = [tokens]

-        results = tm.get_results()
-        results["samples_per_s"] = len(tokens) / total_time
-        results["flops_per_sample"] = 1
+        n_items = 0
+        outputs = []
+        fw_times = []

-        return results, outputs
+        self.model.eval()
+        # Start the timer before the loop so duration_s is defined even when warmup is 0.
+        start = get_time()
+        for i in range(self.n_iter):
+            print(f"Epoch {i+1}/{self.n_iter}")
+            cast = torch.autocast(enabled=enabled, device_type=backend.device_name)
+            with torch.inference_mode(), cast:
+                tokens, total_time = self.generate(backend)
+
+            if i < self.warmup_batches:
+                # Restart the timer because that was just a warmup iteration
+                start = get_time()
+                continue
+
+            print(f"Fw time: {total_time:.1f}")
+            fw_times.append(total_time)
+            n_items += math.prod(tokens.shape)
+            outputs.append(tokens)
+
+        stop = get_time()
+
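+        # duration_s excludes the warmup iterations (the timer restarts after each one).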
+        report = get_report(
+            fw_times=fw_times,
+            duration_s=stop - start,
+            n_items=n_items,
+            flops_per_sample=1,
+        )
+        return report, outputs
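
For reference, below is a minimal sketch of how the new class could be driven outside the dl_bench harness. The ToyBackend class is hypothetical: it only stubs out the members the benchmark code above actually touches (sync, to_device, prepare_eval_transformer, dtype, device_name), and the "float32" value assumes str_to_dtype accepts that spelling. Running it with name="gptj" downloads the GPT-J 6B checkpoint on first use.

import torch


class ToyBackend:
    """Hypothetical stand-in for the dl_bench backend."""

    device_name = "cpu"
    dtype = torch.float32

    def sync(self):
        # A GPU backend would synchronize the device here; nothing to do on CPU.
        pass

    def to_device(self, tensor):
        # A GPU backend would move the tensor to the accelerator.
        return tensor

    def prepare_eval_transformer(self, model):
        # A real backend might apply ipex.optimize or torch.compile here.
        return model


params = {"name": "gptj", "dtype": "float32", "batch_size": 1, "n_iter": 5, "warmup": 2}
benchmark = LlmBenchmark(params)
report, outputs = benchmark.inference(ToyBackend())
print(report)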