This repository was archived by the owner on Jul 24, 2024. It is now read-only.

Commit 06d1709

Merge remote-tracking branch 'origin/main' into egor/margin

2 parents: 7f76f32 + 20f109b
File tree: 11 files changed (+250 / -109 lines)


.github/workflows/execute-test-script.yml

Lines changed: 9 additions & 1 deletion
@@ -38,6 +38,9 @@ on:
     secrets:
       DB_URL:
         required: true
+      HF_TOKEN:
+        required: true
+

 jobs:
   print_inputs:
@@ -117,6 +120,8 @@ jobs:
           fi

           export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} ${URL}"
+          # We need token to load llama2 from huggingface repo, which is closed
+          export HF_TOKEN="${{ secrets.HF_TOKEN }}"

           # We mainly want to verify our own backend
           if [[ ${{ inputs.compiler }} != *torch_mlir* ]]; then
@@ -126,10 +131,13 @@ jobs:
           # HOST CONFIG
           export KMP_AFFINITY=respect,noreset,granularity=fine,balanced
           # This parameter is incredibly important once we use numactl to pick one socket, performance difference was 10x for resnet50 bs=1 torch-inductor
-          export OMP_NUM_THREADS=32
+          export OMP_NUM_THREADS=$(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $4}')
+          echo "CPU cores configured: $OMP_NUM_THREADS"
           if [[ ${LABELS} = *glados* ]]; then
+            export HF_HOME="/cache/torchmlir/huggingface_cache"
             numactl -N 1 ${{ inputs.test_script }}
           else
+            export HF_HOME="/data/torchmlir/huggingface_cache"
             source ${{ inputs.test_script}}
           fi
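A note on the OMP_NUM_THREADS change above: the hard-coded value of 32 is replaced with the "cpu cores" field from /proc/cpuinfo, i.e. physical cores per socket rather than total hardware threads, which matters because the script pins a single socket with numactl. A rough Python equivalent of that shell pipeline, for illustration only (not part of the commit):

def physical_cores_per_socket(cpuinfo_path="/proc/cpuinfo"):
    # Mirrors: grep ^cpu\scores /proc/cpuinfo | uniq | awk '{print $4}'
    # Every "cpu cores" line reports the same per-socket count, so the first one suffices.
    with open(cpuinfo_path) as f:
        for line in f:
            if line.startswith("cpu cores"):
                return int(line.split(":")[1])
    return None  # field not present (e.g. on non-x86 kernels)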

.github/workflows/test-single-config.yml

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ jobs:
       test_script: ${{ matrix.test_script }}
     secrets:
       DB_URL: ${{ secrets.DB_URL }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}

   shutdown:
     needs: mlp_test

.github/workflows/test.yml

Lines changed: 19 additions & 14 deletions
@@ -35,28 +35,33 @@ jobs:
   mlp_test:
     strategy:
       matrix:
-        type: [
-          {device: 'cpu', compiler: 'torch'},
-          {device: 'cpu', compiler: 'dynamo'},
-          {device: 'cpu', compiler: 'torchscript'},
-          {device: 'cpu', compiler: 'torchscript_onednn'},
-          {device: 'cpu', compiler: 'ipex'},
-          {device: 'cpu', compiler: 'ipex_onednn_graph'},
-          # {device: 'xpu', compiler: 'ipex'},
-          {device: 'cpu', compiler: 'torch_mlir'},
-          {device: 'cpu', compiler: 'torch_mlir_xsmm'}
-        ]
+        compiler:
+          - torch
+          - dynamo
+          - torchscript
+          - torchscript_onednn
+          - ipex
+          - ipex_onednn_graph
+          - torch_mlir
+          - torch_mlir_xsmm
         test_script: ${{ github.event_name == 'workflow_dispatch' && fromJson(inputs.test_scripts) || fromJson('["./mlp.sh", "./cnn.sh", "./llm.sh"]') }}
+        exclude:
+          - test_script: "./llm.sh"
+            compiler: torchscript
+          - test_script: "./llm.sh"
+            compiler: torchscript_onednn
+          - test_script: "./llm.sh"
+            compiler: ipex_onednn_graph
       fail-fast: false
     uses: ./.github/workflows/execute-test-script.yml
     with:
-      compiler: ${{ matrix.type.compiler }}
-      device: ${{ matrix.type.device }}
+      compiler: ${{ matrix.compiler }}
+      device: cpu
       tag: ${{ github.event_name == 'workflow_dispatch' && inputs.tag || 'ci' }}
       torch_mlir_repo: ${{ github.event_name == 'workflow_dispatch' && inputs.torch_mlir_repo || 'intel-ai/torch-mlir' }}
       torch_mlir_branch: ${{ github.event_name == 'workflow_dispatch' && inputs.torch_mlir_branch || 'cpu-proto' }}
       runner_type: spr
-      shutdown_cloud_runner: false
       test_script: ${{ matrix.test_script }}
     secrets:
       DB_URL: ${{ secrets.DB_URL }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
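The matrix rewrite above replaces the explicit list of {device, compiler} pairs with a compiler axis crossed against the test_script axis, fixes the device to cpu, and excludes llm.sh for the three backends that do not run it. A quick sketch of the resulting job set (illustrative only; the script list assumes the non-dispatch default):

from itertools import product

compilers = ["torch", "dynamo", "torchscript", "torchscript_onednn",
             "ipex", "ipex_onednn_graph", "torch_mlir", "torch_mlir_xsmm"]
test_scripts = ["./mlp.sh", "./cnn.sh", "./llm.sh"]
excluded = {("torchscript", "./llm.sh"),
            ("torchscript_onednn", "./llm.sh"),
            ("ipex_onednn_graph", "./llm.sh")}

# Cross product minus the exclude entries, matching GitHub's matrix expansion.
jobs = [(c, s) for c, s in product(compilers, test_scripts) if (c, s) not in excluded]
print(len(jobs))  # 8 * 3 - 3 = 21 jobs, all on device 'cpu'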

.github/workflows/test_amd.yml

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ jobs:
       test_script: ${{ matrix.test_script }}
     secrets:
       DB_URL: ${{ secrets.DB_URL }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}

   shutdown:
     needs: mlp_test

.github/workflows/test_nvidia.yml

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ jobs:
       test_script: ${{ matrix.test_script }}
     secrets:
       DB_URL: ${{ secrets.DB_URL }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}

   shutdown:
     runs-on: a100

db_tools/create_view.sql

Lines changed: 15 additions & 2 deletions
@@ -1,11 +1,24 @@
 CREATE OR REPLACE VIEW torchmlir_benchmark_view AS
 SELECT
     id,
-    REPLACE(REPLACE(CONCAT(host, '-', compiler, '-', dtype, '-', tag), 'torchscript', 'ts'), '-ci', '') AS backend,
+    REPLACE(
+      REPLACE(
+        REPLACE(
+          REPLACE(
+            REPLACE(
+              REPLACE(
+                CONCAT(host, '-', compiler, '-', dtype, '-', tag),
+                'torchscript', 'ts'),
+              '-ci', ''),
+            'ts_onednn', 'onednn'),
+          'ipex_onednn_graph', 'ipex_gc'),
+        'bfloat16', 'b16'),
+      'float32', 'f32'
+    ) AS backend,
     host,
     device,
     compiler,
-    dtype,
+    REPLACE(REPLACE(dtype, 'bfloat16', 'b16'), 'float32', 'f32') AS dtype,
     tag,
     benchmark,
     benchmark_desc,
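The nested REPLACE chain above shortens the concatenated host-compiler-dtype-tag string into a compact backend label; the order matters, since 'torchscript' must become 'ts' before 'ts_onednn' can collapse to 'onednn'. A Python sketch of the same substitution order (sample values are invented):

def backend_label(host, compiler, dtype, tag):
    label = f"{host}-{compiler}-{dtype}-{tag}"
    # Same order as the SQL: the innermost REPLACE applies first.
    for old, new in [("torchscript", "ts"), ("-ci", ""),
                     ("ts_onednn", "onednn"), ("ipex_onednn_graph", "ipex_gc"),
                     ("bfloat16", "b16"), ("float32", "f32")]:
        label = label.replace(old, new)
    return label

print(backend_label("spr", "torchscript_onednn", "bfloat16", "ci"))  # -> spr-onednn-b16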

dl_bench/llm.py

Lines changed: 70 additions & 35 deletions
@@ -1,76 +1,111 @@
1+
import os
12
import time
3+
import math
24

35
import torch
46
import intel_extension_for_pytorch as ipex
5-
from transformers import AutoModelForCausalLM, AutoTokenizer
7+
import numpy as np
8+
from transformers import (
9+
AutoModelForCausalLM,
10+
AutoTokenizer,
11+
LlamaForCausalLM,
12+
LlamaTokenizer,
13+
)
614

7-
from dl_bench.utils import TimerManager, Benchmark, str_to_dtype
15+
from dl_bench.utils import Benchmark, get_report, get_time, str_to_dtype
816

917

1018
def get_llm(name, dtype):
11-
if name != "gptj":
19+
if name == "gptj":
20+
model_name = "EleutherAI/gpt-j-6B"
21+
22+
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
23+
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
24+
elif name == "llama2-13b":
25+
kwargs = {}
26+
if "HF_TOKEN" in os.environ:
27+
kwargs["token"] = os.environ.get("HF_TOKEN")
28+
29+
model_name = "meta-llama/Llama-2-13b-hf"
30+
model = LlamaForCausalLM.from_pretrained(
31+
model_name, torch_dtype=dtype, **kwargs
32+
)
33+
tokenizer = LlamaTokenizer.from_pretrained(model_name, **kwargs)
34+
else:
1235
raise ValueError("Unsupported model name")
13-
14-
model_name = "EleutherAI/gpt-j-6B"
15-
16-
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
17-
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
1836
return tokenizer, model
1937

2038

2139
class LlmBenchmark(Benchmark):
2240
def __init__(self, params) -> None:
2341
name = params.get("name", "gptj")
2442
dtype = params.get("dtype")
43+
self.batch_size = params.get("batch_size", 1)
44+
self.n_iter = params.get("n_iter", 5)
45+
self.warmup_batches = params.get("warmup", 2)
46+
2547
self.tokenizer, self.model = get_llm(name, dtype=str_to_dtype(dtype))
26-
self.warmup_prompt = "There are several ways to travel, but my favourite is"
27-
self.prompt = "Here is a story about a person that find out he was adopted: one day little Timmy was looking through old"
48+
prompt = "Here is a story about a person that find out he was adopted: one day little Timmy was looking through old"
49+
self.prompt = [prompt] * self.batch_size
2850
self.gen_kwargs = {
2951
"early_stopping": True,
3052
"max_new_tokens": 128,
3153
"min_new_tokens": 30,
3254
"num_beams": 4,
3355
}
3456

35-
def generate(self, prompt, backend):
36-
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
57+
def generate(self, backend):
3758
backend.sync()
3859
start = time.perf_counter()
39-
input_ids = backend.to_device(input_ids)
60+
input_tokens = self.tokenizer(self.prompt, return_tensors="pt").input_ids
61+
input_tokens = backend.to_device(input_tokens)
4062
gen_tokens = self.model.generate(
41-
input_ids, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
63+
input_tokens, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
4264
)
4365
backend.sync()
66+
text = self.tokenizer.batch_decode(gen_tokens)[0]
4467
total_time = time.perf_counter() - start
4568

46-
# text = self.tokenizer.batch_decode(gen_tokens)[0]
47-
return gen_tokens[0], total_time
69+
# new tokens are a subset of all tokens
70+
output_tokens = gen_tokens[:, input_tokens.shape[1] :]
71+
return output_tokens, total_time
4872

4973
def inference(self, backend):
50-
tm = TimerManager()
51-
52-
# Recover MACs computation
74+
# TODO: Recover MACs computation
5375
# generate requires several forward passes, so need addtional algo to estimate
5476
# self.flops_per_sample = get_macs(self.model, self.in_shape, backend) * 2
55-
5677
self.model = backend.prepare_eval_transformer(self.model)
5778

58-
print("Warmup started")
59-
with torch.inference_mode(), tm.timeit("warmup_s"):
60-
self.model.eval()
61-
self.generate(self.warmup_prompt, backend)
62-
print("Warmup done")
63-
6479
self.model.eval()
6580
enabled = backend.dtype != torch.float32
66-
with torch.inference_mode(), torch.autocast(
67-
enabled=enabled, device_type=backend.device_name
68-
), tm.timeit("duration_s"):
69-
tokens, total_time = self.generate(self.prompt, backend)
70-
outputs = [tokens]
7181

72-
results = tm.get_results()
73-
results["samples_per_s"] = len(tokens) / total_time
74-
results["flops_per_sample"] = 1
82+
n_items = 0
83+
outputs = []
84+
fw_times = []
7585

76-
return results, outputs
86+
self.model.eval()
87+
for i in range(self.n_iter):
88+
print(f"Epoch {i+1}/{self.n_iter}")
89+
cast = torch.autocast(enabled=enabled, device_type=backend.device_name)
90+
with torch.inference_mode(), cast:
91+
tokens, total_time = self.generate(backend)
92+
93+
if i < self.warmup_batches:
94+
# We restart timer because that was just a warmup
95+
start = get_time()
96+
continue
97+
98+
print(f"Fw time: {total_time:.1f}")
99+
fw_times.append(total_time)
100+
n_items += math.prod(tokens.shape)
101+
outputs.append(tokens)
102+
103+
stop = get_time()
104+
105+
report = get_report(
106+
fw_times=fw_times,
107+
duration_s=stop - start,
108+
n_items=n_items,
109+
flops_per_sample=1,
110+
)
111+
return report, outputs
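Two details worth calling out in the reworked generate()/inference(): model.generate() returns the prompt tokens followed by the newly generated ones, so the prompt length is sliced off before counting, and that per-iteration token count feeds the throughput report. A small illustrative sketch (shapes invented, not from the repo):

import math
import torch

batch_size, prompt_len, new_len = 2, 5, 3
gen_tokens = torch.arange(batch_size * (prompt_len + new_len)).reshape(batch_size, -1)

# Same slicing as generate(): keep only the tokens appended after the prompt.
output_tokens = gen_tokens[:, prompt_len:]
print(output_tokens.shape)              # torch.Size([2, 3])

# inference() then accumulates n_items += math.prod(tokens.shape) per timed iteration.
print(math.prod(output_tokens.shape))   # 6 new tokens for this batch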

dl_bench/prompt.json

Lines changed: 79 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments
