Skip to content

Commit 76ad361

Browse files
author
Stefan Savic
committed
Improved performance
1 parent 4c81695 commit 76ad361

File tree

2 files changed

+83
-38
lines changed

2 files changed

+83
-38
lines changed

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 1 addition & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -360,32 +360,14 @@ void main() {
360360
cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i];
361361
}
362362
}
363-
// [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
364-
// [[unroll]] for (uint j = 0; j < TN; j++) {
365-
// cache_b[j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
366-
// }
367-
368-
// [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
369-
// [[unroll]] for (uint cc = 0; cc < TN; cc++) {
370-
// [[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
371-
// const uint sums_idx = (cr * WNITER + wsic) * (WMITER * TN) + cc * WMITER + wsir;
372-
// sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx].x));
373-
// sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx].y));
374-
// }
375-
// }
376-
// }
377-
// }
378363

379364
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
380365
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
381366
cache_b = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + i];
382367

383368
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
384369
[[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
385-
// [TM / 2][WNITER][TN][WMITER]
386-
// const uint sums_idx = (cr * WNITER + wsic) * (WMITER * TN) + cc * WMITER + wsir;
387-
388-
// [WNITER][TN][WMITER][TM / 2] -> [wsic][]
370+
// [WNITER][TN][WMITER][TM / 2] -> [wsic][cc][wsir][cr]
389371
const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
390372
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), sums[sums_idx].x));
391373
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), sums[sums_idx].y));

stefan.py

Lines changed: 82 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -4,32 +4,36 @@
44
import re
55
from datetime import datetime
66

7-
def run_benchmark(command):
8-
print(f" Running: {command}")
9-
result = subprocess.run(command, shell=True, capture_output=True, text=True)
10-
return result.stdout
7+
def run_benchmark(commands):
8+
results = []
9+
for cmd in commands:
10+
print(f" Running: {cmd}")
11+
completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
12+
results.append(completed.stdout)
13+
return results
14+
15+
def parse_output(outputs):
1116

12-
def parse_output(output):
1317
pattern = re.compile(r"^(MUL_MAT(?:_ID)?\(.*?\)):\s+\d+\s+runs\s+-\s+([\d.]+)\s+us/run")
1418
perf_data = {}
1519

16-
for line in output.splitlines():
17-
match = pattern.match(line.strip())
18-
if match:
19-
kernel = match.group(1)
20-
us_run = float(match.group(2))
21-
perf_data[kernel] = us_run
20+
for output in outputs:
21+
for line in output.splitlines():
22+
match = pattern.match(line.strip())
23+
if match:
24+
kernel = match.group(1)
25+
us_run = float(match.group(2))
26+
perf_data[kernel] = us_run
2227
return perf_data
2328

24-
def generate_markdown(before, after, label_before, label_after):
29+
def generate_markdown(before, after):
2530
from datetime import datetime
2631
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2732
filename = f"perf_comparison_{timestamp}.md"
2833

2934
lines = [
3035
f"# Performance Comparison",
31-
f"Comparing `{label_before}` vs `{label_after}`\n",
32-
"| Kernel | {0} (us/run) | {1} (us/run) | Δ % |".format(label_before, label_after),
36+
"| Kernel | Before(us/run) | After(us/run) | Δ % |",
3337
"|--------|--------------|--------------|-----|"
3438
]
3539

@@ -38,7 +42,7 @@ def generate_markdown(before, after, label_before, label_after):
3842
val2 = after.get(kernel)
3943

4044
if val1 is not None and val2 is not None:
41-
delta = ((val2 - val1) / val1) * 100
45+
delta = ((val1 - val2) / val1) * 100
4246
lines.append(f"| `{kernel}` | {val1:.2f} | {val2:.2f} | {delta:+.2f}% |")
4347
elif val1 is not None:
4448
lines.append(f"| `{kernel}` | {val1:.2f} | N/A | N/A |")
@@ -50,14 +54,73 @@ def generate_markdown(before, after, label_before, label_after):
5054
print(f"Markdown report saved to: {filename}")
5155

5256
if __name__ == "__main__":
53-
# Customize these two commands
54-
cmd_before = '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o MUL_MAT'
55-
cmd_after = '/home/stefan/final/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o MUL_MAT'
57+
result = subprocess.run("env | grep GGML", shell=True, capture_output=True, text=True)
58+
print(result.stdout)
59+
60+
cmd_before = ['/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o MUL_MAT_ID']
61+
# cmd_before = [
62+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
63+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
64+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
65+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
66+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
67+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
68+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
69+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
70+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
71+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
72+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
73+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
74+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
75+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
76+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
77+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
78+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
79+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
80+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
81+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
82+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
83+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
84+
# '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"'
85+
# ]
86+
5687

88+
cmd_after = ['/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o MUL_MAT_ID']
89+
90+
# cmd_after = [
91+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
92+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
93+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
94+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
95+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
96+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
97+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
98+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
99+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
100+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
101+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
102+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
103+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
104+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
105+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
106+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
107+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
108+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
109+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
110+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
111+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
112+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"',
113+
# '/home/stefan/finalno/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o "MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)"'
114+
# ]
115+
116+
print("BEFORE")
57117
output_before = run_benchmark(cmd_before)
118+
print("AFTER")
58119
output_after = run_benchmark(cmd_after)
59120

60121
data_before = parse_output(output_before)
61122
data_after = parse_output(output_after)
62123

63-
generate_markdown(data_before, data_after, "MUL_MAT", "MUL_MAT")
124+
# print(data_before)
125+
126+
generate_markdown(data_before, data_after)

0 commit comments

Comments
 (0)