Skip to content

Commit 70f07c6

Browse files
committed
Fix C++ benchmark.
1 parent d9aacc5 commit 70f07c6

7 files changed

+77
-29
lines changed

.gitignore

-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
*.bin
21
*.out
32
perf.data*

Makefile

+12-6
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
1+
# Build the Google Benchmark harness (gbench.cc); the binary is written to the
# compiler's default output, ./a.out. Google Benchmark's build instructions
# link pthreads alongside the library, so -lpthread is passed explicitly.
.PHONY: gbench-cc
gbench-cc: gbench.cc
	clang++ gbench.cc -O2 -march=haswell -lbenchmark -lpthread
4+
15
.PHONY: naive-cc
26
naive-cc: naive.cc
3-
clang++ naive.cc -O3 -march=haswell -lbenchmark -o naive-cc.bin
4-
./naive-cc.bin
7+
clang++ naive.cc -O2 -march=haswell
58

69
.PHONY: naive-rs
710
naive-rs: naive.rs
8-
rustc -C opt-level=3 -C target-feature=+avx2,+fma -o naive-rs.bin naive.rs
9-
./naive-rs.bin
11+
rustc -C opt-level=3 -C target-feature=+avx2,+fma -o a.out naive.rs
1012

1113
.PHONY: functional-rs
1214
functional-rs: functional.rs
13-
rustc -C opt-level=3 -C target-feature=+avx2,+fma -o functional-rs.bin functional.rs
14-
./functional-rs.bin
15+
rustc -C opt-level=3 -C target-feature=+avx2,+fma -o a.out functional.rs
16+
17+
# Record a perf profile of ./a.out and open the annotated assembly.
# Declared phony so a stray file named "profile" can never suppress the recipe.
.PHONY: profile
profile:
	perf record ./a.out
	perf annotate

README.md

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
# Dot Product Benchmarks
22

3-
## Prerequisites
4-
5-
1. Install https://github.com/google/benchmark
6-
7-
## Usage
3+
Usage:
4+
```sh
5+
make naive-cc && ./a.out
6+
make naive-rs && ./a.out
7+
make functional-rs && ./a.out
8+
```
89

9-
Run benchmarks:
10+
Inspect assembly:
1011
```sh
11-
make naive-cc
12-
make naive-rs
13-
make functional-rs
12+
make naive-cc && make profile
13+
...
1414
```

functional.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ fn dot(a: &[f32], b: &[f32]) -> f32 {
99

1010
fn main() {
1111
const SAMPLES: usize = 10;
12-
const ITERS: usize = 1000;
12+
const ITERS: usize = 10000;
1313
const LEN: usize = 100000;
1414

1515
let a = [0.0; LEN];
@@ -21,6 +21,6 @@ fn main() {
2121
black_box(dot(black_box(&a), black_box(&b)));
2222
}
2323
let time_us = 1e6 * start.elapsed().unwrap().as_secs_f32() / ITERS as f32;
24-
println!("{:4.2} us", time_us);
24+
println!("{:8.2} us", time_us);
2525
}
2626
}

gbench.cc

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#include <benchmark/benchmark.h>
2+
3+
/// Dot product of the first `len` elements of `a` and `b`.
///
/// @param a   first operand, at least `len` readable floats
/// @param b   second operand, at least `len` readable floats
/// @param len number of element pairs to multiply and accumulate
/// @return the accumulated sum of a[i] * b[i]; 0 when `len` is 0
float dot(float *a, float *b, size_t len) {
    // Clang pragma: lets the compiler reassociate the floating-point
    // operations in this function. It must stay first in the body.
#pragma clang fp reassociate(on)
    float acc = 0.0;
    size_t idx = 0;
    while (idx < len) {
        acc += a[idx] * b[idx];
        ++idx;
    }
    return acc;
}
11+
12+
void BM_dot(benchmark::State& state) {
13+
constexpr size_t LEN = 100000;
14+
float a[LEN] = {};
15+
float b[LEN] = {};
16+
17+
for (auto _ : state) {
18+
benchmark::DoNotOptimize(dot(a, b, LEN));
19+
}
20+
}
21+
22+
BENCHMARK(BM_dot);
23+
BENCHMARK_MAIN();

naive.cc

+29-9
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1-
#include <benchmark/benchmark.h>
1+
#include <chrono>
2+
#include <cstdio>
3+
#include <type_traits>
4+
5+
using namespace std::literals::chrono_literals;
6+
7+
/// Optimization barrier: passes `value` through an empty asm statement as a
/// read-write operand, so the compiler has to treat it as used and possibly
/// modified and cannot drop the computation that produced it.
///
/// Pointers are passed as a memory operand with a "memory" clobber, so the
/// data they point to is assumed to be read and written as well; every other
/// type is passed as a register operand.
template<typename T> inline __attribute__((always_inline))
void black_box(T &&value) noexcept {
    // T is deduced through a forwarding reference: an lvalue pointer argument
    // yields T = U*&, for which is_pointer_v<T> is false. Strip the reference
    // first so lvalue pointers take the pointer branch too.
    if constexpr(std::is_pointer_v<std::remove_reference_t<T>>) {
        asm volatile("":"+m"(value)::"memory");
    } else {
        asm volatile("":"+r"(value)::);
    }
}
215

316
float dot(float *a, float *b, size_t len) {
417
#pragma clang fp reassociate(on)
@@ -9,13 +22,20 @@ float dot(float *a, float *b, size_t len) {
922
return sum;
1023
}
1124

12-
void BM_dot(benchmark::State& state) {
13-
constexpr size_t len = 100000;
14-
float a[len], b[len];
15-
for (auto _ : state) {
16-
benchmark::DoNotOptimize(dot(a, b, len));
25+
int main() {
26+
constexpr size_t SAMPLES = 10;
27+
constexpr size_t ITERS = 10000;
28+
constexpr size_t LEN = 100000;
29+
30+
float a[LEN] = {};
31+
float b[LEN] = {};
32+
33+
for (size_t s = 0; s < SAMPLES; ++s) {
34+
auto start = std::chrono::system_clock::now();
35+
for (size_t i = 0; i < ITERS; ++i) {
36+
black_box(dot(a, b, LEN));
37+
}
38+
float time_us = (std::chrono::system_clock::now() - start) / 1ns / 1e3 / ITERS;
39+
printf("%8.2f us\n", time_us);
1740
}
1841
}
19-
20-
BENCHMARK(BM_dot);
21-
BENCHMARK_MAIN();

naive.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ fn dot(a: &[f32], b: &[f32]) -> f32 {
1414

1515
fn main() {
1616
const SAMPLES: usize = 10;
17-
const ITERS: usize = 1000;
17+
const ITERS: usize = 10000;
1818
const LEN: usize = 100000;
1919

2020
let a = [0.0; LEN];
@@ -26,6 +26,6 @@ fn main() {
2626
black_box(dot(black_box(&a), black_box(&b)));
2727
}
2828
let time_us = 1e6 * start.elapsed().unwrap().as_secs_f32() / ITERS as f32;
29-
println!("{:4.2} us", time_us);
29+
println!("{:8.2} us", time_us);
3030
}
3131
}

0 commit comments

Comments
 (0)