forked from mratsim/laser
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreduction_packed_sse.nim
314 lines (263 loc) · 10.5 KB
/
reduction_packed_sse.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# Apache v2.0 License
# Copyright (c) 2018 Mamy André-Ratsimbazafy
import
../../laser/strided_iteration/foreach,
../../laser/tensor/[allocator, datatypes, initialization],
../../laser/[compiler_optim_hints, dynamic_stack_arrays],
../../laser/simd,
../../laser/primitives/reductions
# Module-wide setup from laser/compiler_optim_hints: emits compiler hints
# (pragma/flags) so the benchmark kernels below are compiled aggressively.
withCompilerOptimHints()
proc newTensor*[T](shape: varargs[int]): Tensor[T] =
  ## Allocate a tensor of the given `shape` on the CPU and zero-fill it.
  var numElems: int
  initTensorMetadata(result, numElems, shape)
  result.storage.allocCpuStorage(numElems)
  result.setZero(check_contiguous = false)
proc newTensor*[T](shape: Metadata): Tensor[T] =
  ## Overload of `newTensor` taking an existing `Metadata` shape;
  ## allocates CPU storage and zero-fills it.
  var numElems: int
  initTensorMetadata(result, numElems, shape)
  result.storage.allocCpuStorage(numElems)
  result.setZero(check_contiguous = false)
proc randomTensor*[T](shape: openarray[int], valrange: Slice[T]): Tensor[T] =
  ## Allocate a tensor of `shape` and fill every element with a value
  ## drawn uniformly from `valrange` (uses the global std RNG state,
  ## so `randomize(seed)` controls reproducibility).
  var numElems: int
  initTensorMetadata(result, numElems, shape)
  result.storage.allocCpuStorage(numElems)
  # No setZero needed: every element is overwritten below.
  forEachContiguousSerial val in result:
    val = T(rand(valrange))
func transpose*(t: Tensor): Tensor =
  ## Return a transposed *view* of `t`: axes are reversed by reversing
  ## shape and strides; the storage is shared, no data is copied.
  result.offset = t.offset
  result.storage = t.storage
  t.shape.reversed(result.shape)
  t.strides.reversed(result.strides)
func getIndex[T](t: Tensor[T], idx: varargs[int]): int =
  ## Convert a multi-dimensional index [i, j, k, l, ...] into the flat
  ## offset within the tensor's backing storage (strided addressing).
  result = t.offset
  for axis in 0 ..< t.shape.len:
    result += idx[axis] * t.strides[axis]
func `[]`*[T](t: Tensor[T], idx: varargs[int]): T {.inline.}=
  ## Element access: read the value at multi-dimensional index `idx`.
  result = t.storage.raw_data[getIndex(t, idx)]
################################################################
import random, times, stats, strformat, math
proc warmup() =
  ## Busy-loop for a while so CPU frequency scaling reaches max perf
  ## before any benchmark sample is taken. The accumulated value is
  ## printed so the compiler cannot eliminate the loop as dead code.
  let t0 = epochTime() # cpuTime() - cannot use cpuTime for multithreaded
  var acc = 123
  var i = 0
  while i < 300_000_000:
    acc += i*i mod 456
    acc = acc mod 789
    inc i
  let t1 = epochTime() # cpuTime() - cannot use cpuTime for multithreaded
  echo &"Warmup: {t1 - t0:>4.4f} s, result {acc} (displayed to avoid compiler optimizing warmup away)"
template printStats(name: string, accum: float32) {.dirty.} =
  ## Print the timing summary for one benchmark run.
  ## NOTE: this is a `.dirty.` (non-hygienic) template — it deliberately
  ## reads symbols from the expansion site: `stats` (RunningStat),
  ## `global_start`/`global_stop` (declared by the `bench` template below)
  ## and `a` (the benchmarked tensor, a proc parameter at the call site).
  echo "\n" & name & " - float32"
  echo &"Collected {stats.n} samples in {global_stop - global_start:>4.3f} seconds"
  echo &"Average time: {stats.mean * 1000 :>4.3f} ms"
  echo &"Stddev time: {stats.standardDeviationS * 1000 :>4.3f} ms"
  echo &"Min time: {stats.min * 1000 :>4.3f} ms"
  echo &"Max time: {stats.max * 1000 :>4.3f} ms"
  # FLOPS: for sum, we have one add per element
  echo &"Perf: {a.size.float / (float(10^9) * stats.mean):>4.3f} GFLOP/s"
  echo "\nDisplay sum of samples sums to make sure it's not optimized away"
  echo accum # Prevents compiler from optimizing stuff away
template bench(name: string, accum: var float32, body: untyped) {.dirty.}=
  ## Run `body` `nb_samples` times (symbol captured from the call site —
  ## this template is `.dirty.`), recording each iteration's wall-clock
  ## time into a RunningStat, then report via `printStats`.
  ## The identifiers `stats`, `global_start`, `global_stop` declared here
  ## are intentionally visible to `printStats` — do not rename them.
  block: # Actual bench
    var stats: RunningStat
    let global_start = epochTime() # cpuTime() - cannot use cpuTime for multithreaded
    for _ in 0 ..< nb_samples:
      let start = epochTime() # cpuTime() - cannot use cpuTime for multithreaded
      body
      let stop = epochTime() # cpuTime() - cannot use cpuTime for multithreaded
      stats.push stop - start
    let global_stop = epochTime() # cpuTime() - cannot use cpuTime for multithreaded
    printStats(name, accum)
func round_down_power_of_2(x: Natural, step: static Natural): int {.inline.} =
  ## Round `x` down to the nearest multiple of `step`.
  ## `step` must be a power of two (verified at compile time), which
  ## allows the rounding to be done with a single masking subtraction.
  static: assert (step and (step - 1)) == 0, "Step must be a power of 2"
  # Equivalent to `x and not(step - 1)` for non-negative x.
  result = x - (x and (step - 1))
func sum_ps_sse3(vec: m128): float32 =
  ## Horizontal sum of the four float32 lanes of an SSE register.
  # Classic SSE3 reduction: duplicate the odd lanes, add pairwise,
  # then fold the upper half onto the lower half.
  let dupOdd = mm_movehdup_ps(vec)
  let pairSums = mm_add_ps(vec, dupOdd)
  let upperHalf = mm_movehl_ps(pairSums, pairSums)
  result = mm_cvtss_f32(mm_add_ss(pairSums, upperHalf))
func sum_ps_avx(vec: m256): float32 =
  ## Horizontal sum of the eight float32 lanes of an AVX register:
  ## add the upper 128-bit half onto the lower one, then finish with
  ## the SSE3 4-lane reduction.
  let lower = mm256_castps256_ps128(vec)
  let upper = mm256_extractf128_ps(vec, 1)
  result = sum_ps_sse3(mm_add_ps(lower, upper))
proc mainBench_4_packed_sse_accums(a: Tensor[float32], nb_samples: int) =
  ## Sum-reduction benchmark: one m128 accumulator, 4 floats per iteration.
  var accum = 0'f32
  bench("Reduction - packed 4 accumulators SSE", accum):
    let len = a.size
    let vecStop = len.round_down_power_of_2(4)
    var vecSum: m128            # Nim zero-initializes this
    var ptr_data = a.unsafe_raw_data()
    for i in countup(0, vecStop - 1, 4):
      # Can't use ptr_data, no address :/
      let chunk = mm_load_ps(a.storage.raw_data[i].unsafeaddr)
      vecSum = mm_add_ps(vecSum, chunk)
    for i in vecStop ..< len:   # scalar tail
      accum += ptr_data[i]
    accum += vecSum.sum_ps_sse3()
proc mainBench_8_packed_sse_accums(a: Tensor[float32], nb_samples: int) =
  ## Sum-reduction benchmark: two independent m128 accumulators
  ## (8 floats per iteration) to hide vector-add latency.
  var accum = 0'f32
  bench("Reduction - packed 8 accumulators SSE", accum):
    let len = a.size
    let vecStop = len.round_down_power_of_2(8)
    var sum0, sum1: m128
    var ptr_data = a.unsafe_raw_data()
    for i in countup(0, vecStop - 1, 8):
      # Can't use ptr_data, no address :/
      let
        chunk0 = mm_load_ps(a.storage.raw_data[i  ].unsafeaddr)
        chunk1 = mm_load_ps(a.storage.raw_data[i+4].unsafeaddr)
      sum0 = mm_add_ps(sum0, chunk0)
      sum1 = mm_add_ps(sum1, chunk1)
    for i in vecStop ..< len:   # scalar tail
      accum += ptr_data[i]
    # Fold each vector separately and add in the original order to keep
    # float rounding identical.
    let part0 = sum0.sum_ps_sse3()
    let part1 = sum1.sum_ps_sse3()
    accum += part0
    accum += part1
proc mainBench_packed_sse_prod(a: Tensor[float32], nb_samples: int) =
  ## Baseline: benchmark Laser's production reduction (`sum_kernel` from
  ## laser/primitives/reductions) on the same tensor for comparison with
  ## the hand-rolled SSE/AVX variants above.
  var accum = 0'f32
  bench("Reduction - prod impl", accum):
    accum += sum_kernel(a.storage.raw_data, a.size)
proc mainBench_8_packed_avx_accums(a: Tensor[float32], nb_samples: int) =
  ## Sum-reduction benchmark: one m256 accumulator, 8 floats per iteration.
  var accum = 0'f32
  bench("Reduction - packed 8 accumulators AVX", accum):
    let len = a.size
    let vecStop = len.round_down_power_of_2(8)
    var vecSum: m256            # Nim zero-initializes this
    var ptr_data = a.unsafe_raw_data()
    for i in countup(0, vecStop - 1, 8):
      # Can't use ptr_data, no address :/
      let chunk = mm256_load_ps(a.storage.raw_data[i].unsafeaddr)
      vecSum = mm256_add_ps(vecSum, chunk)
    for i in vecStop ..< len:   # scalar tail
      accum += ptr_data[i]
    accum += vecSum.sum_ps_avx()
proc mainBench_16_packed_avx_accums(a: Tensor[float32], nb_samples: int) =
  ## Sum-reduction benchmark: two independent m256 accumulators
  ## (16 floats per iteration) to hide vector-add latency.
  var accum = 0'f32
  bench("Reduction - packed 16 accumulators AVX", accum):
    let len = a.size
    let vecStop = len.round_down_power_of_2(16)
    var sum0, sum1: m256
    var ptr_data = a.unsafe_raw_data()
    for i in countup(0, vecStop - 1, 16):
      # Can't use ptr_data, no address :/
      let
        chunk0 = mm256_load_ps(a.storage.raw_data[i  ].unsafeaddr)
        chunk1 = mm256_load_ps(a.storage.raw_data[i+8].unsafeaddr)
      sum0 = mm256_add_ps(sum0, chunk0)
      sum1 = mm256_add_ps(sum1, chunk1)
    for i in vecStop ..< len:   # scalar tail
      accum += ptr_data[i]
    # Same accumulation order as the original: (sum0 + sum1) then accum.
    accum += sum0.sum_ps_avx() + sum1.sum_ps_avx()
# Optional compile-time switches: pass -d:fastmath / -d:march_native
# to enable the corresponding C compiler flags.
when defined(fastmath):
  {.passC:"-ffast-math".}
when defined(march_native):
  {.passC:"-march=native".}

when isMainModule:
  randomize(42) # For reproducibility
  warmup()
  block: # All contiguous
    let
      a = randomTensor([10000, 1000], -1.0'f32 .. 1.0'f32)
    mainBench_4_packed_sse_accums(a, 1000)
    mainBench_8_packed_sse_accums(a, 1000)
    mainBench_packed_sse_prod(a, 1000)
    # NOTE(review): passC is a global (whole-compilation-unit) flag, not
    # scoped to the statements below — placed here it still affects the
    # entire C file, including the SSE benches above. Verify intent.
    {.passC: "-mavx".}
    mainBench_8_packed_avx_accums(a, 1000)
    mainBench_16_packed_avx_accums(a, 1000)
## Bench on i5 Broadwell - serial implementation
# Warmup: 1.1946 s, result 224 (displayed to avoid compiler optimizing warmup away)
# Reduction - packed 4 accumulators SSE - float32
# Collected 1000 samples in 2.841 seconds
# Average time: 2.837 ms
# Stddev time: 0.251 ms
# Min time: 2.569 ms
# Max time: 5.680 ms
# Theoretical perf: 3524.917 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356696.15625
# Reduction - packed 8 accumulators SSE - float32
# Collected 1000 samples in 2.502 seconds
# Average time: 2.498 ms
# Stddev time: 0.213 ms
# Min time: 2.299 ms
# Max time: 5.111 ms
# Theoretical perf: 4003.616 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356923.1875
# Reduction - prod impl - float32
# Collected 1000 samples in 2.442 seconds
# Average time: 2.439 ms
# Stddev time: 0.162 ms
# Min time: 2.274 ms
# Max time: 4.916 ms
# Theoretical perf: 4100.865 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -170817.09375
# Reduction - packed 8 accumulators AVX - float32
# Collected 1000 samples in 2.567 seconds
# Average time: 2.563 ms
# Stddev time: 0.186 ms
# Min time: 2.373 ms
# Max time: 5.158 ms
# Theoretical perf: 3902.290 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356915.03125
# Reduction - packed 16 accumulators AVX - float32
# Collected 1000 samples in 2.580 seconds
# Average time: 2.576 ms
# Stddev time: 0.230 ms
# Min time: 2.371 ms
# Max time: 5.134 ms
# Theoretical perf: 3881.285 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356914.875
################################################################
## Bench on i5 Broadwell - prod implementation is OpenMP-enabled
# Unfortunately we are memory-bandwidth bound
# Warmup: 1.1888 s, result 224 (displayed to avoid compiler optimizing warmup away)
# Reduction - packed 4 accumulators SSE - float32
# Collected 1000 samples in 2.825 seconds
# Average time: 2.824 ms
# Stddev time: 0.259 ms
# Min time: 2.552 ms
# Max time: 5.193 ms
# Theoretical perf: 3540.637 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356696.15625
# Reduction - packed 8 accumulators SSE - float32
# Collected 1000 samples in 2.498 seconds
# Average time: 2.498 ms
# Stddev time: 0.227 ms
# Min time: 2.266 ms
# Max time: 4.867 ms
# Theoretical perf: 4003.727 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356923.1875
# Reduction - prod impl - float32
# Collected 1000 samples in 2.129 seconds
# Average time: 2.129 ms
# Stddev time: 0.190 ms
# Min time: 1.925 ms
# Max time: 3.508 ms
# Theoretical perf: 4697.260 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356874.40625
# Reduction - packed 8 accumulators AVX - float32
# Collected 1000 samples in 2.539 seconds
# Average time: 2.538 ms
# Stddev time: 0.255 ms
# Min time: 2.327 ms
# Max time: 5.358 ms
# Theoretical perf: 3939.552 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356915.03125
# Reduction - packed 16 accumulators AVX - float32
# Collected 1000 samples in 2.528 seconds
# Average time: 2.528 ms
# Stddev time: 0.221 ms
# Min time: 2.336 ms
# Max time: 5.301 ms
# Theoretical perf: 3955.728 MFLOP/s
# Display sum of samples sums to make sure it's not optimized away
# -356914.875