forked from pleriche/FastMM4
-
Notifications
You must be signed in to change notification settings - Fork 23
/
FastMM4_AVX512.asm
343 lines (299 loc) · 9.08 KB
/
FastMM4_AVX512.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
; This file is needed to enable AVX-512 code for FastMM4-AVX.
; Use "nasm.exe -Ox -f win64 FastMM4_AVX512.asm" to compile this file
; You can get The Netwide Assembler (NASM) from http://www.nasm.us/
; This file is a part of FastMM4-AVX.
; - Copyright (C) 2017-2020 Ritlabs, SRL. All rights reserved.
; - Copyright (C) 2020-2021 Maxim Masiutin. All rights reserved.
; Written by Maxim Masiutin <maxim@masiutin.com>
; FastMM4-AVX is a fork of the Fast Memory Manager 4.992 by Pierre le Riche
; FastMM4-AVX is released under a dual license, and you may choose to use it
; under either the Mozilla Public License 2.0 (MPL 2.0, available from
; https://www.mozilla.org/en-US/MPL/2.0/) or the GNU Lesser General Public
; License Version 3, dated 29 June 2007 (LGPL 3, available from
; https://www.gnu.org/licenses/lgpl.html).
; This code uses zmm26 - zmm31 registers to avoid AVX-SSE transition penalty.
; These registers (zmm16 - zmm31) have no non-VEX counterpart. According to the
; advice of Agner Fog, there is no state transition and no penalty for mixing
; zmm16 - zmm31 with non-VEX SSE code. By using these registers (zmm16 - zmm31)
; rather than zmm0 - zmm15 we avoid having to call "vzeroupper".
; Source:
; https://stackoverflow.com/questions/43879935/avoiding-avx-sse-vex-transition-penalties/54587480#54587480
; Aliases for the EVEX-only scratch registers used by the routines in this
; file. EVEXRwwwNk is the www-bit view of scratch register k; k = 0..5 maps
; to physical registers 31 down to 26.

; Scratch register 0 -> register 31
%define EVEXR128N0 xmm31
%define EVEXR256N0 ymm31
%define EVEXR512N0 zmm31

; Scratch register 1 -> register 30
%define EVEXR128N1 xmm30
%define EVEXR256N1 ymm30
%define EVEXR512N1 zmm30

; Scratch register 2 -> register 29
%define EVEXR128N2 xmm29
%define EVEXR256N2 ymm29
%define EVEXR512N2 zmm29

; Scratch register 3 -> register 28
%define EVEXR128N3 xmm28
%define EVEXR256N3 ymm28
%define EVEXR512N3 zmm28

; Scratch register 4 -> register 27
%define EVEXR128N4 xmm27
%define EVEXR256N4 ymm27
%define EVEXR512N4 zmm27

; Scratch register 5 -> register 26
%define EVEXR128N5 xmm26
%define EVEXR256N5 ymm26
%define EVEXR512N5 zmm26

        section     .text

; Entry points exported from this object file.
        global      Move24AVX512
        global      Move56AVX512
        global      Move88AVX512
        global      Move120AVX512
        global      Move152AVX512
        global      Move184AVX512
        global      Move216AVX512
        global      Move248AVX512
        global      Move280AVX512
        global      Move312AVX512
        global      Move344AVX512
        global      MoveX32LpAvx512WithErms

; Multi-byte NOP padding for "align"; the padding is jumped over only when
; it is 32 bytes or larger (p6 NOP strategy).
%use smartalign
        ALIGNMODE   p6, 32
;-----------------------------------------------------------------------
; Move24AVX512 - copies a fixed-size 24-byte block (16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 16-byte part uses the aligned vmovdqa64 form, so both
;        addresses must be 16-byte aligned.
; Uses:  r8 and xmm31 are scratch; xmm31 is zeroed before returning.
;-----------------------------------------------------------------------
        align       16
Move24AVX512:
        ; Load the whole block before storing any of it.
        vmovdqa64   EVEXR128N0, [rcx+00h]               ; bytes 00h..0Fh
        mov         r8, [rcx+10h]                       ; bytes 10h..17h

        vmovdqa64   [rdx+00h], EVEXR128N0
        mov         [rdx+10h], r8

        ; Zero the vector scratch register.
        vpxord      EVEXR128N0, EVEXR128N0, EVEXR128N0
        ret
;-----------------------------------------------------------------------
; Move56AVX512 - copies a fixed-size 56-byte block (32 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 32-byte part at offset 00h uses the aligned vmovdqa64 form,
;        so both addresses must be 32-byte aligned.
; Uses:  r8, ymm31 and xmm30 are scratch; the vector registers are
;        zeroed before returning.
;-----------------------------------------------------------------------
        align       16                                  ; align the entry point like the other MoveNNAVX512 routines
Move56AVX512:
        ; Load the whole block before storing any of it.
        vmovdqa64   EVEXR256N0, [rcx+00h]               ; bytes 00h..1Fh
        vmovdqa64   EVEXR128N1, [rcx+20h]               ; bytes 20h..2Fh
        mov         r8, [rcx+30h]                       ; bytes 30h..37h

        vmovdqa64   [rdx+00h], EVEXR256N0
        vmovdqa64   [rdx+20h], EVEXR128N1
        mov         [rdx+30h], r8                       ; same offset as the load above (was written as decimal 48)

        ; Zero the vector scratch registers.
        vpxord      EVEXR256N0, EVEXR256N0, EVEXR256N0
        vpxord      EVEXR128N1, EVEXR128N1, EVEXR128N1
        ret
;-----------------------------------------------------------------------
; Move88AVX512 - copies a fixed-size 88-byte block (64 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte part uses the unaligned vmovdqu64 form; the 16-byte
;        part at offset 40h uses the aligned vmovdqa64 form, so both
;        addresses must be 16-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31 and xmm30
;        are scratch and are zeroed before returning.
;-----------------------------------------------------------------------
        align       16
Move88AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 00h..3Fh
        vmovdqa64   EVEXR128N1, [rcx+40h]               ; bytes 40h..4Fh
        mov         rcx, [rcx+50h]                      ; bytes 50h..57h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqa64   [rdx+40h], EVEXR128N1
        mov         [rdx+50h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR128N1, EVEXR128N1, EVEXR128N1
        ret
;-----------------------------------------------------------------------
; Move120AVX512 - copies a fixed-size 120-byte block
;                 (64 + 32 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte part uses the unaligned vmovdqu64 form; the 32-byte
;        part at offset 40h uses the aligned vmovdqa64 form, so both
;        addresses must be 32-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, ymm30 and
;        xmm29 are scratch and are zeroed before returning.
;-----------------------------------------------------------------------
        align       16
Move120AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 00h..3Fh
        vmovdqa64   EVEXR256N1, [rcx+40h]               ; bytes 40h..5Fh
        vmovdqa64   EVEXR128N2, [rcx+60h]               ; bytes 60h..6Fh
        mov         rcx, [rcx+70h]                      ; bytes 70h..77h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqa64   [rdx+40h], EVEXR256N1
        vmovdqa64   [rdx+60h], EVEXR128N2
        mov         [rdx+70h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR256N1, EVEXR256N1, EVEXR256N1
        vpxord      EVEXR128N2, EVEXR128N2, EVEXR128N2
        ret
;-----------------------------------------------------------------------
; Move152AVX512 - copies a fixed-size 152-byte block
;                 (2 x 64 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 16-byte
;        part at offset 80h uses the aligned vmovdqa64 form, so both
;        addresses must be 16-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30 and
;        xmm29 are scratch and are zeroed before returning.
;-----------------------------------------------------------------------
        align       16
Move152AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 00h..3Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 40h..7Fh
        vmovdqa64   EVEXR128N2, [rcx+80h]               ; bytes 80h..8Fh
        mov         rcx, [rcx+90h]                      ; bytes 90h..97h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqa64   [rdx+80h], EVEXR128N2
        mov         [rdx+90h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR128N2, EVEXR128N2, EVEXR128N2
        ret
;-----------------------------------------------------------------------
; Move184AVX512 - copies a fixed-size 184-byte block
;                 (2 x 64 + 32 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 32-byte
;        part at offset 80h uses the aligned vmovdqa64 form, so both
;        addresses must be 32-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30,
;        ymm29 and xmm28 are scratch and are zeroed before returning.
;-----------------------------------------------------------------------
        align       16
Move184AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 00h..3Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 40h..7Fh
        vmovdqa64   EVEXR256N2, [rcx+80h]               ; bytes 80h..9Fh
        vmovdqa64   EVEXR128N3, [rcx+0A0h]              ; bytes 0A0h..0AFh
        mov         rcx, [rcx+0B0h]                     ; bytes 0B0h..0B7h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqa64   [rdx+80h], EVEXR256N2
        vmovdqa64   [rdx+0A0h], EVEXR128N3
        mov         [rdx+0B0h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR256N2, EVEXR256N2, EVEXR256N2
        vpxord      EVEXR128N3, EVEXR128N3, EVEXR128N3
        ret
;-----------------------------------------------------------------------
; Move216AVX512 - copies a fixed-size 216-byte block
;                 (3 x 64 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 16-byte
;        part at offset 0C0h uses the aligned vmovdqa64 form, so both
;        addresses must be 16-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30,
;        zmm29 and xmm28 are scratch and are zeroed before returning.
;-----------------------------------------------------------------------
        align       16
Move216AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 00h..3Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 40h..7Fh
        vmovdqu64   EVEXR512N2, [rcx+80h]               ; bytes 80h..0BFh
        vmovdqa64   EVEXR128N3, [rcx+0C0h]              ; bytes 0C0h..0CFh
        mov         rcx, [rcx+0D0h]                     ; bytes 0D0h..0D7h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqu64   [rdx+80h], EVEXR512N2
        vmovdqa64   [rdx+0C0h], EVEXR128N3
        mov         [rdx+0D0h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR512N2, EVEXR512N2, EVEXR512N2
        vpxord      EVEXR128N3, EVEXR128N3, EVEXR128N3
        ret
;-----------------------------------------------------------------------
; Move248AVX512 - copies a fixed-size 248-byte block
;                 (3 x 64 + 32 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 32-byte
;        part at offset 0C0h uses the aligned vmovdqa64 form, so both
;        addresses must be 32-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30,
;        zmm29, ymm28 and xmm27 are scratch and are zeroed before
;        returning.
;-----------------------------------------------------------------------
        align       16
Move248AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 00h..3Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 40h..7Fh
        vmovdqu64   EVEXR512N2, [rcx+80h]               ; bytes 80h..0BFh
        vmovdqa64   EVEXR256N3, [rcx+0C0h]              ; bytes 0C0h..0DFh
        vmovdqa64   EVEXR128N4, [rcx+0E0h]              ; bytes 0E0h..0EFh
        mov         rcx, [rcx+0F0h]                     ; bytes 0F0h..0F7h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqu64   [rdx+80h], EVEXR512N2
        vmovdqa64   [rdx+0C0h], EVEXR256N3
        vmovdqa64   [rdx+0E0h], EVEXR128N4
        mov         [rdx+0F0h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR512N2, EVEXR512N2, EVEXR512N2
        vpxord      EVEXR256N3, EVEXR256N3, EVEXR256N3
        vpxord      EVEXR128N4, EVEXR128N4, EVEXR128N4
        ret
;-----------------------------------------------------------------------
; Move280AVX512 - copies a fixed-size 280-byte block
;                 (4 x 64 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 16-byte
;        part at offset 100h uses the aligned vmovdqa64 form, so both
;        addresses must be 16-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30,
;        zmm29, zmm28 and xmm27 are scratch and are zeroed before
;        returning.
;-----------------------------------------------------------------------
        align       16
Move280AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 000h..03Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 040h..07Fh
        vmovdqu64   EVEXR512N2, [rcx+80h]               ; bytes 080h..0BFh
        vmovdqu64   EVEXR512N3, [rcx+0C0h]              ; bytes 0C0h..0FFh
        vmovdqa64   EVEXR128N4, [rcx+100h]              ; bytes 100h..10Fh
        mov         rcx, [rcx+110h]                     ; bytes 110h..117h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqu64   [rdx+80h], EVEXR512N2
        vmovdqu64   [rdx+0C0h], EVEXR512N3
        vmovdqa64   [rdx+100h], EVEXR128N4
        mov         [rdx+110h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR512N2, EVEXR512N2, EVEXR512N2
        vpxord      EVEXR512N3, EVEXR512N3, EVEXR512N3
        vpxord      EVEXR128N4, EVEXR128N4, EVEXR128N4
        ret
;-----------------------------------------------------------------------
; Move312AVX512 - copies a fixed-size 312-byte block
;                 (4 x 64 + 32 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 32-byte
;        part at offset 100h uses the aligned vmovdqa64 form, so both
;        addresses must be 32-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30,
;        zmm29, zmm28, ymm27 and xmm26 are scratch and are zeroed
;        before returning.
;-----------------------------------------------------------------------
        align       16
Move312AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 000h..03Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 040h..07Fh
        vmovdqu64   EVEXR512N2, [rcx+80h]               ; bytes 080h..0BFh
        vmovdqu64   EVEXR512N3, [rcx+0C0h]              ; bytes 0C0h..0FFh
        vmovdqa64   EVEXR256N4, [rcx+100h]              ; bytes 100h..11Fh
        vmovdqa64   EVEXR128N5, [rcx+120h]              ; bytes 120h..12Fh
        mov         rcx, [rcx+130h]                     ; bytes 130h..137h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqu64   [rdx+80h], EVEXR512N2
        vmovdqu64   [rdx+0C0h], EVEXR512N3
        vmovdqa64   [rdx+100h], EVEXR256N4
        vmovdqa64   [rdx+120h], EVEXR128N5
        mov         [rdx+130h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR512N2, EVEXR512N2, EVEXR512N2
        vpxord      EVEXR512N3, EVEXR512N3, EVEXR512N3
        vpxord      EVEXR256N4, EVEXR256N4, EVEXR256N4
        vpxord      EVEXR128N5, EVEXR128N5, EVEXR128N5
        ret
;-----------------------------------------------------------------------
; Move344AVX512 - copies a fixed-size 344-byte block
;                 (5 x 64 + 16 + 8 bytes).
; In:    rcx = source address, rdx = destination address
; Note:  the 64-byte parts use the unaligned vmovdqu64 form; the 16-byte
;        part at offset 140h uses the aligned vmovdqa64 form, so both
;        addresses must be 16-byte aligned.
; Uses:  rcx is overwritten with the trailing qword; zmm31, zmm30,
;        zmm29, zmm28, zmm27 and xmm26 are scratch and are zeroed
;        before returning.
;-----------------------------------------------------------------------
        align       16
Move344AVX512:
        ; Load the whole block before storing any of it.
        vmovdqu64   EVEXR512N0, [rcx+00h]               ; bytes 000h..03Fh
        vmovdqu64   EVEXR512N1, [rcx+40h]               ; bytes 040h..07Fh
        vmovdqu64   EVEXR512N2, [rcx+80h]               ; bytes 080h..0BFh
        vmovdqu64   EVEXR512N3, [rcx+0C0h]              ; bytes 0C0h..0FFh
        vmovdqu64   EVEXR512N4, [rcx+100h]              ; bytes 100h..13Fh
        vmovdqa64   EVEXR128N5, [rcx+140h]              ; bytes 140h..14Fh
        mov         rcx, [rcx+150h]                     ; bytes 150h..157h (source pointer no longer needed)

        vmovdqu64   [rdx+00h], EVEXR512N0
        vmovdqu64   [rdx+40h], EVEXR512N1
        vmovdqu64   [rdx+80h], EVEXR512N2
        vmovdqu64   [rdx+0C0h], EVEXR512N3
        vmovdqu64   [rdx+100h], EVEXR512N4
        vmovdqa64   [rdx+140h], EVEXR128N5
        mov         [rdx+150h], rcx

        ; Zero the vector scratch registers.
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1
        vpxord      EVEXR512N2, EVEXR512N2, EVEXR512N2
        vpxord      EVEXR512N3, EVEXR512N3, EVEXR512N3
        vpxord      EVEXR512N4, EVEXR512N4, EVEXR512N4
        vpxord      EVEXR128N5, EVEXR128N5, EVEXR128N5
        ret
;-----------------------------------------------------------------------
; MoveX32LpAvx512WithErms - variable-size block copy.
; In:    rcx = source address
;        rdx = destination address
;        r8  = number of bytes to copy
;
; Let n = r8 - 8. The final 8 bytes are always copied with one 64-bit
; move, except on the rep movsb path, which copies the whole block.
;   n <= 0          : only the final 8-byte move is performed
;   n >= 2048       : rep movsb copies all r8 bytes
;   192 <= n < 2048 : 128-byte zmm loop, then the 16-byte loop below
;   0 < n < 192     : 16-byte xmm loop
;
; The aligned vmovdqa64 forms require 16-byte aligned source and
; destination in the xmm loop, and the big-move path additionally assumes
; 32-byte alignment of both.
; NOTE(review): the loops step by 16 and stop on the sign flag, so n is
; presumably always a multiple of 16 - confirm against the callers.
;
; Uses:  rax, rcx, rdx, r8, r9, r10 and zmm30/zmm31 as scratch. rsi and
;        rdi are used by the rep movsb path and restored afterwards. The
;        vector registers are zeroed on the paths that used them.
;-----------------------------------------------------------------------
        align       16
MoveX32LpAvx512WithErms:

; Make the counter negative based: the last 8 bytes are moved separately
        mov         eax, 8
        sub         r8, rax                             ; r8 = n = count - 8
        add         rcx, r8                             ; rcx -> last 8 bytes of the source
        add         rdx, r8                             ; rdx -> last 8 bytes of the destination
        neg         r8                                  ; r8 = -n, so [rcx+r8] / [rdx+r8] is the current position
        jns         .MoveLast8                          ; n <= 0: nothing to copy except the tail

        cmp         r8, -2048                           ; According to the Intel Manual, rep movsb outperforms AVX copy on blocks of 2048 bytes and above
        jg          .DontDoRepMovsb

        align       4
.DoRepMovsb:
        mov         r10, rsi                            ; keep rsi/rdi in scratch registers so they
        mov         r9, rdi                             ; can be restored after the string move
        lea         rsi, [rcx+r8]                       ; rsi = start of the source block
        lea         rdi, [rdx+r8]                       ; rdi = start of the destination block
        neg         r8
        add         r8, rax                             ; r8 = n + 8 = full byte count (rax is still 8 here)
        mov         rcx, r8
        cld                                             ; copy forwards
        rep movsb
        mov         rdi, r9
        mov         rsi, r10
        ret                                             ; whole block copied, tail included

        align       16
.DontDoRepMovsb:
        cmp         r8, -(128+64)
        jg          .SmallAvxMove                       ; fewer than 192 bytes before the tail

        mov         eax, 128                            ; rax = bytes moved per big-loop iteration

; Bias the pointers down and the counter up by 128. [rcx+r8] and [rdx+r8]
; are unchanged, but "add r8, rax / js" now leaves the loop while up to
; 128 bytes are still pending for the 16-byte loop.
        sub         rcx, rax
        sub         rdx, rax
        add         r8, rax

        lea         r9, [rdx+r8]                        ; r9 = current destination address
        test        r9b, 63
        jz          .Avx512BigMoveDestAligned

; destination is assumed to be 32-byte aligned already, so a single 32-byte
; move brings it to a 64-byte boundary
        vmovdqa64   EVEXR256N0, [rcx+r8]
        vmovdqa64   [rdx+r8], EVEXR256N0
        add         r8, 20h

        align       16
.Avx512BigMoveDestAligned:
; Move a 128 byte block: unaligned loads, 64-byte aligned stores
        vmovdqu64   EVEXR512N0, [rcx+r8+00h]
        vmovdqu64   EVEXR512N1, [rcx+r8+40h]
        vmovdqa64   [rdx+r8+00h], EVEXR512N0
        vmovdqa64   [rdx+r8+40h], EVEXR512N1
        add         r8, rax
        js          .Avx512BigMoveDestAligned

; Undo the 128-byte bias before entering the 16-byte loop
        sub         r8, rax
        add         rcx, rax
        add         rdx, rax

        align       16
.SmallAvxMove:
.MoveLoopAvx:
; Move a 16 byte block
        vmovdqa64   EVEXR128N0, [rcx+r8]
        vmovdqa64   [rdx+r8], EVEXR128N0

; Are there another 16 bytes to move?
        add         r8, 16
        js          .MoveLoopAvx

; Zero the vector scratch registers used above
        vpxord      EVEXR512N0, EVEXR512N0, EVEXR512N0
        vpxord      EVEXR512N1, EVEXR512N1, EVEXR512N1

        align       8
.MoveLast8:
; Do the last 8 bytes
        mov         rcx, [rcx+r8]
        mov         [rdx+r8], rcx
        ret