This project provides a unified compute & graphics host API, as well as a unified compute & graphics C++ device language and library to enable same-source CUDA/Host/Metal/OpenCL/Vulkan programming and execution.
The unified host API is implemented at compute and graphics. All backends (CUDA/Host/Metal/OpenCL/Vulkan) currently provide compute support, while graphics support is limited to Metal and Vulkan.
To provide a unified device language, a clang/LLVM/libc++ 14.0 toolchain has been modified.
Certain parts of libfloor are used by both host and device code (math and constexpr). Additional device library code is located at device.
Advanced examples can be found in the floor_examples repository.
Let’s take the fairly simple C++ kernel below, which computes the body/body interactions in an N-body simulation, and compile it for each backend. Note that loop unrolling is omitted for conciseness.
// define global constants (compile-time values used by the simplified_nbody kernel)
// number of bodies per tile: used as the kernel's required 1D local size and as the length of its local memory array
static constexpr constant const uint32_t NBODY_TILE_SIZE { 256u };
// factor the velocity is multiplied by on every integration step
static constexpr constant const float NBODY_DAMPING { 0.999f };
// its square is added to every squared body/body distance, so that term is never zero (not even for a body paired with itself)
static constexpr constant const float NBODY_SOFTENING { 0.01f };
// define a 1D kernel with a required local size of (NBODY_TILE_SIZE = 256, 1, 1)
//
// simplified_nbody: performs one integration step of an N-body simulation.
// Each work-item accumulates the acceleration that the bodies in in_positions exert on its own body,
// then updates that body's velocity and position.
//   in_positions     - per-body float4: .xyz is the position, .w is the mass
//   out_positions    - per-body float4 whose .xyz is advanced by velocity * time_delta
//   inout_velocities - per-body float3 velocity, read at the start and overwritten at the end
//   time_delta       - time step used for both the velocity and the position update
// NOTE(review): the tile loop reads in_positions[tile * NBODY_TILE_SIZE + local_id.x] for every tile below
// global_size.x, so this presumably relies on global_size.x being a multiple of NBODY_TILE_SIZE - confirm.
kernel_1d(NBODY_TILE_SIZE)
void simplified_nbody(buffer<const float4> in_positions, // read-only global memory buffer
buffer<float4> out_positions, // read-write global memory buffer
buffer<float3> inout_velocities, // read-write global memory buffer
param<float> time_delta) { // read-only parameter
// each work-item represents/computes one body
const auto position = in_positions[global_id.x];
auto velocity = inout_velocities[global_id.x];
float3 acceleration; // vectors are automatically zero-initialized
local_buffer<float4, NBODY_TILE_SIZE> local_body_positions; // local memory array allocation
// loop over all bodies, one tile of NBODY_TILE_SIZE bodies per iteration
// (i counts bodies covered so far, tile is the index of the current tile)
for (uint32_t i = 0, tile = 0, count = global_size.x; i < count; i += NBODY_TILE_SIZE, ++tile) {
// move resp. body position/mass from global to local memory
// (each work-item copies exactly one element of the current tile)
local_body_positions[local_id.x] = in_positions[tile * NBODY_TILE_SIZE + local_id.x];
local_barrier(); // barrier across all work-items in this work-group: the whole tile must be written before it is read
// loop over bodies in this work-group
for (uint32_t j = 0; j < NBODY_TILE_SIZE; ++j) {
// vector from this work-item's body to body j of the tile
const auto r = local_body_positions[j].xyz - position.xyz;
// softened squared distance
const auto dist_sq = r.dot(r) + (NBODY_SOFTENING * NBODY_SOFTENING);
const auto inv_dist = rsqrt(dist_sq);
// s = mass / distance^3, so r * s is the contribution of body j to the acceleration
const auto s = local_body_positions[j].w * (inv_dist * inv_dist * inv_dist); // .w is mass
acceleration += r * s;
}
// all work-items must be done reading this tile before the next iteration overwrites it
local_barrier();
}
// integrate: apply the accumulated acceleration to the velocity, then damp it
velocity = (velocity + acceleration * time_delta) * NBODY_DAMPING;
// NOTE(review): this adds to the value already stored in out_positions (not to in_positions), so
// out_positions presumably already holds this body's previous position - confirm against the host code.
out_positions[global_id.x].xyz += velocity * time_delta; // update XYZ position
inout_velocities[global_id.x] = velocity; // update velocity
}
click to unfold the output for each backend
CUDA / PTX
You can download the PTX file here and the CUBIN file here (note that building CUBINs is optional and requires `ptxas`).
//
// Generated by LLVM NVPTX Back-End
//
.version 8.4
.target sm_86
.address_size 64
// .globl simplified_nbody
// _ZZ16simplified_nbodyE20local_body_positions has been demoted
.visible .entry simplified_nbody(
.param .u64 simplified_nbody_param_0,
.param .u64 simplified_nbody_param_1,
.param .u64 simplified_nbody_param_2,
.param .f32 simplified_nbody_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<3>;
.reg .b32 %r<25>;
.reg .f32 %f<71>;
.reg .b64 %rd<18>;
// demoted variable
.shared .align 4 .b8 _ZZ16simplified_nbodyE20local_body_positions[4096];
mov.u32 %r1, %tid.x;
mov.u32 %r11, %ntid.x;
mov.u32 %r12, %ctaid.x;
mad.lo.s32 %r13, %r12, %r11, %r1;
cvt.u64.u32 %rd3, %r13;
mul.wide.u32 %rd7, %r13, 12;
ld.param.u64 %rd8, [simplified_nbody_param_2];
cvta.to.global.u64 %rd9, %rd8;
add.s64 %rd4, %rd9, %rd7;
ld.global.f32 %f6, [%rd4+8];
add.s64 %rd6, %rd4, 8;
ld.global.f32 %f5, [%rd4+4];
add.s64 %rd5, %rd4, 4;
ld.global.f32 %f4, [%rd4];
mul.wide.u32 %rd10, %r13, 16;
ld.param.u64 %rd11, [simplified_nbody_param_0];
cvta.to.global.u64 %rd2, %rd11;
add.s64 %rd12, %rd2, %rd10;
ld.global.nc.f32 %f3, [%rd12+8];
ld.global.nc.f32 %f2, [%rd12+4];
ld.global.nc.f32 %f1, [%rd12];
mov.u32 %r14, %nctaid.x;
mul.lo.s32 %r2, %r14, %r11;
shl.b32 %r15, %r1, 4;
mov.u32 %r16, _ZZ16simplified_nbodyE20local_body_positions;
add.s32 %r3, %r16, %r15;
ld.param.u64 %rd13, [simplified_nbody_param_1];
cvta.to.global.u64 %rd1, %rd13;
mov.f32 %f68, 0f00000000;
mov.u32 %r10, 0;
ld.param.f32 %f16, [simplified_nbody_param_3];
mov.u32 %r22, %r10;
mov.u32 %r23, %r10;
mov.f32 %f69, %f68;
mov.f32 %f70, %f68;
LBB0_1:
shl.b32 %r18, %r23, 8;
add.s32 %r19, %r18, %r1;
mul.wide.u32 %rd14, %r19, 16;
add.s64 %rd15, %rd2, %rd14;
ld.global.nc.f32 %f18, [%rd15];
st.shared.f32 [%r3], %f18;
ld.global.nc.f32 %f19, [%rd15+4];
st.shared.f32 [%r3+4], %f19;
ld.global.nc.f32 %f20, [%rd15+8];
st.shared.f32 [%r3+8], %f20;
ld.global.nc.f32 %f21, [%rd15+12];
st.shared.f32 [%r3+12], %f21;
barrier.sync 0;
mov.u32 %r24, %r10;
LBB0_2:
add.s32 %r21, %r16, %r24;
ld.shared.f32 %f22, [%r21+4];
sub.f32 %f23, %f22, %f2;
ld.shared.f32 %f24, [%r21];
sub.f32 %f25, %f24, %f1;
fma.rn.f32 %f26, %f25, %f25, 0f38D1B717;
fma.rn.f32 %f27, %f23, %f23, %f26;
ld.shared.f32 %f28, [%r21+8];
sub.f32 %f29, %f28, %f3;
fma.rn.f32 %f30, %f29, %f29, %f27;
rsqrt.approx.ftz.f32 %f31, %f30;
mul.f32 %f32, %f31, %f31;
mul.f32 %f33, %f32, %f31;
ld.shared.f32 %f34, [%r21+12];
mul.f32 %f35, %f33, %f34;
fma.rn.f32 %f36, %f35, %f29, %f68;
ld.shared.f32 %f37, [%r21+20];
sub.f32 %f38, %f37, %f2;
ld.shared.f32 %f39, [%r21+16];
sub.f32 %f40, %f39, %f1;
fma.rn.f32 %f41, %f40, %f40, 0f38D1B717;
fma.rn.f32 %f42, %f38, %f38, %f41;
ld.shared.f32 %f43, [%r21+24];
sub.f32 %f44, %f43, %f3;
fma.rn.f32 %f45, %f44, %f44, %f42;
rsqrt.approx.ftz.f32 %f46, %f45;
mul.f32 %f47, %f46, %f46;
mul.f32 %f48, %f47, %f46;
ld.shared.f32 %f49, [%r21+28];
mul.f32 %f50, %f48, %f49;
fma.rn.f32 %f68, %f50, %f44, %f36;
fma.rn.f32 %f51, %f35, %f23, %f69;
fma.rn.f32 %f69, %f50, %f38, %f51;
fma.rn.f32 %f52, %f35, %f25, %f70;
fma.rn.f32 %f70, %f50, %f40, %f52;
add.s32 %r24, %r24, 32;
setp.eq.s32 %p1, %r24, 4096;
@%p1 bra LBB0_3;
bra.uni LBB0_2;
LBB0_3:
add.s32 %r22, %r22, 256;
setp.lt.u32 %p2, %r22, %r2;
barrier.sync 0;
add.s32 %r23, %r23, 1;
@%p2 bra LBB0_1;
fma.rn.f32 %f53, %f70, %f16, %f4;
mul.f32 %f54, %f53, 0f3F7FBE77;
shl.b64 %rd16, %rd3, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.f32 %f55, [%rd17];
fma.rn.f32 %f56, %f54, %f16, %f55;
st.global.f32 [%rd17], %f56;
fma.rn.f32 %f57, %f69, %f16, %f5;
mul.f32 %f58, %f57, 0f3F7FBE77;
ld.global.f32 %f59, [%rd17+4];
fma.rn.f32 %f60, %f58, %f16, %f59;
st.global.f32 [%rd17+4], %f60;
fma.rn.f32 %f61, %f68, %f16, %f6;
mul.f32 %f62, %f61, 0f3F7FBE77;
ld.global.f32 %f63, [%rd17+8];
fma.rn.f32 %f64, %f62, %f16, %f63;
st.global.f32 [%rd17+8], %f64;
st.global.f32 [%rd4], %f54;
st.global.f32 [%rd5], %f58;
st.global.f32 [%rd6], %f62;
ret;
}
Host-Compute (x86 CPU)
Note that the compiler would usually directly output a .bin file (ELF format). The output below comes from disassembling it with `objdump -d`.
Also note that this has been compiled for the `x86-5` target (AVX-512+).
nbody.bin: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <simplified_nbody>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: 41 57 push %r15
6: 41 56 push %r14
8: 41 55 push %r13
a: 41 54 push %r12
c: 53 push %rbx
d: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
11: 48 81 ec 40 09 00 00 sub $0x940,%rsp
18: 48 8d 05 f9 ff ff ff lea -0x7(%rip),%rax # 18 <simplified_nbody+0x18>
1f: 49 be 00 00 00 00 00 movabs $0x0,%r14
26: 00 00 00
29: 48 89 4c 24 50 mov %rcx,0x50(%rsp)
2e: 48 89 74 24 68 mov %rsi,0x68(%rsp)
33: 48 89 7c 24 48 mov %rdi,0x48(%rsp)
38: 49 01 c6 add %rax,%r14
3b: 48 b8 00 00 00 00 00 movabs $0x0,%rax
42: 00 00 00
45: 49 8b 04 06 mov (%r14,%rax,1),%rax
49: 8b 00 mov (%rax),%eax
4b: 48 8d 0c 40 lea (%rax,%rax,2),%rcx
4f: 48 89 c6 mov %rax,%rsi
52: 48 c1 e6 04 shl $0x4,%rsi
56: 48 89 74 24 58 mov %rsi,0x58(%rsp)
5b: 48 8d 04 8a lea (%rdx,%rcx,4),%rax
5f: c5 fa 10 04 8a vmovss (%rdx,%rcx,4),%xmm0
64: c5 f9 6e 54 8a 04 vmovd 0x4(%rdx,%rcx,4),%xmm2
6a: c5 fa 10 4c 8a 08 vmovss 0x8(%rdx,%rcx,4),%xmm1
70: 48 89 44 24 60 mov %rax,0x60(%rsp)
75: 48 b8 00 00 00 00 00 movabs $0x0,%rax
7c: 00 00 00
7f: 49 8b 04 06 mov (%r14,%rax,1),%rax
83: 8b 18 mov (%rax),%ebx
85: c5 fa 11 44 24 3c vmovss %xmm0,0x3c(%rsp)
8b: c5 f9 7e 54 24 40 vmovd %xmm2,0x40(%rsp)
91: c5 fa 11 4c 24 44 vmovss %xmm1,0x44(%rsp)
97: 85 db test %ebx,%ebx
99: 0f 84 f9 16 00 00 je 1798 <simplified_nbody+0x1798>
9f: 48 8b 44 24 48 mov 0x48(%rsp),%rax
a4: 49 bd 00 00 00 00 00 movabs $0x0,%r13
ab: 00 00 00
ae: 45 31 ff xor %r15d,%r15d
b1: c5 fa 10 04 30 vmovss (%rax,%rsi,1),%xmm0
b6: c5 fa 10 4c 30 04 vmovss 0x4(%rax,%rsi,1),%xmm1
bc: c5 fa 10 54 30 08 vmovss 0x8(%rax,%rsi,1),%xmm2
c2: 48 b8 00 00 00 00 00 movabs $0x0,%rax
c9: 00 00 00
cc: 49 8b 04 06 mov (%r14,%rax,1),%rax
d0: 48 89 44 24 78 mov %rax,0x78(%rsp)
d5: 4b 8d 04 2e lea (%r14,%r13,1),%rax
d9: 48 89 44 24 70 mov %rax,0x70(%rsp)
de: 48 b8 00 00 00 00 00 movabs $0x0,%rax
e5: 00 00 00
e8: 62 f2 7d 48 18 c0 vbroadcastss %xmm0,%zmm0
ee: 4d 8b 24 06 mov (%r14,%rax,1),%r12
f2: 62 f2 7d 48 18 c9 vbroadcastss %xmm1,%zmm1
f8: 48 b8 00 00 00 00 00 movabs $0x0,%rax
ff: 00 00 00
102: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x700(%rsp)
109: 1c
10a: 62 f2 7d 48 18 c2 vbroadcastss %xmm2,%zmm0
110: 62 d2 fd 48 5b 14 06 vbroadcasti64x4 (%r14,%rax,1),%zmm2
117: 48 b8 00 00 00 00 00 movabs $0x0,%rax
11e: 00 00 00
121: 62 f1 7c 48 29 4c 24 vmovaps %zmm1,0x6c0(%rsp)
128: 1b
129: 62 d2 fd 48 5b 0c 06 vbroadcasti64x4 (%r14,%rax,1),%zmm1
130: 48 b8 00 00 00 00 00 movabs $0x0,%rax
137: 00 00 00
13a: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x680(%rsp)
141: 1a
142: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
146: c5 f8 29 84 24 80 00 vmovaps %xmm0,0x80(%rsp)
14d: 00 00
14f: 62 f1 fd 48 7f 54 24 vmovdqa64 %zmm2,0x640(%rsp)
156: 19
157: 62 d2 fd 48 5b 14 06 vbroadcasti64x4 (%r14,%rax,1),%zmm2
15e: 48 b8 00 00 00 00 00 movabs $0x0,%rax
165: 00 00 00
168: 62 f1 fd 48 7f 4c 24 vmovdqa64 %zmm1,0x840(%rsp)
16f: 21
170: 62 d2 7d 48 18 0c 06 vbroadcastss (%r14,%rax,1),%zmm1
177: 48 b8 00 00 00 00 00 movabs $0x0,%rax
17e: 00 00 00
181: 62 f1 fd 48 7f 54 24 vmovdqa64 %zmm2,0x800(%rsp)
188: 20
189: 62 d2 fd 48 5b 14 06 vbroadcasti64x4 (%r14,%rax,1),%zmm2
190: 48 b8 00 00 00 00 00 movabs $0x0,%rax
197: 00 00 00
19a: 62 f1 7c 48 29 4c 24 vmovaps %zmm1,0x600(%rsp)
1a1: 18
1a2: 62 d2 7d 48 18 0c 06 vbroadcastss (%r14,%rax,1),%zmm1
1a9: 48 b8 00 00 00 00 00 movabs $0x0,%rax
1b0: 00 00 00
1b3: 62 d2 7d 48 18 04 06 vbroadcastss (%r14,%rax,1),%zmm0
1ba: 62 f1 fd 48 7f 54 24 vmovdqa64 %zmm2,0x7c0(%rsp)
1c1: 1f
1c2: 62 f1 7c 48 29 4c 24 vmovaps %zmm1,0x780(%rsp)
1c9: 1e
1ca: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x740(%rsp)
1d1: 1d
1d2: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
1d6: c5 f8 29 84 24 c0 00 vmovaps %xmm0,0xc0(%rsp)
1dd: 00 00
1df: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
1e3: c5 f8 29 84 24 00 01 vmovaps %xmm0,0x100(%rsp)
1ea: 00 00
1ec: 0f 1f 40 00 nopl 0x0(%rax)
1f0: 48 8b 44 24 78 mov 0x78(%rsp),%rax
1f5: 48 8b 54 24 48 mov 0x48(%rsp),%rdx
1fa: 8b 00 mov (%rax),%eax
1fc: 42 8d 0c 38 lea (%rax,%r15,1),%ecx
200: 48 c1 e0 04 shl $0x4,%rax
204: 48 c1 e1 04 shl $0x4,%rcx
208: c5 f8 10 04 0a vmovups (%rdx,%rcx,1),%xmm0
20d: 48 8b 4c 24 70 mov 0x70(%rsp),%rcx
212: c5 f8 29 04 08 vmovaps %xmm0,(%rax,%rcx,1)
217: c5 f8 77 vzeroupper
21a: 41 ff d4 call *%r12
21d: 62 91 7c 48 28 5c 2e vmovaps 0x80(%r14,%r13,1),%zmm3
224: 02
225: 62 f1 7c 48 28 64 24 vmovaps 0x640(%rsp),%zmm4
22c: 19
22d: 62 81 7c 48 28 5c 2e vmovaps 0xc0(%r14,%r13,1),%zmm19
234: 03
235: 62 91 7c 48 28 54 2e vmovaps 0x180(%r14,%r13,1),%zmm2
23c: 06
23d: 62 11 7c 48 28 4c 2e vmovaps 0x100(%r14,%r13,1),%zmm9
244: 04
245: 62 11 7c 48 28 6c 2e vmovaps 0x140(%r14,%r13,1),%zmm13
24c: 05
24d: 62 81 7c 48 28 4c 2e vmovaps 0x1c0(%r14,%r13,1),%zmm17
254: 07
255: 62 71 7c 48 28 74 24 vmovaps 0x800(%rsp),%zmm14
25c: 20
25d: 62 91 7c 48 28 04 2e vmovaps (%r14,%r13,1),%zmm0
264: 62 81 7c 48 28 54 2e vmovaps 0x40(%r14,%r13,1),%zmm18
26b: 01
26c: 62 f1 7c 48 28 74 24 vmovaps 0x7c0(%rsp),%zmm6
273: 1f
274: 62 01 7c 48 28 44 2e vmovaps 0x280(%r14,%r13,1),%zmm24
27b: 0a
27c: 62 81 7c 48 28 74 2e vmovaps 0x200(%r14,%r13,1),%zmm22
283: 08
284: 62 81 7c 48 28 6c 2e vmovaps 0x240(%r14,%r13,1),%zmm21
28b: 09
28c: 62 81 7c 48 28 7c 2e vmovaps 0x2c0(%r14,%r13,1),%zmm23
293: 0b
294: 62 01 7c 48 28 64 2e vmovaps 0x380(%r14,%r13,1),%zmm28
29b: 0e
29c: 62 01 7c 48 28 54 2e vmovaps 0x300(%r14,%r13,1),%zmm26
2a3: 0c
2a4: 62 01 7c 48 28 5c 2e vmovaps 0x3c0(%r14,%r13,1),%zmm27
2ab: 0f
2ac: 62 f1 7c 48 28 cb vmovaps %zmm3,%zmm1
2b2: 62 e1 7c 48 28 e2 vmovaps %zmm2,%zmm20
2b8: 62 d1 7c 48 28 e9 vmovaps %zmm9,%zmm5
2be: 62 61 7c 48 28 ca vmovaps %zmm2,%zmm25
2c4: 62 f1 7c 48 28 f8 vmovaps %zmm0,%zmm7
2ca: 62 71 7c 48 28 fb vmovaps %zmm3,%zmm15
2d0: 62 e1 7c 48 28 c0 vmovaps %zmm0,%zmm16
2d6: 62 71 7c 48 28 c3 vmovaps %zmm3,%zmm8
2dc: 62 71 7c 48 28 e0 vmovaps %zmm0,%zmm12
2e2: 62 71 7c 48 28 d2 vmovaps %zmm2,%zmm10
2e8: 62 b2 4d 48 7f db vpermt2ps %zmm19,%zmm6,%zmm3
2ee: 62 b2 4d 48 7f c2 vpermt2ps %zmm18,%zmm6,%zmm0
2f4: 62 61 7c 48 28 f4 vmovaps %zmm4,%zmm30
2fa: 62 b2 4d 48 7f d1 vpermt2ps %zmm17,%zmm6,%zmm2
300: 62 51 7c 48 28 d9 vmovaps %zmm9,%zmm11
306: 62 01 7c 48 28 e8 vmovaps %zmm24,%zmm29
30c: 62 01 7c 48 28 fc vmovaps %zmm28,%zmm31
312: 62 b2 5d 48 7f cb vpermt2ps %zmm19,%zmm4,%zmm1
318: 62 a2 5d 48 7f e1 vpermt2ps %zmm17,%zmm4,%zmm20
31e: 62 d2 5d 48 7f ed vpermt2ps %zmm13,%zmm4,%zmm5
324: 62 22 0d 48 7f c9 vpermt2ps %zmm17,%zmm14,%zmm25
32a: 62 b2 5d 48 7f fa vpermt2ps %zmm18,%zmm4,%zmm7
330: 62 d1 7c 48 28 e1 vmovaps %zmm9,%zmm4
336: 62 32 0d 48 7f fb vpermt2ps %zmm19,%zmm14,%zmm15
33c: 62 a2 0d 48 7f c2 vpermt2ps %zmm18,%zmm14,%zmm16
342: 62 52 4d 48 7f cd vpermt2ps %zmm13,%zmm6,%zmm9
348: 62 52 0d 48 7f dd vpermt2ps %zmm13,%zmm14,%zmm11
34e: 62 91 7c 48 28 f2 vmovaps %zmm26,%zmm6
354: 62 22 0d 40 7f ef vpermt2ps %zmm23,%zmm30,%zmm29
35a: 62 f3 fd 48 23 c3 e4 vshuff64x2 $0xe4,%zmm3,%zmm0,%zmm0
361: 62 91 7c 48 28 dc vmovaps %zmm28,%zmm3
367: 62 f1 7c 48 29 4c 24 vmovaps %zmm1,0x140(%rsp)
36e: 05
36f: 62 f1 7c 48 28 4c 24 vmovaps 0x840(%rsp),%zmm1
376: 21
377: 62 b3 d5 48 23 ec e4 vshuff64x2 $0xe4,%zmm20,%zmm5,%zmm5
37e: 62 61 7c 48 29 4c 24 vmovaps %zmm25,0x280(%rsp)
385: 0a
386: 62 01 7c 48 28 4c 2e vmovaps 0x340(%r14,%r13,1),%zmm25
38d: 0d
38e: 62 a1 7c 48 28 e6 vmovaps %zmm22,%zmm20
394: 62 f3 b5 48 23 d2 e4 vshuff64x2 $0xe4,%zmm2,%zmm9,%zmm2
39b: 62 71 7c 48 28 4c 24 vmovaps 0x640(%rsp),%zmm9
3a2: 19
3a3: 62 92 0d 48 7f db vpermt2ps %zmm27,%zmm14,%zmm3
3a9: 62 f3 c5 48 23 7c 24 vshuff64x2 $0xe4,0x140(%rsp),%zmm7,%zmm7
3b0: 05 e4
3b2: 62 a2 0d 48 7f e5 vpermt2ps %zmm21,%zmm14,%zmm20
3b8: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x140(%rsp)
3bf: 05
3c0: 62 f1 fd 48 29 6c 24 vmovapd %zmm5,0x4c0(%rsp)
3c7: 13
3c8: 62 f1 7c 48 28 6c 24 vmovaps 0x7c0(%rsp),%zmm5
3cf: 1f
3d0: 62 f1 fd 48 29 54 24 vmovapd %zmm2,0x500(%rsp)
3d7: 14
3d8: 62 32 75 48 7f c3 vpermt2ps %zmm19,%zmm1,%zmm8
3de: 62 32 75 48 7f e2 vpermt2ps %zmm18,%zmm1,%zmm12
3e4: 62 a1 7c 48 28 de vmovaps %zmm22,%zmm19
3ea: 62 81 7c 48 28 d0 vmovaps %zmm24,%zmm18
3f0: 62 32 75 48 7f d1 vpermt2ps %zmm17,%zmm1,%zmm10
3f6: 62 81 7c 48 28 c8 vmovaps %zmm24,%zmm17
3fc: 62 d2 75 48 7f e5 vpermt2ps %zmm13,%zmm1,%zmm4
402: 62 11 7c 48 28 ee vmovaps %zmm30,%zmm13
408: 62 21 7c 48 28 f6 vmovaps %zmm22,%zmm30
40e: 62 a2 0d 48 7f d7 vpermt2ps %zmm23,%zmm14,%zmm18
414: 62 a2 75 48 7f cf vpermt2ps %zmm23,%zmm1,%zmm17
41a: 62 a2 75 48 7f dd vpermt2ps %zmm21,%zmm1,%zmm19
420: 62 02 15 48 7f fb vpermt2ps %zmm27,%zmm13,%zmm31
426: 62 92 15 48 7f f1 vpermt2ps %zmm25,%zmm13,%zmm6
42c: 62 22 15 48 7f f5 vpermt2ps %zmm21,%zmm13,%zmm30
432: 62 11 7c 48 28 ec vmovaps %zmm28,%zmm13
438: 62 f1 fd 48 29 7c 24 vmovapd %zmm7,0x240(%rsp)
43f: 09
440: 62 f3 a5 48 23 7c 24 vshuff64x2 $0xe4,0x280(%rsp),%zmm11,%zmm7
447: 0a e4
449: 62 02 55 48 7f e3 vpermt2ps %zmm27,%zmm5,%zmm28
44f: 62 22 55 48 7f c7 vpermt2ps %zmm23,%zmm5,%zmm24
455: 62 a2 55 48 7f f5 vpermt2ps %zmm21,%zmm5,%zmm22
45b: 62 12 75 48 7f eb vpermt2ps %zmm27,%zmm1,%zmm13
461: 62 81 7c 48 28 7c 2e vmovaps 0x4c0(%r14,%r13,1),%zmm23
468: 13
469: 62 e1 7c 48 28 6c 24 vmovaps 0x6c0(%rsp),%zmm21
470: 1b
471: 62 d3 dd 48 23 c2 e4 vshuff64x2 $0xe4,%zmm10,%zmm4,%zmm0
478: 62 53 fd 40 23 d7 e4 vshuff64x2 $0xe4,%zmm15,%zmm16,%zmm10
47f: 62 11 7c 48 28 fa vmovaps %zmm26,%zmm15
485: 62 53 9d 48 23 c0 e4 vshuff64x2 $0xe4,%zmm8,%zmm12,%zmm8
48c: 62 11 7c 48 28 e2 vmovaps %zmm26,%zmm12
492: 62 02 55 48 7f d1 vpermt2ps %zmm25,%zmm5,%zmm26
498: 62 81 7c 48 28 44 2e vmovaps 0x540(%r14,%r13,1),%zmm16
49f: 15
4a0: 62 33 e5 40 23 d9 e4 vshuff64x2 $0xe4,%zmm17,%zmm19,%zmm11
4a7: 62 a3 dd 40 23 d2 e4 vshuff64x2 $0xe4,%zmm18,%zmm20,%zmm18
4ae: 62 81 7c 48 28 64 2e vmovaps 0x580(%r14,%r13,1),%zmm20
4b5: 16
4b6: 62 81 7c 48 28 4c 2e vmovaps 0x500(%r14,%r13,1),%zmm17
4bd: 14
4be: 62 12 0d 48 7f f9 vpermt2ps %zmm25,%zmm14,%zmm15
4c4: 62 12 75 48 7f e1 vpermt2ps %zmm25,%zmm1,%zmm12
4ca: 62 01 7c 48 28 4c 2e vmovaps 0x5c0(%r14,%r13,1),%zmm25
4d1: 17
4d2: 62 93 8d 40 23 d5 e4 vshuff64x2 $0xe4,%zmm29,%zmm30,%zmm2
4d9: 62 e1 7c 48 28 5c 24 vmovaps 0x780(%rsp),%zmm19
4e0: 1e
4e1: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x440(%rsp)
4e8: 11
4e9: 62 93 cd 48 23 c7 e4 vshuff64x2 $0xe4,%zmm31,%zmm6,%zmm0
4f0: 62 f1 fd 48 29 54 24 vmovapd %zmm2,0x200(%rsp)
4f7: 08
4f8: 62 f1 7c 48 28 d5 vmovaps %zmm5,%zmm2
4fe: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x400(%rsp)
505: 10
506: 62 93 cd 40 23 c0 e4 vshuff64x2 $0xe4,%zmm24,%zmm22,%zmm0
50d: 62 81 7c 48 28 74 2e vmovaps 0x400(%r14,%r13,1),%zmm22
514: 10
515: 62 01 7c 48 28 44 2e vmovaps 0x480(%r14,%r13,1),%zmm24
51c: 12
51d: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x480(%rsp)
524: 12
525: 62 93 ad 40 23 e4 e4 vshuff64x2 $0xe4,%zmm28,%zmm26,%zmm4
52c: 62 d3 9d 48 23 ed e4 vshuff64x2 $0xe4,%zmm13,%zmm12,%zmm5
533: 62 f3 85 48 23 db e4 vshuff64x2 $0xe4,%zmm3,%zmm15,%zmm3
53a: 62 21 7c 48 28 dc vmovaps %zmm20,%zmm27
540: 62 21 7c 48 28 e1 vmovaps %zmm17,%zmm28
546: 62 f1 fd 48 29 64 24 vmovapd %zmm4,0x280(%rsp)
54d: 0a
54e: 62 91 7c 48 28 64 2e vmovaps 0x440(%r14,%r13,1),%zmm4
555: 11
556: 62 21 7c 48 28 f4 vmovaps %zmm20,%zmm30
55c: 62 21 7c 48 28 f9 vmovaps %zmm17,%zmm31
562: 62 02 35 48 7f d9 vpermt2ps %zmm25,%zmm9,%zmm27
568: 62 22 35 48 7f e0 vpermt2ps %zmm16,%zmm9,%zmm28
56e: 62 02 0d 48 7f f1 vpermt2ps %zmm25,%zmm14,%zmm30
574: 62 22 0d 48 7f f8 vpermt2ps %zmm16,%zmm14,%zmm31
57a: 62 01 7c 48 28 d0 vmovaps %zmm24,%zmm26
580: 62 31 7c 48 28 ee vmovaps %zmm22,%zmm13
586: 62 11 7c 48 28 f8 vmovaps %zmm24,%zmm15
58c: 62 21 7c 48 28 ee vmovaps %zmm22,%zmm29
592: 62 22 35 48 7f d7 vpermt2ps %zmm23,%zmm9,%zmm26
598: 62 32 75 48 7f ff vpermt2ps %zmm23,%zmm1,%zmm15
59e: 62 93 9d 40 23 f3 e4 vshuff64x2 $0xe4,%zmm27,%zmm28,%zmm6
5a5: 62 72 35 48 7f ec vpermt2ps %zmm4,%zmm9,%zmm13
5ab: 62 21 7c 48 28 e4 vmovaps %zmm20,%zmm28
5b1: 62 62 0d 48 7f ec vpermt2ps %zmm4,%zmm14,%zmm29
5b7: 62 02 75 48 7f e1 vpermt2ps %zmm25,%zmm1,%zmm28
5bd: 62 f1 fd 48 29 74 24 vmovapd %zmm6,0x1c0(%rsp)
5c4: 07
5c5: 62 b1 7c 48 28 f6 vmovaps %zmm22,%zmm6
5cb: 62 f2 75 48 7f f4 vpermt2ps %zmm4,%zmm1,%zmm6
5d1: 62 93 95 48 23 c2 e4 vshuff64x2 $0xe4,%zmm26,%zmm13,%zmm0
5d8: 62 71 7c 48 28 e9 vmovaps %zmm1,%zmm13
5de: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x180(%rsp)
5e5: 06
5e6: 62 d3 cd 48 23 c7 e4 vshuff64x2 $0xe4,%zmm15,%zmm6,%zmm0
5ed: 62 f1 7c 48 28 74 24 vmovaps 0x600(%rsp),%zmm6
5f4: 18
5f5: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x300(%rsp)
5fc: 0c
5fd: 62 b1 7c 48 28 c1 vmovaps %zmm17,%zmm0
603: 62 b2 75 48 7f c0 vpermt2ps %zmm16,%zmm1,%zmm0
609: 62 f1 7c 48 28 4c 24 vmovaps 0x240(%rsp),%zmm1
610: 09
611: 62 93 fd 48 23 c4 e4 vshuff64x2 $0xe4,%zmm28,%zmm0,%zmm0
618: 62 61 7c 48 28 e2 vmovaps %zmm2,%zmm28
61e: 62 e2 1d 40 7f f4 vpermt2ps %zmm4,%zmm28,%zmm22
624: 62 f1 7c 48 28 64 24 vmovaps 0x4c0(%rsp),%zmm4
62b: 13
62c: 62 a2 1d 40 7f c8 vpermt2ps %zmm16,%zmm28,%zmm17
632: 62 82 1d 40 7f e1 vpermt2ps %zmm25,%zmm28,%zmm20
638: 62 e1 7c 48 28 44 24 vmovaps 0x1c0(%rsp),%zmm16
63f: 07
640: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x2c0(%rsp)
647: 0b
648: 62 91 7c 48 28 c0 vmovaps %zmm24,%zmm0
64e: 62 22 6d 48 7f c7 vpermt2ps %zmm23,%zmm2,%zmm24
654: 62 f1 7c 48 28 54 24 vmovaps 0x680(%rsp),%zmm2
65b: 1a
65c: 62 b2 0d 48 7f c7 vpermt2ps %zmm23,%zmm14,%zmm0
662: 62 e1 7c 48 28 7c 24 vmovaps 0x740(%rsp),%zmm23
669: 1d
66a: 62 a3 f5 40 23 e4 e4 vshuff64x2 $0xe4,%zmm20,%zmm17,%zmm20
671: 62 83 cd 40 23 f0 e4 vshuff64x2 $0xe4,%zmm24,%zmm22,%zmm22
678: 62 f3 95 40 23 c0 e4 vshuff64x2 $0xe4,%zmm0,%zmm29,%zmm0
67f: 62 03 85 40 23 ee e4 vshuff64x2 $0xe4,%zmm30,%zmm31,%zmm29
686: 62 21 3c 48 5c f5 vsubps %zmm21,%zmm8,%zmm30
68c: 62 71 7c 48 28 44 24 vmovaps 0x440(%rsp),%zmm8
693: 11
694: 62 61 2c 48 5c fa vsubps %zmm2,%zmm10,%zmm31
69a: 62 61 44 48 5c da vsubps %zmm2,%zmm7,%zmm27
6a0: 62 b1 7c 48 28 fb vmovaps %zmm19,%zmm7
6a6: 62 f1 64 48 5c da vsubps %zmm2,%zmm3,%zmm3
6ac: 62 f1 7c 48 29 5c 24 vmovaps %zmm3,0x240(%rsp)
6b3: 09
6b4: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x5c0(%rsp)
6bb: 17
6bc: 62 f1 7c 48 28 44 24 vmovaps 0x700(%rsp),%zmm0
6c3: 1c
6c4: 62 21 3c 48 5c d5 vsubps %zmm21,%zmm8,%zmm26
6ca: 62 71 74 48 5c e0 vsubps %zmm0,%zmm1,%zmm12
6d0: 62 f1 5c 48 5c e0 vsubps %zmm0,%zmm4,%zmm4
6d6: 62 e1 7c 40 5c c0 vsubps %zmm0,%zmm16,%zmm16
6dc: 62 51 7c 48 28 cc vmovaps %zmm12,%zmm9
6e2: 62 71 7c 48 28 c4 vmovaps %zmm4,%zmm8
6e8: 62 e1 7c 48 29 44 24 vmovaps %zmm16,0x340(%rsp)
6ef: 0d
6f0: 62 72 1d 48 a8 ce vfmadd213ps %zmm6,%zmm12,%zmm9
6f6: 62 72 5d 48 a8 c6 vfmadd213ps %zmm6,%zmm4,%zmm8
6fc: 62 12 0d 40 b8 ce vfmadd231ps %zmm30,%zmm30,%zmm9
702: 62 12 2d 40 b8 c2 vfmadd231ps %zmm26,%zmm26,%zmm8
708: 62 12 05 40 b8 cf vfmadd231ps %zmm31,%zmm31,%zmm9
70e: 62 12 25 40 b8 c3 vfmadd231ps %zmm27,%zmm27,%zmm8
714: 62 52 7d 48 4e d1 vrsqrt14ps %zmm9,%zmm10
71a: 62 52 7d 48 4e f8 vrsqrt14ps %zmm8,%zmm15
720: 62 51 34 48 59 ca vmulps %zmm10,%zmm9,%zmm9
726: 62 51 3c 48 59 c7 vmulps %zmm15,%zmm8,%zmm8
72c: 62 32 2d 48 a8 cb vfmadd213ps %zmm19,%zmm10,%zmm9
732: 62 31 2c 48 59 d7 vmulps %zmm23,%zmm10,%zmm10
738: 62 32 05 48 a8 c3 vfmadd213ps %zmm19,%zmm15,%zmm8
73e: 62 51 2c 48 59 d1 vmulps %zmm9,%zmm10,%zmm10
744: 62 31 04 48 59 cf vmulps %zmm23,%zmm15,%zmm9
74a: 62 71 7c 48 28 7c 24 vmovaps 0x200(%rsp),%zmm15
751: 08
752: 62 d1 34 48 59 c8 vmulps %zmm8,%zmm9,%zmm1
758: 62 31 24 48 5c cd vsubps %zmm21,%zmm11,%zmm9
75e: 62 71 6c 40 5c c2 vsubps %zmm2,%zmm18,%zmm8
764: 62 71 7c 48 29 4c 24 vmovaps %zmm9,0x200(%rsp)
76b: 08
76c: 62 71 7c 48 29 44 24 vmovaps %zmm8,0x3c0(%rsp)
773: 0f
774: 62 e1 04 48 5c d8 vsubps %zmm0,%zmm15,%zmm19
77a: 62 31 7c 48 28 db vmovaps %zmm19,%zmm11
780: 62 72 65 40 a8 de vfmadd213ps %zmm6,%zmm19,%zmm11
786: 62 52 35 48 b8 d9 vfmadd231ps %zmm9,%zmm9,%zmm11
78c: 62 71 7c 48 28 4c 24 vmovaps 0x400(%rsp),%zmm9
793: 10
794: 62 52 3d 48 b8 d8 vfmadd231ps %zmm8,%zmm8,%zmm11
79a: 62 31 54 48 5c c5 vsubps %zmm21,%zmm5,%zmm8
7a0: 62 c2 7d 48 4e d3 vrsqrt14ps %zmm11,%zmm18
7a6: 62 71 7c 48 29 44 24 vmovaps %zmm8,0x380(%rsp)
7ad: 0e
7ae: 62 31 24 48 59 da vmulps %zmm18,%zmm11,%zmm11
7b4: 62 72 6d 40 a8 df vfmadd213ps %zmm7,%zmm18,%zmm11
7ba: 62 a1 6c 40 59 d7 vmulps %zmm23,%zmm18,%zmm18
7c0: 62 c1 6c 40 59 d3 vmulps %zmm11,%zmm18,%zmm18
7c6: 62 61 6c 40 59 44 24 vmulps 0x480(%rsp),%zmm18,%zmm24
7cd: 12
7ce: 62 71 34 48 5c f8 vsubps %zmm0,%zmm9,%zmm15
7d4: 62 d1 7c 48 28 ef vmovaps %zmm15,%zmm5
7da: 62 f2 05 48 a8 ee vfmadd213ps %zmm6,%zmm15,%zmm5
7e0: 62 d2 3d 48 b8 e8 vfmadd231ps %zmm8,%zmm8,%zmm5
7e6: 62 71 74 48 59 44 24 vmulps 0x500(%rsp),%zmm1,%zmm8
7ed: 14
7ee: 62 f1 74 48 59 c9 vmulps %zmm1,%zmm1,%zmm1
7f4: 62 f2 65 48 b8 eb vfmadd231ps %zmm3,%zmm3,%zmm5
7fa: 62 f1 2c 48 59 5c 24 vmulps 0x140(%rsp),%zmm10,%zmm3
801: 05
802: 62 51 2c 48 59 d2 vmulps %zmm10,%zmm10,%zmm10
808: 62 72 7d 48 4e dd vrsqrt14ps %zmm5,%zmm11
80e: 62 d1 54 48 59 eb vmulps %zmm11,%zmm5,%zmm5
814: 62 f2 25 48 a8 ef vfmadd213ps %zmm7,%zmm11,%zmm5
81a: 62 31 24 48 59 df vmulps %zmm23,%zmm11,%zmm11
820: 62 51 74 48 59 c0 vmulps %zmm8,%zmm1,%zmm8
826: 62 91 7c 48 28 4c 2e vmovaps 0x780(%r14,%r13,1),%zmm1
82d: 1e
82e: 62 61 2c 48 59 cb vmulps %zmm3,%zmm10,%zmm25
834: c4 41 28 57 d2 vxorps %xmm10,%xmm10,%xmm10
839: c4 63 29 0c 8c 24 00 vblendps $0x1,0x100(%rsp),%xmm10,%xmm9
840: 01 00 00 01
844: 62 f1 24 48 59 ed vmulps %zmm5,%zmm11,%zmm5
84a: c4 63 29 0c 9c 24 c0 vblendps $0x1,0xc0(%rsp),%xmm10,%xmm11
851: 00 00 00 01
855: c4 e3 29 0c 9c 24 80 vblendps $0x1,0x80(%rsp),%xmm10,%xmm3
85c: 00 00 00 01
860: 62 71 7c 48 28 54 24 vmovaps 0x180(%rsp),%zmm10
867: 06
868: 62 e1 2c 48 5c c8 vsubps %zmm0,%zmm10,%zmm17
86e: 62 f1 7c 48 28 44 24 vmovaps 0x300(%rsp),%zmm0
875: 0c
876: 62 71 7c 48 28 54 24 vmovaps 0x2c0(%rsp),%zmm10
87d: 0b
87e: 62 f1 7c 48 29 5c 24 vmovaps %zmm3,0x100(%rsp)
885: 04
886: 62 b1 6c 40 59 da vmulps %zmm18,%zmm18,%zmm3
88c: 62 e1 54 48 59 54 24 vmulps 0x280(%rsp),%zmm5,%zmm18
893: 0a
894: 62 f1 54 48 59 ed vmulps %zmm5,%zmm5,%zmm5
89a: 62 12 35 40 b8 de vfmadd231ps %zmm30,%zmm25,%zmm11
8a0: 62 01 7c 48 28 74 2e vmovaps 0x600(%r14,%r13,1),%zmm30
8a7: 18
8a8: 62 52 35 40 b8 cc vfmadd231ps %zmm12,%zmm25,%zmm9
8ae: 62 01 64 48 59 c0 vmulps %zmm24,%zmm3,%zmm24
8b4: 62 f1 14 40 5c da vsubps %zmm2,%zmm29,%zmm3
8ba: 62 21 7c 48 28 e8 vmovaps %zmm16,%zmm29
8c0: 62 e1 7c 48 29 4c 24 vmovaps %zmm17,0x80(%rsp)
8c7: 02
8c8: 62 e2 75 40 a8 ce vfmadd213ps %zmm6,%zmm17,%zmm17
8ce: 62 62 15 40 a8 ee vfmadd213ps %zmm6,%zmm29,%zmm29
8d4: 62 f1 3c 48 59 f4 vmulps %zmm4,%zmm8,%zmm6
8da: 62 32 3d 40 b8 cb vfmadd231ps %zmm19,%zmm24,%zmm9
8e0: 62 81 7c 48 28 5c 2e vmovaps 0x700(%r14,%r13,1),%zmm19
8e7: 1c
8e8: 62 f1 7c 48 29 5c 24 vmovaps %zmm3,0x1c0(%rsp)
8ef: 07
8f0: 62 a1 54 48 59 c2 vmulps %zmm18,%zmm5,%zmm16
8f6: 62 e1 7c 48 28 54 24 vmovaps 0x640(%rsp),%zmm18
8fd: 19
8fe: 62 d2 7d 40 b8 f7 vfmadd231ps %zmm15,%zmm16,%zmm6
904: 62 11 7c 48 28 fc vmovaps %zmm28,%zmm15
90a: 62 b1 7c 48 5c c5 vsubps %zmm21,%zmm0,%zmm0
910: 62 31 2c 48 5c d5 vsubps %zmm21,%zmm10,%zmm10
916: 62 e1 7c 48 28 6c 24 vmovaps 0x5c0(%rsp),%zmm21
91d: 17
91e: 62 11 7c 48 28 e6 vmovaps %zmm30,%zmm12
924: 62 e2 7d 48 b8 c8 vfmadd231ps %zmm0,%zmm0,%zmm17
92a: 62 42 2d 48 b8 ea vfmadd231ps %zmm10,%zmm10,%zmm29
930: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0xc0(%rsp)
937: 03
938: 62 71 7c 48 29 54 24 vmovaps %zmm10,0x140(%rsp)
93f: 05
940: 62 51 7c 48 28 d3 vmovaps %zmm11,%zmm10
946: 62 71 7c 48 28 de vmovaps %zmm6,%zmm11
94c: 62 62 65 48 b8 eb vfmadd231ps %zmm3,%zmm3,%zmm29
952: 62 b1 7c 48 28 f3 vmovaps %zmm19,%zmm6
958: 62 92 7d 48 4e c5 vrsqrt14ps %zmm29,%zmm0
95e: 62 f1 14 40 59 e8 vmulps %zmm0,%zmm29,%zmm5
964: 62 f2 7d 48 a8 ef vfmadd213ps %zmm7,%zmm0,%zmm5
96a: 62 e1 54 40 5c ea vsubps %zmm2,%zmm21,%zmm21
970: 62 a2 55 40 b8 cd vfmadd231ps %zmm21,%zmm21,%zmm17
976: 62 e1 7c 48 29 6c 24 vmovaps %zmm21,0x180(%rsp)
97d: 06
97e: 62 81 3c 48 59 ea vmulps %zmm26,%zmm8,%zmm21
984: 62 01 7c 48 28 54 2e vmovaps 0x940(%r14,%r13,1),%zmm26
98b: 25
98c: 62 b2 7d 48 4e d1 vrsqrt14ps %zmm17,%zmm2
992: 62 e2 7d 40 b8 6c 24 vfmadd231ps 0x380(%rsp),%zmm16,%zmm21
999: 0e
99a: 62 f1 74 40 59 e2 vmulps %zmm2,%zmm17,%zmm4
9a0: 62 f2 6d 48 a8 e7 vfmadd213ps %zmm7,%zmm2,%zmm4
9a6: 62 b1 6c 48 59 d7 vmulps %zmm23,%zmm2,%zmm2
9ac: 62 f1 6c 48 59 d4 vmulps %zmm4,%zmm2,%zmm2
9b2: 62 b1 7c 48 59 e7 vmulps %zmm23,%zmm0,%zmm4
9b8: 62 81 3c 48 59 fb vmulps %zmm27,%zmm8,%zmm23
9be: 62 51 7c 48 28 c1 vmovaps %zmm9,%zmm8
9c4: 62 01 7c 48 28 5c 2e vmovaps 0xb40(%r14,%r13,1),%zmm27
9cb: 2d
9cc: 62 61 5c 48 59 ed vmulps %zmm5,%zmm4,%zmm29
9d2: 62 f1 7c 48 28 6c 24 vmovaps 0x100(%rsp),%zmm5
9d9: 04
9da: 62 f1 6c 48 59 e2 vmulps %zmm2,%zmm2,%zmm4
9e0: 62 f1 4c 40 59 d2 vmulps %zmm2,%zmm22,%zmm2
9e6: 62 81 7c 48 28 74 2e vmovaps 0x640(%r14,%r13,1),%zmm22
9ed: 19
9ee: 62 e1 5c 48 59 ca vmulps %zmm2,%zmm4,%zmm17
9f4: 62 91 7c 48 28 54 2e vmovaps 0x6c0(%r14,%r13,1),%zmm2
9fb: 1b
9fc: 62 91 7c 48 28 64 2e vmovaps 0x740(%r14,%r13,1),%zmm4
a03: 1d
a04: 62 72 75 40 b8 44 24 vfmadd231ps 0x80(%rsp),%zmm17,%zmm8
a0b: 02
a0c: 62 71 7c 48 29 44 24 vmovaps %zmm8,0x80(%rsp)
a13: 02
a14: 62 71 7c 48 28 44 24 vmovaps 0x600(%rsp),%zmm8
a1b: 18
a1c: 62 92 35 40 b8 ef vfmadd231ps %zmm31,%zmm25,%zmm5
a22: 62 01 7c 48 28 7c 2e vmovaps 0x680(%r14,%r13,1),%zmm31
a29: 1a
a2a: 62 01 7c 48 28 4c 2e vmovaps 0x7c0(%r14,%r13,1),%zmm25
a31: 1f
a32: 62 32 6d 40 7f e6 vpermt2ps %zmm22,%zmm18,%zmm12
a38: 62 f2 15 48 7f f4 vpermt2ps %zmm4,%zmm13,%zmm6
a3e: 62 f2 3d 40 b8 6c 24 vfmadd231ps 0x3c0(%rsp),%zmm24,%zmm5
a45: 0f
a46: 62 f2 75 40 b8 6c 24 vfmadd231ps 0x180(%rsp),%zmm17,%zmm5
a4d: 06
a4e: 62 91 7c 48 28 ff vmovaps %zmm31,%zmm7
a54: 62 11 7c 48 28 cf vmovaps %zmm31,%zmm9
a5a: 62 f2 6d 40 7f fa vpermt2ps %zmm2,%zmm18,%zmm7
a60: 62 72 15 48 7f ca vpermt2ps %zmm2,%zmm13,%zmm9
a66: 62 f1 7c 48 29 6c 24 vmovaps %zmm5,0x100(%rsp)
a6d: 04
a6e: 62 f1 7c 48 28 6c 24 vmovaps 0x700(%rsp),%zmm5
a75: 1c
a76: 62 f3 9d 48 23 c7 e4 vshuff64x2 $0xe4,%zmm7,%zmm12,%zmm0
a7d: 62 f1 7c 48 28 f9 vmovaps %zmm1,%zmm7
a83: 62 31 7c 48 28 e3 vmovaps %zmm19,%zmm12
a89: 62 92 6d 40 7f f9 vpermt2ps %zmm25,%zmm18,%zmm7
a8f: 62 72 6d 40 7f e4 vpermt2ps %zmm4,%zmm18,%zmm12
a95: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x4c0(%rsp)
a9c: 13
a9d: 62 f3 9d 48 23 df e4 vshuff64x2 $0xe4,%zmm7,%zmm12,%zmm3
aa4: 62 71 7c 48 28 e1 vmovaps %zmm1,%zmm12
aaa: 62 91 7c 48 28 fe vmovaps %zmm30,%zmm7
ab0: 62 12 15 48 7f e1 vpermt2ps %zmm25,%zmm13,%zmm12
ab6: 62 b2 15 48 7f fe vpermt2ps %zmm22,%zmm13,%zmm7
abc: 62 f1 fd 48 29 5c 24 vmovapd %zmm3,0x500(%rsp)
ac3: 14
ac4: 62 d3 cd 48 23 f4 e4 vshuff64x2 $0xe4,%zmm12,%zmm6,%zmm6
acb: 62 53 c5 48 23 c9 e4 vshuff64x2 $0xe4,%zmm9,%zmm7,%zmm9
ad2: 62 91 7c 48 28 ff vmovaps %zmm31,%zmm7
ad8: 62 62 1d 40 7f fa vpermt2ps %zmm2,%zmm28,%zmm31
ade: 62 11 7c 48 28 64 2e vmovaps 0x980(%r14,%r13,1),%zmm12
ae5: 26
ae6: 62 f2 0d 48 7f fa vpermt2ps %zmm2,%zmm14,%zmm7
aec: 62 f1 7c 48 28 d1 vmovaps %zmm1,%zmm2
af2: 62 92 1d 40 7f c9 vpermt2ps %zmm25,%zmm28,%zmm1
af8: 62 f1 fd 48 29 74 24 vmovapd %zmm6,0x440(%rsp)
aff: 11
b00: 62 91 7c 48 28 f6 vmovaps %zmm30,%zmm6
b06: 62 92 0d 48 7f d1 vpermt2ps %zmm25,%zmm14,%zmm2
b0c: 62 22 1d 40 7f f6 vpermt2ps %zmm22,%zmm28,%zmm30
b12: 62 01 7c 48 28 4c 2e vmovaps 0x880(%r14,%r13,1),%zmm25
b19: 22
b1a: 62 b2 0d 48 7f f6 vpermt2ps %zmm22,%zmm14,%zmm6
b20: 62 f3 cd 48 23 df e4 vshuff64x2 $0xe4,%zmm7,%zmm6,%zmm3
b27: 62 91 5c 40 59 f5 vmulps %zmm29,%zmm20,%zmm6
b2d: 62 93 8d 40 23 ff e4 vshuff64x2 $0xe4,%zmm31,%zmm30,%zmm7
b34: 62 01 7c 48 28 74 2e vmovaps 0x9c0(%r14,%r13,1),%zmm30
b3b: 27
b3c: 62 c1 7c 48 28 e4 vmovaps %zmm12,%zmm20
b42: 62 01 7c 48 28 7c 2e vmovaps 0xa80(%r14,%r13,1),%zmm31
b49: 2a
b4a: 62 f1 fd 48 29 5c 24 vmovapd %zmm3,0x400(%rsp)
b51: 10
b52: 62 d1 7c 48 28 da vmovaps %zmm10,%zmm3
b58: 62 31 7c 48 28 d3 vmovaps %zmm19,%zmm10
b5e: 62 e2 1d 40 7f dc vpermt2ps %zmm4,%zmm28,%zmm19
b64: 62 f1 fd 48 29 7c 24 vmovapd %zmm7,0x280(%rsp)
b6b: 0a
b6c: 62 72 0d 48 7f d4 vpermt2ps %zmm4,%zmm14,%zmm10
b72: 62 91 7c 48 28 64 2e vmovaps 0x800(%r14,%r13,1),%zmm4
b79: 20
b7a: 62 f2 3d 40 b8 5c 24 vfmadd231ps 0x200(%rsp),%zmm24,%zmm3
b81: 08
b82: 62 01 7c 48 28 44 2e vmovaps 0x8c0(%r14,%r13,1),%zmm24
b89: 23
b8a: 62 f2 75 40 b8 5c 24 vfmadd231ps 0xc0(%rsp),%zmm17,%zmm3
b91: 03
b92: 62 82 15 48 7f e6 vpermt2ps %zmm30,%zmm13,%zmm20
b98: 62 81 7c 48 28 cf vmovaps %zmm31,%zmm17
b9e: 62 f3 e5 40 23 c1 e4 vshuff64x2 $0xe4,%zmm1,%zmm19,%zmm0
ba5: 62 91 7c 48 28 4c 2e vmovaps 0x840(%r14,%r13,1),%zmm1
bac: 21
bad: 62 81 7c 48 28 d9 vmovaps %zmm25,%zmm19
bb3: 62 f3 ad 48 23 d2 e4 vshuff64x2 $0xe4,%zmm2,%zmm10,%zmm2
bba: 62 11 14 40 59 d5 vmulps %zmm29,%zmm29,%zmm10
bc0: 62 01 7c 48 28 6c 2e vmovaps 0x900(%r14,%r13,1),%zmm29
bc7: 24
bc8: 62 82 6d 40 7f d8 vpermt2ps %zmm24,%zmm18,%zmm19
bce: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x480(%rsp)
bd5: 12
bd6: 62 f1 7c 48 29 5c 24 vmovaps %zmm3,0xc0(%rsp)
bdd: 03
bde: 62 f1 fd 48 29 54 24 vmovapd %zmm2,0x200(%rsp)
be5: 08
be6: 62 b1 7c 48 28 d7 vmovaps %zmm23,%zmm2
bec: 62 e1 2c 48 59 fe vmulps %zmm6,%zmm10,%zmm23
bf2: 62 f1 7c 48 28 f4 vmovaps %zmm4,%zmm6
bf8: 62 51 7c 48 28 d4 vmovaps %zmm12,%zmm10
bfe: 62 12 6d 40 7f d6 vpermt2ps %zmm30,%zmm18,%zmm10
c04: 62 72 45 40 b8 5c 24 vfmadd231ps 0x340(%rsp),%zmm23,%zmm11
c0b: 0d
c0c: 62 f2 7d 40 b8 54 24 vfmadd231ps 0x240(%rsp),%zmm16,%zmm2
c13: 09
c14: 62 e2 45 40 b8 6c 24 vfmadd231ps 0x140(%rsp),%zmm23,%zmm21
c1b: 05
c1c: 62 f2 6d 40 7f f1 vpermt2ps %zmm1,%zmm18,%zmm6
c22: 62 f2 45 40 b8 54 24 vfmadd231ps 0x1c0(%rsp),%zmm23,%zmm2
c29: 07
c2a: 62 e1 7c 48 29 6c 24 vmovaps %zmm21,0x140(%rsp)
c31: 05
c32: 62 71 7c 48 29 5c 24 vmovaps %zmm11,0x240(%rsp)
c39: 09
c3a: 62 81 7c 48 28 f5 vmovaps %zmm29,%zmm22
c40: 62 91 7c 48 28 fd vmovaps %zmm29,%zmm7
c46: 62 82 6d 40 7f f2 vpermt2ps %zmm26,%zmm18,%zmm22
c4c: 62 92 15 48 7f fa vpermt2ps %zmm26,%zmm13,%zmm7
c52: 62 f1 7c 48 29 54 24 vmovaps %zmm2,0x1c0(%rsp)
c59: 07
c5a: 62 b3 cd 48 23 c3 e4 vshuff64x2 $0xe4,%zmm19,%zmm6,%zmm0
c61: 62 91 7c 48 28 f1 vmovaps %zmm25,%zmm6
c67: 62 92 15 48 7f f0 vpermt2ps %zmm24,%zmm13,%zmm6
c6d: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x380(%rsp)
c74: 0e
c75: 62 d3 cd 40 23 c2 e4 vshuff64x2 $0xe4,%zmm10,%zmm22,%zmm0
c7c: 62 71 7c 48 28 d4 vmovaps %zmm4,%zmm10
c82: 62 72 0d 48 7f d1 vpermt2ps %zmm1,%zmm14,%zmm10
c88: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x3c0(%rsp)
c8f: 0f
c90: 62 f1 7c 48 28 c4 vmovaps %zmm4,%zmm0
c96: 62 f2 1d 40 7f e1 vpermt2ps %zmm1,%zmm28,%zmm4
c9c: 62 f2 15 48 7f c1 vpermt2ps %zmm1,%zmm13,%zmm0
ca2: 62 f3 fd 48 23 c6 e4 vshuff64x2 $0xe4,%zmm6,%zmm0,%zmm0
ca9: 62 91 7c 48 28 74 2e vmovaps 0xb80(%r14,%r13,1),%zmm6
cb0: 2e
cb1: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x2c0(%rsp)
cb8: 0b
cb9: 62 b3 c5 48 23 c4 e4 vshuff64x2 $0xe4,%zmm20,%zmm7,%zmm0
cc0: 62 f1 7c 48 28 7c 24 vmovaps 0x6c0(%rsp),%zmm7
cc7: 1b
cc8: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x340(%rsp)
ccf: 0d
cd0: 62 91 7c 48 28 c1 vmovaps %zmm25,%zmm0
cd6: 62 02 1d 40 7f c8 vpermt2ps %zmm24,%zmm28,%zmm25
cdc: 62 92 0d 48 7f c0 vpermt2ps %zmm24,%zmm14,%zmm0
ce2: 62 41 7c 48 28 c4 vmovaps %zmm12,%zmm24
ce8: 62 12 1d 40 7f e6 vpermt2ps %zmm30,%zmm28,%zmm12
cee: 62 02 0d 48 7f c6 vpermt2ps %zmm30,%zmm14,%zmm24
cf4: 62 01 7c 48 28 74 2e vmovaps 0xa40(%r14,%r13,1),%zmm30
cfb: 29
cfc: 62 93 dd 48 23 c9 e4 vshuff64x2 $0xe4,%zmm25,%zmm4,%zmm1
d03: 62 91 7c 48 28 64 2e vmovaps 0xbc0(%r14,%r13,1),%zmm4
d0a: 2f
d0b: 62 e3 ad 48 23 f0 e4 vshuff64x2 $0xe4,%zmm0,%zmm10,%zmm22
d12: 62 91 7c 48 28 c5 vmovaps %zmm29,%zmm0
d18: 62 02 1d 40 7f ea vpermt2ps %zmm26,%zmm28,%zmm29
d1e: 62 01 7c 48 28 64 2e vmovaps 0xa00(%r14,%r13,1),%zmm28
d25: 28
d26: 62 11 7c 48 28 54 2e vmovaps 0xac0(%r14,%r13,1),%zmm10
d2d: 2b
d2e: 62 92 0d 48 7f c2 vpermt2ps %zmm26,%zmm14,%zmm0
d34: 62 f1 fd 48 29 4c 24 vmovapd %zmm1,0x300(%rsp)
d3b: 0c
d3c: 62 91 7c 48 28 4c 2e vmovaps 0xb00(%r14,%r13,1),%zmm1
d43: 2c
d44: 62 61 7c 48 28 54 24 vmovaps 0x780(%rsp),%zmm26
d4b: 1e
d4c: 62 d3 95 40 23 dc e4 vshuff64x2 $0xe4,%zmm12,%zmm29,%zmm3
d53: 62 71 34 48 5c e7 vsubps %zmm7,%zmm9,%zmm12
d59: 62 71 7c 48 28 4c 24 vmovaps 0x680(%rsp),%zmm9
d60: 1a
d61: 62 81 7c 48 28 fc vmovaps %zmm28,%zmm23
d67: 62 c2 6d 40 7f ca vpermt2ps %zmm10,%zmm18,%zmm17
d6d: 62 83 fd 48 23 c0 e4 vshuff64x2 $0xe4,%zmm24,%zmm0,%zmm16
d74: 62 01 7c 48 28 c7 vmovaps %zmm31,%zmm24
d7a: 62 01 7c 48 28 cc vmovaps %zmm28,%zmm25
d80: 62 82 6d 40 7f fe vpermt2ps %zmm30,%zmm18,%zmm23
d86: 62 42 15 48 7f c2 vpermt2ps %zmm10,%zmm13,%zmm24
d8c: 62 02 15 48 7f ce vpermt2ps %zmm30,%zmm13,%zmm25
d92: 62 f1 fd 48 29 5c 24 vmovapd %zmm3,0x180(%rsp)
d99: 06
d9a: 62 f1 7c 48 28 5c 24 vmovaps 0x4c0(%rsp),%zmm3
da1: 13
da2: 62 c1 7c 40 5c c1 vsubps %zmm9,%zmm16,%zmm16
da8: 62 b3 c5 40 23 c1 e4 vshuff64x2 $0xe4,%zmm17,%zmm23,%zmm0
daf: 62 e1 7c 48 28 ce vmovaps %zmm6,%zmm17
db5: 62 e1 7c 48 28 f9 vmovaps %zmm1,%zmm23
dbb: 62 93 b5 40 23 d0 e4 vshuff64x2 $0xe4,%zmm24,%zmm25,%zmm2
dc2: 62 e2 6d 40 7f cc vpermt2ps %zmm4,%zmm18,%zmm17
dc8: 62 82 15 48 7f fb vpermt2ps %zmm27,%zmm13,%zmm23
dce: 62 71 64 48 5c dd vsubps %zmm5,%zmm3,%zmm11
dd4: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x540(%rsp)
ddb: 15
ddc: 62 f1 7c 48 28 c1 vmovaps %zmm1,%zmm0
de2: 62 f1 fd 48 29 54 24 vmovapd %zmm2,0x8c0(%rsp)
de9: 23
dea: 62 f1 7c 48 28 54 24 vmovaps 0x400(%rsp),%zmm2
df1: 10
df2: 62 92 6d 40 7f c3 vpermt2ps %zmm27,%zmm18,%zmm0
df8: 62 b3 fd 48 23 c1 e4 vshuff64x2 $0xe4,%zmm17,%zmm0,%zmm0
dff: 62 e1 7c 48 28 ce vmovaps %zmm6,%zmm17
e05: 62 c1 6c 48 5c e1 vsubps %zmm9,%zmm2,%zmm20
e0b: 62 f1 7c 48 28 54 24 vmovaps 0x440(%rsp),%zmm2
e12: 11
e13: 62 e1 7c 48 29 44 24 vmovaps %zmm16,0x440(%rsp)
e1a: 11
e1b: 62 e2 15 48 7f cc vpermt2ps %zmm4,%zmm13,%zmm17
e21: 62 f1 fd 48 29 44 24 vmovapd %zmm0,0x880(%rsp)
e28: 22
e29: 62 a3 c5 40 23 e9 e4 vshuff64x2 $0xe4,%zmm17,%zmm23,%zmm21
e30: 62 81 7c 48 28 cf vmovaps %zmm31,%zmm17
e36: 62 42 05 48 7f fa vpermt2ps %zmm10,%zmm15,%zmm31
e3c: 62 e1 7c 48 28 7c 24 vmovaps 0x740(%rsp),%zmm23
e43: 1d
e44: 62 f1 6c 48 5c d7 vsubps %zmm7,%zmm2,%zmm2
e4a: 62 c2 0d 48 7f ca vpermt2ps %zmm10,%zmm14,%zmm17
e50: 62 11 7c 48 28 d4 vmovaps %zmm28,%zmm10
e56: 62 f1 7c 48 29 54 24 vmovaps %zmm2,0x580(%rsp)
e5d: 16
e5e: 62 02 05 48 7f e6 vpermt2ps %zmm30,%zmm15,%zmm28
e64: 62 12 0d 48 7f d6 vpermt2ps %zmm30,%zmm14,%zmm10
e6a: 62 03 9d 40 23 f7 e4 vshuff64x2 $0xe4,%zmm31,%zmm28,%zmm30
e71: 62 a3 ad 48 23 d9 e4 vshuff64x2 $0xe4,%zmm17,%zmm10,%zmm19
e78: 62 51 7c 48 28 d3 vmovaps %zmm11,%zmm10
e7e: 62 52 25 48 a8 d0 vfmadd213ps %zmm8,%zmm11,%zmm10
e84: 62 52 1d 48 b8 d4 vfmadd231ps %zmm12,%zmm12,%zmm10
e8a: 62 32 5d 40 b8 d4 vfmadd231ps %zmm20,%zmm20,%zmm10
e90: 62 c2 7d 48 4e ca vrsqrt14ps %zmm10,%zmm17
e96: 62 31 2c 48 59 d1 vmulps %zmm17,%zmm10,%zmm10
e9c: 62 12 75 40 a8 d2 vfmadd213ps %zmm26,%zmm17,%zmm10
ea2: 62 a1 74 40 59 cf vmulps %zmm23,%zmm17,%zmm17
ea8: 62 d1 74 40 59 da vmulps %zmm10,%zmm17,%zmm3
eae: 62 71 7c 48 28 54 24 vmovaps 0x500(%rsp),%zmm10
eb5: 14
eb6: 62 61 2c 48 5c cd vsubps %zmm5,%zmm10,%zmm25
ebc: 62 71 7c 48 28 54 24 vmovaps 0x200(%rsp),%zmm10
ec3: 08
ec4: 62 81 7c 48 28 c9 vmovaps %zmm25,%zmm17
eca: 62 c2 35 40 a8 c8 vfmadd213ps %zmm8,%zmm25,%zmm17
ed0: 62 e2 6d 48 b8 ca vfmadd231ps %zmm2,%zmm2,%zmm17
ed6: 62 d1 2c 48 5c c1 vsubps %zmm9,%zmm10,%zmm0
edc: 62 71 7c 48 28 54 24 vmovaps 0x2c0(%rsp),%zmm10
ee3: 0b
ee4: 62 e2 7d 48 b8 c8 vfmadd231ps %zmm0,%zmm0,%zmm17
eea: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x5c0(%rsp)
ef1: 17
ef2: 62 d1 4c 40 5c c1 vsubps %zmm9,%zmm22,%zmm0
ef8: 62 22 7d 48 4e c1 vrsqrt14ps %zmm17,%zmm24
efe: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x200(%rsp)
f05: 08
f06: 62 81 74 40 59 c8 vmulps %zmm24,%zmm17,%zmm17
f0c: 62 82 3d 40 a8 ca vfmadd213ps %zmm26,%zmm24,%zmm17
f12: 62 21 3c 40 59 c7 vmulps %zmm23,%zmm24,%zmm24
f18: 62 b1 3c 40 59 d1 vmulps %zmm17,%zmm24,%zmm2
f1e: 62 e1 7c 48 28 4c 24 vmovaps 0x380(%rsp),%zmm17
f25: 0e
f26: 62 71 2c 48 5c d7 vsubps %zmm7,%zmm10,%zmm10
f2c: 62 71 7c 48 29 54 24 vmovaps %zmm10,0x380(%rsp)
f33: 0e
f34: 62 61 74 40 5c c5 vsubps %zmm5,%zmm17,%zmm24
f3a: 62 81 7c 48 28 c8 vmovaps %zmm24,%zmm17
f40: 62 c2 3d 40 a8 c8 vfmadd213ps %zmm8,%zmm24,%zmm17
f46: 62 c2 2d 48 b8 ca vfmadd231ps %zmm10,%zmm10,%zmm17
f4c: 62 71 7c 48 28 54 24 vmovaps 0x340(%rsp),%zmm10
f53: 0d
f54: 62 e2 7d 48 b8 c8 vfmadd231ps %zmm0,%zmm0,%zmm17
f5a: 62 a2 7d 48 4e f1 vrsqrt14ps %zmm17,%zmm22
f60: 62 a1 74 40 59 ce vmulps %zmm22,%zmm17,%zmm17
f66: 62 82 4d 40 a8 ca vfmadd213ps %zmm26,%zmm22,%zmm17
f6c: 62 a1 4c 40 59 f7 vmulps %zmm23,%zmm22,%zmm22
f72: 62 a1 4c 40 59 f1 vmulps %zmm17,%zmm22,%zmm22
f78: 62 e1 7c 48 28 4c 24 vmovaps 0x3c0(%rsp),%zmm17
f7f: 0f
f80: 62 71 2c 48 5c d7 vsubps %zmm7,%zmm10,%zmm10
f86: 62 71 7c 48 29 54 24 vmovaps %zmm10,0x340(%rsp)
f8d: 0d
f8e: 62 e1 74 40 5c cd vsubps %zmm5,%zmm17,%zmm17
f94: 62 21 7c 48 28 e9 vmovaps %zmm17,%zmm29
f9a: 62 42 75 40 a8 e8 vfmadd213ps %zmm8,%zmm17,%zmm29
fa0: 62 42 2d 48 b8 ea vfmadd231ps %zmm10,%zmm10,%zmm29
fa6: 62 71 64 48 59 54 24 vmulps 0x280(%rsp),%zmm3,%zmm10
fad: 0a
fae: 62 f1 64 48 59 db vmulps %zmm3,%zmm3,%zmm3
fb4: 62 22 7d 40 b8 e8 vfmadd231ps %zmm16,%zmm16,%zmm29
fba: 62 92 7d 48 4e c5 vrsqrt14ps %zmm29,%zmm0
fc0: 62 e1 14 40 59 c0 vmulps %zmm0,%zmm29,%zmm16
fc6: 62 61 7c 48 28 e9 vmovaps %zmm1,%zmm29
fcc: 62 92 05 48 7f cb vpermt2ps %zmm27,%zmm15,%zmm1
fd2: 62 82 7d 48 a8 c2 vfmadd213ps %zmm26,%zmm0,%zmm16
fd8: 62 b1 7c 48 59 c7 vmulps %zmm23,%zmm0,%zmm0
fde: 62 d1 64 48 59 da vmulps %zmm10,%zmm3,%zmm3
fe4: 62 02 0d 48 7f eb vpermt2ps %zmm27,%zmm14,%zmm29
fea: 62 a1 7c 48 59 c0 vmulps %zmm16,%zmm0,%zmm16
ff0: 62 f1 7c 48 28 c6 vmovaps %zmm6,%zmm0
ff6: 62 f2 05 48 7f f4 vpermt2ps %zmm4,%zmm15,%zmm6
ffc: 62 f2 0d 48 7f c4 vpermt2ps %zmm4,%zmm14,%zmm0
1002: 62 f1 6c 48 59 64 24 vmulps 0x480(%rsp),%zmm2,%zmm4
1009: 12
100a: 62 f1 6c 48 59 d2 vmulps %zmm2,%zmm2,%zmm2
1010: 62 61 6c 48 59 e4 vmulps %zmm4,%zmm2,%zmm28
1016: 62 f1 4c 40 59 54 24 vmulps 0x300(%rsp),%zmm22,%zmm2
101d: 0c
101e: 62 73 f5 48 23 d6 e4 vshuff64x2 $0xe4,%zmm6,%zmm1,%zmm10
1025: 62 b1 4c 40 59 ce vmulps %zmm22,%zmm22,%zmm1
102b: 62 d1 64 40 5c f1 vsubps %zmm9,%zmm19,%zmm6
1031: 62 e1 7c 40 59 5c 24 vmulps 0x180(%rsp),%zmm16,%zmm19
1038: 06
1039: 62 a1 7c 40 59 c0 vmulps %zmm16,%zmm16,%zmm16
103f: 62 63 95 40 23 e8 e4 vshuff64x2 $0xe4,%zmm0,%zmm29,%zmm29
1046: 62 f1 7c 48 28 44 24 vmovaps 0x880(%rsp),%zmm0
104d: 22
104e: 62 f1 7c 48 29 74 24 vmovaps %zmm6,0x280(%rsp)
1055: 0a
1056: 62 e1 74 48 59 f2 vmulps %zmm2,%zmm1,%zmm22
105c: 62 f1 7c 48 28 54 24 vmovaps 0x540(%rsp),%zmm2
1063: 15
1064: 62 91 7c 48 28 4c 2e vmovaps 0xd80(%r14,%r13,1),%zmm1
106b: 36
106c: 62 51 14 40 5c c9 vsubps %zmm9,%zmm29,%zmm9
1072: 62 71 7c 48 29 4c 24 vmovaps %zmm9,0x4c0(%rsp)
1079: 13
107a: 62 61 7c 48 5c dd vsubps %zmm5,%zmm0,%zmm27
1080: 62 f1 7c 48 28 c7 vmovaps %zmm7,%zmm0
1086: 62 f1 6c 48 5c e5 vsubps %zmm5,%zmm2,%zmm4
108c: 62 f1 7c 48 28 54 24 vmovaps 0x8c0(%rsp),%zmm2
1093: 23
1094: 62 91 7c 48 28 eb vmovaps %zmm27,%zmm5
109a: 62 61 7c 48 29 5c 24 vmovaps %zmm27,0x400(%rsp)
10a1: 10
10a2: 62 f1 7c 48 29 64 24 vmovaps %zmm4,0x3c0(%rsp)
10a9: 0f
10aa: 62 d2 5d 48 a8 e0 vfmadd213ps %zmm8,%zmm4,%zmm4
10b0: 62 d2 25 40 a8 e8 vfmadd213ps %zmm8,%zmm27,%zmm5
10b6: 62 21 7c 40 59 db vmulps %zmm19,%zmm16,%zmm27
10bc: 62 11 7c 48 28 44 2e vmovaps 0xcc0(%r14,%r13,1),%zmm8
10c3: 33
10c4: 62 81 7c 48 28 44 2e vmovaps 0xc40(%r14,%r13,1),%zmm16
10cb: 31
10cc: 62 f1 6c 48 5c ff vsubps %zmm7,%zmm2,%zmm7
10d2: 62 f1 54 40 5c d0 vsubps %zmm0,%zmm21,%zmm2
10d8: 62 e1 7c 48 28 e8 vmovaps %zmm0,%zmm21
10de: 62 f1 7c 48 28 44 24 vmovaps 0x80(%rsp),%zmm0
10e5: 02
10e6: 62 f2 45 48 b8 e7 vfmadd231ps %zmm7,%zmm7,%zmm4
10ec: 62 f1 7c 48 29 7c 24 vmovaps %zmm7,0x480(%rsp)
10f3: 12
10f4: 62 f2 6d 48 b8 ea vfmadd231ps %zmm2,%zmm2,%zmm5
10fa: 62 f1 7c 48 29 54 24 vmovaps %zmm2,0x500(%rsp)
1101: 14
1102: 62 91 7c 48 28 54 2e vmovaps 0xc00(%r14,%r13,1),%zmm2
1109: 30
110a: 62 f2 4d 48 b8 e6 vfmadd231ps %zmm6,%zmm6,%zmm4
1110: 62 d2 35 48 b8 e9 vfmadd231ps %zmm9,%zmm9,%zmm5
1116: 62 91 7c 48 28 74 2e vmovaps 0xdc0(%r14,%r13,1),%zmm6
111d: 37
111e: 62 f2 7d 48 4e fc vrsqrt14ps %zmm4,%zmm7
1124: 62 f1 5c 48 59 e7 vmulps %zmm7,%zmm4,%zmm4
112a: 62 92 45 48 a8 e2 vfmadd213ps %zmm26,%zmm7,%zmm4
1130: 62 b1 44 48 59 ff vmulps %zmm23,%zmm7,%zmm7
1136: 62 f1 44 48 59 e4 vmulps %zmm4,%zmm7,%zmm4
113c: 62 f2 7d 48 4e fd vrsqrt14ps %zmm5,%zmm7
1142: 62 f1 54 48 59 ef vmulps %zmm7,%zmm5,%zmm5
1148: 62 d2 65 48 b8 c3 vfmadd231ps %zmm11,%zmm3,%zmm0
114e: 62 92 45 48 a8 ea vfmadd213ps %zmm26,%zmm7,%zmm5
1154: 62 b1 44 48 59 ff vmulps %zmm23,%zmm7,%zmm7
115a: 62 71 7c 48 28 d9 vmovaps %zmm1,%zmm11
1160: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x80(%rsp)
1167: 02
1168: 62 f1 7c 48 28 44 24 vmovaps 0xc0(%rsp),%zmm0
116f: 03
1170: 62 71 44 48 59 cd vmulps %zmm5,%zmm7,%zmm9
1176: 62 91 7c 48 28 6c 2e vmovaps 0xc80(%r14,%r13,1),%zmm5
117d: 32
117e: 62 f1 0c 40 59 fc vmulps %zmm4,%zmm30,%zmm7
1184: 62 61 7c 48 28 ea vmovaps %zmm2,%zmm29
118a: 62 f1 5c 48 59 e4 vmulps %zmm4,%zmm4,%zmm4
1190: 62 61 7c 48 28 f2 vmovaps %zmm2,%zmm30
1196: 62 61 5c 48 59 ff vmulps %zmm7,%zmm4,%zmm31
119c: 62 91 7c 48 28 64 2e vmovaps 0xd40(%r14,%r13,1),%zmm4
11a3: 35
11a4: 62 22 15 48 7f e8 vpermt2ps %zmm16,%zmm13,%zmm29
11aa: 62 22 6d 40 7f f0 vpermt2ps %zmm16,%zmm18,%zmm30
11b0: 62 72 15 48 7f de vpermt2ps %zmm6,%zmm13,%zmm11
11b6: 62 51 2c 48 59 d1 vmulps %zmm9,%zmm10,%zmm10
11bc: 62 51 34 48 59 c9 vmulps %zmm9,%zmm9,%zmm9
11c2: 62 f1 7c 48 28 f9 vmovaps %zmm1,%zmm7
11c8: 62 f2 6d 40 7f fe vpermt2ps %zmm6,%zmm18,%zmm7
11ce: 62 d2 65 48 b8 c4 vfmadd231ps %zmm12,%zmm3,%zmm0
11d4: 62 71 7c 48 28 64 24 vmovaps 0x100(%rsp),%zmm12
11db: 04
11dc: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0xc0(%rsp)
11e3: 03
11e4: 62 91 7c 48 28 44 2e vmovaps 0xd00(%r14,%r13,1),%zmm0
11eb: 34
11ec: 62 32 65 48 b8 e4 vfmadd231ps %zmm20,%zmm3,%zmm12
11f2: 62 e1 7c 48 28 e5 vmovaps %zmm5,%zmm20
11f8: 62 f1 7c 48 28 dd vmovaps %zmm5,%zmm3
11fe: 62 c2 15 48 7f e0 vpermt2ps %zmm8,%zmm13,%zmm20
1204: 62 d2 6d 40 7f d8 vpermt2ps %zmm8,%zmm18,%zmm3
120a: 62 e1 7c 48 28 d8 vmovaps %zmm0,%zmm19
1210: 62 72 4d 40 b8 64 24 vfmadd231ps 0x200(%rsp),%zmm22,%zmm12
1217: 08
1218: 62 e2 6d 40 7f dc vpermt2ps %zmm4,%zmm18,%zmm19
121e: 62 72 05 40 b8 64 24 vfmadd231ps 0x280(%rsp),%zmm31,%zmm12
1225: 0a
1226: 62 a3 95 40 23 e4 e4 vshuff64x2 $0xe4,%zmm20,%zmm29,%zmm20
122d: 62 61 7c 48 28 e8 vmovaps %zmm0,%zmm29
1233: 62 63 8d 40 23 f3 e4 vshuff64x2 $0xe4,%zmm3,%zmm30,%zmm30
123a: 62 d1 34 48 59 da vmulps %zmm10,%zmm9,%zmm3
1240: 62 71 7c 48 28 d2 vmovaps %zmm2,%zmm10
1246: 62 b2 05 48 7f d0 vpermt2ps %zmm16,%zmm15,%zmm2
124c: 62 62 15 48 7f ec vpermt2ps %zmm4,%zmm13,%zmm29
1252: 62 f1 7c 48 29 5c 24 vmovaps %zmm3,0x540(%rsp)
1259: 15
125a: 62 f1 7c 48 28 dd vmovaps %zmm5,%zmm3
1260: 62 32 0d 48 7f d0 vpermt2ps %zmm16,%zmm14,%zmm10
1266: 62 e1 7c 48 28 44 24 vmovaps 0x700(%rsp),%zmm16
126d: 1c
126e: 62 d2 05 48 7f e8 vpermt2ps %zmm8,%zmm15,%zmm5
1274: 62 f3 e5 40 23 ff e4 vshuff64x2 $0xe4,%zmm7,%zmm19,%zmm7
127b: 62 71 7c 48 29 64 24 vmovaps %zmm12,0x100(%rsp)
1282: 04
1283: 62 d2 0d 48 7f d8 vpermt2ps %zmm8,%zmm14,%zmm3
1289: 62 71 7c 48 28 c1 vmovaps %zmm1,%zmm8
128f: 62 f2 05 48 7f ce vpermt2ps %zmm6,%zmm15,%zmm1
1295: 62 72 0d 48 7f c6 vpermt2ps %zmm6,%zmm14,%zmm8
129b: 62 43 95 40 23 eb e4 vshuff64x2 $0xe4,%zmm11,%zmm29,%zmm29
12a2: 62 71 7c 48 28 d8 vmovaps %zmm0,%zmm11
12a8: 62 f2 05 48 7f c4 vpermt2ps %zmm4,%zmm15,%zmm0
12ae: 62 f3 ed 48 23 ed e4 vshuff64x2 $0xe4,%zmm5,%zmm2,%zmm5
12b5: 62 b1 7c 48 28 d5 vmovaps %zmm21,%zmm2
12bb: 62 73 ad 48 23 cb e4 vshuff64x2 $0xe4,%zmm3,%zmm10,%zmm9
12c2: 62 f1 7c 48 28 5c 24 vmovaps 0x240(%rsp),%zmm3
12c9: 09
12ca: 62 b1 0c 40 5c f0 vsubps %zmm16,%zmm30,%zmm6
12d0: 62 21 7c 48 28 f0 vmovaps %zmm16,%zmm30
12d6: 62 72 0d 48 7f dc vpermt2ps %zmm4,%zmm14,%zmm11
12dc: 62 f1 7c 48 28 64 24 vmovaps 0x80(%rsp),%zmm4
12e3: 02
12e4: 62 71 14 40 5c d2 vsubps %zmm2,%zmm29,%zmm10
12ea: 62 71 7c 48 29 54 24 vmovaps %zmm10,0x180(%rsp)
12f1: 06
12f2: 62 e3 fd 48 23 d9 e4 vshuff64x2 $0xe4,%zmm1,%zmm0,%zmm19
12f9: 62 b1 44 48 5c c0 vsubps %zmm16,%zmm7,%zmm0
12ff: 62 e1 7c 48 28 44 24 vmovaps 0x600(%rsp),%zmm16
1306: 18
1307: 62 53 a5 48 23 c0 e4 vshuff64x2 $0xe4,%zmm8,%zmm11,%zmm8
130e: 62 71 7c 48 28 5c 24 vmovaps 0x140(%rsp),%zmm11
1315: 05
1316: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x2c0(%rsp)
131d: 0b
131e: 62 92 1d 40 b8 d9 vfmadd231ps %zmm25,%zmm28,%zmm3
1324: 62 21 5c 40 5c cd vsubps %zmm21,%zmm20,%zmm25
132a: 62 e1 7c 48 28 64 24 vmovaps 0x680(%rsp),%zmm20
1331: 1a
1332: 62 92 4d 40 b8 e0 vfmadd231ps %zmm24,%zmm22,%zmm4
1338: 62 e1 7c 48 28 6c 24 vmovaps 0x1c0(%rsp),%zmm21
133f: 07
1340: 62 01 7c 48 28 44 2e vmovaps 0xfc0(%r14,%r13,1),%zmm24
1347: 3f
1348: 62 b2 25 40 b8 d9 vfmadd231ps %zmm17,%zmm27,%zmm3
134e: 62 61 7c 48 29 4c 24 vmovaps %zmm25,0x300(%rsp)
1355: 0c
1356: 62 f2 05 40 b8 64 24 vfmadd231ps 0x3c0(%rsp),%zmm31,%zmm4
135d: 0f
135e: 62 f1 7c 48 29 64 24 vmovaps %zmm4,0x80(%rsp)
1365: 02
1366: 62 b2 7d 48 a8 c0 vfmadd213ps %zmm16,%zmm0,%zmm0
136c: 62 d2 2d 48 b8 c2 vfmadd231ps %zmm10,%zmm10,%zmm0
1372: 62 71 7c 48 28 d3 vmovaps %zmm3,%zmm10
1378: 62 f1 7c 48 28 5c 24 vmovaps 0xc0(%rsp),%zmm3
137f: 03
1380: 62 72 1d 40 b8 5c 24 vfmadd231ps 0x580(%rsp),%zmm28,%zmm11
1387: 16
1388: 62 f1 7c 48 29 74 24 vmovaps %zmm6,0x580(%rsp)
138f: 16
1390: 62 b2 4d 48 a8 f0 vfmadd213ps %zmm16,%zmm6,%zmm6
1396: 62 b1 34 48 5c fc vsubps %zmm20,%zmm9,%zmm7
139c: 62 92 35 40 b8 f1 vfmadd231ps %zmm25,%zmm25,%zmm6
13a2: 62 01 7c 48 28 4c 2e vmovaps 0xf80(%r14,%r13,1),%zmm25
13a9: 3e
13aa: 62 31 3c 48 5c c4 vsubps %zmm20,%zmm8,%zmm8
13b0: 62 11 7c 48 28 4c 2e vmovaps 0xf40(%r14,%r13,1),%zmm9
13b7: 3d
13b8: 62 e2 1d 40 b8 6c 24 vfmadd231ps 0x5c0(%rsp),%zmm28,%zmm21
13bf: 17
13c0: 62 f2 45 48 b8 f7 vfmadd231ps %zmm7,%zmm7,%zmm6
13c6: 62 f1 7c 48 29 7c 24 vmovaps %zmm7,0x240(%rsp)
13cd: 09
13ce: 62 d2 3d 48 b8 c0 vfmadd231ps %zmm8,%zmm8,%zmm0
13d4: 62 71 7c 48 29 44 24 vmovaps %zmm8,0x1c0(%rsp)
13db: 07
13dc: 62 f2 7d 48 4e fe vrsqrt14ps %zmm6,%zmm7
13e2: 62 f2 7d 48 4e d0 vrsqrt14ps %zmm0,%zmm2
13e8: 62 f1 4c 48 59 cf vmulps %zmm7,%zmm6,%zmm1
13ee: 62 f1 7c 48 59 f2 vmulps %zmm2,%zmm0,%zmm6
13f4: 62 72 25 40 b8 5c 24 vfmadd231ps 0x340(%rsp),%zmm27,%zmm11
13fb: 0d
13fc: 62 92 45 48 a8 ca vfmadd213ps %zmm26,%zmm7,%zmm1
1402: 62 b1 44 48 59 ff vmulps %zmm23,%zmm7,%zmm7
1408: 62 92 6d 48 a8 f2 vfmadd213ps %zmm26,%zmm2,%zmm6
140e: 62 e2 25 40 b8 6c 24 vfmadd231ps 0x440(%rsp),%zmm27,%zmm21
1415: 11
1416: 62 f1 44 48 59 c9 vmulps %zmm1,%zmm7,%zmm1
141c: 62 b1 6c 48 59 ff vmulps %zmm23,%zmm2,%zmm7
1422: 62 f1 44 48 59 f6 vmulps %zmm6,%zmm7,%zmm6
1428: 62 f1 74 48 59 c1 vmulps %zmm1,%zmm1,%zmm0
142e: 62 f1 54 48 59 c9 vmulps %zmm1,%zmm5,%zmm1
1434: 62 91 7c 48 28 7c 2e vmovaps 0xe80(%r14,%r13,1),%zmm7
143b: 3a
143c: 62 91 7c 48 28 6c 2e vmovaps 0xe00(%r14,%r13,1),%zmm5
1443: 38
1444: 62 f2 4d 40 b8 5c 24 vfmadd231ps 0x380(%rsp),%zmm22,%zmm3
144b: 0e
144c: 62 81 7c 48 28 74 2e vmovaps 0xf00(%r14,%r13,1),%zmm22
1453: 3c
1454: 62 e1 7c 48 59 c9 vmulps %zmm1,%zmm0,%zmm17
145a: 62 f1 64 40 59 c6 vmulps %zmm6,%zmm19,%zmm0
1460: 62 71 4c 48 59 c6 vmulps %zmm6,%zmm6,%zmm8
1466: 62 91 7c 48 28 4c 2e vmovaps 0xe40(%r14,%r13,1),%zmm1
146d: 39
146e: 62 91 7c 48 28 74 2e vmovaps 0xec0(%r14,%r13,1),%zmm6
1475: 3b
1476: 62 71 7c 48 29 5c 24 vmovaps %zmm11,0x140(%rsp)
147d: 05
147e: 62 f1 3c 48 59 d0 vmulps %zmm0,%zmm8,%zmm2
1484: 62 11 7c 48 28 c1 vmovaps %zmm25,%zmm8
148a: 62 12 6d 40 7f c0 vpermt2ps %zmm24,%zmm18,%zmm8
1490: 62 f2 05 40 b8 5c 24 vfmadd231ps 0x480(%rsp),%zmm31,%zmm3
1497: 12
1498: 62 61 7c 48 28 df vmovaps %zmm7,%zmm27
149e: 62 f1 7c 48 28 c5 vmovaps %zmm5,%zmm0
14a4: 62 71 7c 48 28 df vmovaps %zmm7,%zmm11
14aa: 62 71 7c 48 28 e7 vmovaps %zmm7,%zmm12
14b0: 62 e1 7c 48 28 dd vmovaps %zmm5,%zmm19
14b6: 62 21 7c 48 28 e6 vmovaps %zmm22,%zmm28
14bc: 62 21 7c 48 28 ee vmovaps %zmm22,%zmm29
14c2: 62 62 6d 40 7f de vpermt2ps %zmm6,%zmm18,%zmm27
14c8: 62 f2 6d 40 7f c1 vpermt2ps %zmm1,%zmm18,%zmm0
14ce: 62 72 15 48 7f de vpermt2ps %zmm6,%zmm13,%zmm11
14d4: 62 72 0d 48 7f e6 vpermt2ps %zmm6,%zmm14,%zmm12
14da: 62 e2 0d 48 7f d9 vpermt2ps %zmm1,%zmm14,%zmm19
14e0: 62 f2 05 48 7f fe vpermt2ps %zmm6,%zmm15,%zmm7
14e6: 62 42 6d 40 7f e1 vpermt2ps %zmm9,%zmm18,%zmm28
14ec: 62 81 7c 48 28 d1 vmovaps %zmm25,%zmm18
14f2: 62 42 15 48 7f e9 vpermt2ps %zmm9,%zmm13,%zmm29
14f8: 62 82 15 48 7f d0 vpermt2ps %zmm24,%zmm13,%zmm18
14fe: 62 93 fd 48 23 e3 e4 vshuff64x2 $0xe4,%zmm27,%zmm0,%zmm4
1505: 62 f1 7c 48 28 44 24 vmovaps 0x540(%rsp),%zmm0
150c: 15
150d: 62 53 e5 40 23 e4 e4 vshuff64x2 $0xe4,%zmm12,%zmm19,%zmm12
1514: 62 81 7c 48 28 d9 vmovaps %zmm25,%zmm19
151a: 62 02 05 48 7f c8 vpermt2ps %zmm24,%zmm15,%zmm25
1520: 62 53 9d 40 23 c0 e4 vshuff64x2 $0xe4,%zmm8,%zmm28,%zmm8
1527: 62 61 7c 48 28 e5 vmovaps %zmm5,%zmm28
152d: 62 f2 05 48 7f e9 vpermt2ps %zmm1,%zmm15,%zmm5
1533: 62 82 0d 48 7f d8 vpermt2ps %zmm24,%zmm14,%zmm19
1539: 62 62 15 48 7f e1 vpermt2ps %zmm1,%zmm13,%zmm28
153f: 62 72 7d 48 b8 54 24 vfmadd231ps 0x400(%rsp),%zmm0,%zmm10
1546: 10
1547: 62 e2 7d 48 b8 6c 24 vfmadd231ps 0x4c0(%rsp),%zmm0,%zmm21
154e: 13
154f: 62 f3 d5 48 23 cf e4 vshuff64x2 $0xe4,%zmm7,%zmm5,%zmm1
1556: 62 91 3c 48 5c ee vsubps %zmm30,%zmm8,%zmm5
155c: 62 31 1c 48 5c c4 vsubps %zmm20,%zmm12,%zmm8
1562: 62 43 9d 40 23 db e4 vshuff64x2 $0xe4,%zmm11,%zmm28,%zmm27
1569: 62 23 95 40 23 e2 e4 vshuff64x2 $0xe4,%zmm18,%zmm29,%zmm28
1570: 62 e1 7c 48 28 54 24 vmovaps 0x140(%rsp),%zmm18
1577: 05
1578: 62 71 7c 48 28 db vmovaps %zmm3,%zmm11
157e: 62 b1 7c 48 28 de vmovaps %zmm22,%zmm3
1584: 62 c2 05 48 7f f1 vpermt2ps %zmm9,%zmm15,%zmm22
158a: 62 d2 0d 48 7f d9 vpermt2ps %zmm9,%zmm14,%zmm3
1590: 62 71 7c 48 28 74 24 vmovaps 0x80(%rsp),%zmm14
1597: 02
1598: 62 51 7c 48 28 eb vmovaps %zmm11,%zmm13
159e: 62 71 7c 48 28 5c 24 vmovaps 0x100(%rsp),%zmm11
15a5: 04
15a6: 62 72 75 40 b8 6c 24 vfmadd231ps 0x300(%rsp),%zmm17,%zmm13
15ad: 0c
15ae: 62 72 6d 48 b8 54 24 vfmadd231ps 0x2c0(%rsp),%zmm2,%zmm10
15b5: 0b
15b6: 62 e2 6d 48 b8 6c 24 vfmadd231ps 0x1c0(%rsp),%zmm2,%zmm21
15bd: 07
15be: 62 e2 7d 48 b8 54 24 vfmadd231ps 0x500(%rsp),%zmm0,%zmm18
15c5: 14
15c6: 62 91 5c 48 5c c6 vsubps %zmm30,%zmm4,%zmm0
15cc: 62 f1 7c 48 28 64 24 vmovaps 0x6c0(%rsp),%zmm4
15d3: 1b
15d4: 62 a3 e5 48 23 db e4 vshuff64x2 $0xe4,%zmm19,%zmm3,%zmm19
15db: 62 93 cd 40 23 d9 e4 vshuff64x2 $0xe4,%zmm25,%zmm22,%zmm3
15e2: 62 71 7c 48 28 e0 vmovaps %zmm0,%zmm12
15e8: 62 72 75 40 b8 74 24 vfmadd231ps 0x580(%rsp),%zmm17,%zmm14
15ef: 16
15f0: 62 72 75 40 b8 5c 24 vfmadd231ps 0x240(%rsp),%zmm17,%zmm11
15f7: 09
15f8: 62 32 7d 48 a8 e0 vfmadd213ps %zmm16,%zmm0,%zmm12
15fe: 62 31 64 40 5c cc vsubps %zmm20,%zmm19,%zmm9
1604: 62 e2 6d 48 b8 54 24 vfmadd231ps 0x180(%rsp),%zmm2,%zmm18
160b: 06
160c: 62 f1 24 40 5c f4 vsubps %zmm4,%zmm27,%zmm6
1612: 62 f1 1c 40 5c fc vsubps %zmm4,%zmm28,%zmm7
1618: 62 f1 7c 48 28 e5 vmovaps %zmm5,%zmm4
161e: 62 b2 55 48 a8 e0 vfmadd213ps %zmm16,%zmm5,%zmm4
1624: 62 72 4d 48 b8 e6 vfmadd231ps %zmm6,%zmm6,%zmm12
162a: 62 f2 45 48 b8 e7 vfmadd231ps %zmm7,%zmm7,%zmm4
1630: 62 52 3d 48 b8 e0 vfmadd231ps %zmm8,%zmm8,%zmm12
1636: 62 d2 35 48 b8 e1 vfmadd231ps %zmm9,%zmm9,%zmm4
163c: 62 c2 7d 48 4e e4 vrsqrt14ps %zmm12,%zmm20
1642: 62 e2 7d 48 4e c4 vrsqrt14ps %zmm4,%zmm16
1648: 62 31 1c 48 59 e4 vmulps %zmm20,%zmm12,%zmm12
164e: 62 a1 5c 48 59 d8 vmulps %zmm16,%zmm4,%zmm19
1654: 62 12 5d 40 a8 e2 vfmadd213ps %zmm26,%zmm20,%zmm12
165a: 62 a1 5c 40 59 e7 vmulps %zmm23,%zmm20,%zmm20
1660: 62 b1 7c 40 59 e7 vmulps %zmm23,%zmm16,%zmm4
1666: 62 82 7d 40 a8 da vfmadd213ps %zmm26,%zmm16,%zmm19
166c: 62 51 5c 40 59 e4 vmulps %zmm12,%zmm20,%zmm12
1672: 62 b1 5c 48 59 e3 vmulps %zmm19,%zmm4,%zmm4
1678: 62 c1 1c 48 59 c4 vmulps %zmm12,%zmm12,%zmm16
167e: 62 d1 74 48 59 cc vmulps %zmm12,%zmm1,%zmm1
1684: 62 f1 64 48 59 d4 vmulps %zmm4,%zmm3,%zmm2
168a: 62 e1 5c 48 59 cc vmulps %zmm4,%zmm4,%zmm17
1690: 62 f1 7c 40 59 c9 vmulps %zmm1,%zmm16,%zmm1
1696: 62 d1 7c 48 28 de vmovaps %zmm14,%zmm3
169c: 62 b1 7c 48 28 e2 vmovaps %zmm18,%zmm4
16a2: 62 f1 74 40 59 d2 vmulps %zmm2,%zmm17,%zmm2
16a8: 62 f2 75 48 b8 d8 vfmadd231ps %zmm0,%zmm1,%zmm3
16ae: 62 72 6d 48 b8 d5 vfmadd231ps %zmm5,%zmm2,%zmm10
16b4: 62 f2 6d 48 b8 e7 vfmadd231ps %zmm7,%zmm2,%zmm4
16ba: 62 c2 6d 48 b8 e9 vfmadd231ps %zmm9,%zmm2,%zmm21
16c0: 62 f1 2c 48 58 c3 vaddps %zmm3,%zmm10,%zmm0
16c6: 62 d1 7c 48 28 dd vmovaps %zmm13,%zmm3
16cc: 62 f3 fd 48 1b c2 01 vextractf64x4 $0x1,%zmm0,%ymm2
16d3: 62 f2 75 48 b8 de vfmadd231ps %zmm6,%zmm1,%zmm3
16d9: 62 f1 5c 48 58 db vaddps %zmm3,%zmm4,%zmm3
16df: 62 d1 7c 48 28 e3 vmovaps %zmm11,%zmm4
16e5: 62 f1 7c 48 58 c2 vaddps %zmm2,%zmm0,%zmm0
16eb: 62 d2 75 48 b8 e0 vfmadd231ps %zmm8,%zmm1,%zmm4
16f1: c4 e3 7d 19 c2 01 vextractf128 $0x1,%ymm0,%xmm2
16f7: 62 f1 54 40 58 cc vaddps %zmm4,%zmm21,%zmm1
16fd: 62 f3 fd 48 1b dc 01 vextractf64x4 $0x1,%zmm3,%ymm4
1704: c5 f8 58 c2 vaddps %xmm2,%xmm0,%xmm0
1708: 62 f1 64 48 58 dc vaddps %zmm4,%zmm3,%zmm3
170e: c4 e3 7d 19 dc 01 vextractf128 $0x1,%ymm3,%xmm4
1714: c4 e3 79 05 d0 01 vpermilpd $0x1,%xmm0,%xmm2
171a: c5 e0 58 dc vaddps %xmm4,%xmm3,%xmm3
171e: c5 f8 58 c2 vaddps %xmm2,%xmm0,%xmm0
1722: c5 fa 16 d0 vmovshdup %xmm0,%xmm2
1726: c5 fa 58 c2 vaddss %xmm2,%xmm0,%xmm0
172a: c5 f8 29 84 24 00 01 vmovaps %xmm0,0x100(%rsp)
1731: 00 00
1733: c4 e3 79 05 c3 01 vpermilpd $0x1,%xmm3,%xmm0
1739: c5 e0 58 c0 vaddps %xmm0,%xmm3,%xmm0
173d: 62 f3 fd 48 1b cb 01 vextractf64x4 $0x1,%zmm1,%ymm3
1744: 62 f1 74 48 58 cb vaddps %zmm3,%zmm1,%zmm1
174a: c5 fa 16 d0 vmovshdup %xmm0,%xmm2
174e: c5 fa 58 c2 vaddss %xmm2,%xmm0,%xmm0
1752: c5 f8 29 84 24 c0 00 vmovaps %xmm0,0xc0(%rsp)
1759: 00 00
175b: c4 e3 7d 19 c8 01 vextractf128 $0x1,%ymm1,%xmm0
1761: c5 f0 58 c0 vaddps %xmm0,%xmm1,%xmm0
1765: c4 e3 79 05 c8 01 vpermilpd $0x1,%xmm0,%xmm1
176b: c5 f8 58 c1 vaddps %xmm1,%xmm0,%xmm0
176f: c5 fa 16 c8 vmovshdup %xmm0,%xmm1
1773: c5 fa 58 c1 vaddss %xmm1,%xmm0,%xmm0
1777: c5 f8 29 84 24 80 00 vmovaps %xmm0,0x80(%rsp)
177e: 00 00
1780: c5 f8 77 vzeroupper
1783: 41 ff d4 call *%r12
1786: 41 81 c7 00 01 00 00 add $0x100,%r15d
178d: 41 39 df cmp %ebx,%r15d
1790: 0f 82 5a ea ff ff jb 1f0 <simplified_nbody+0x1f0>
1796: eb 27 jmp 17bf <simplified_nbody+0x17bf>
1798: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
179c: c5 f8 29 84 24 00 01 vmovaps %xmm0,0x100(%rsp)
17a3: 00 00
17a5: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
17a9: c5 f8 29 84 24 c0 00 vmovaps %xmm0,0xc0(%rsp)
17b0: 00 00
17b2: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
17b6: c5 f8 29 84 24 80 00 vmovaps %xmm0,0x80(%rsp)
17bd: 00 00
17bf: 48 8b 44 24 50 mov 0x50(%rsp),%rax
17c4: c5 f8 28 94 24 00 01 vmovaps 0x100(%rsp),%xmm2
17cb: 00 00
17cd: c5 f8 28 9c 24 c0 00 vmovaps 0xc0(%rsp),%xmm3
17d4: 00 00
17d6: c5 f8 28 a4 24 80 00 vmovaps 0x80(%rsp),%xmm4
17dd: 00 00
17df: 48 8b 4c 24 58 mov 0x58(%rsp),%rcx
17e4: c5 fa 10 00 vmovss (%rax),%xmm0
17e8: 48 b8 00 00 00 00 00 movabs $0x0,%rax
17ef: 00 00 00
17f2: c4 c1 7a 10 0c 06 vmovss (%r14,%rax,1),%xmm1
17f8: 48 8b 44 24 68 mov 0x68(%rsp),%rax
17fd: c4 e2 79 a9 54 24 3c vfmadd213ss 0x3c(%rsp),%xmm0,%xmm2
1804: c4 e2 79 a9 5c 24 40 vfmadd213ss 0x40(%rsp),%xmm0,%xmm3
180b: c4 e2 79 a9 64 24 44 vfmadd213ss 0x44(%rsp),%xmm0,%xmm4
1812: c5 ea 59 d1 vmulss %xmm1,%xmm2,%xmm2
1816: c5 e2 59 d9 vmulss %xmm1,%xmm3,%xmm3
181a: c5 da 59 c9 vmulss %xmm1,%xmm4,%xmm1
181e: c5 fa 10 24 08 vmovss (%rax,%rcx,1),%xmm4
1823: c4 e2 69 b9 e0 vfmadd231ss %xmm0,%xmm2,%xmm4
1828: c5 fa 11 24 08 vmovss %xmm4,(%rax,%rcx,1)
182d: c5 fa 10 64 08 04 vmovss 0x4(%rax,%rcx,1),%xmm4
1833: c4 e2 61 b9 e0 vfmadd231ss %xmm0,%xmm3,%xmm4
1838: c5 fa 11 64 08 04 vmovss %xmm4,0x4(%rax,%rcx,1)
183e: c4 e2 71 a9 44 08 08 vfmadd213ss 0x8(%rax,%rcx,1),%xmm1,%xmm0
1845: c5 fa 11 44 08 08 vmovss %xmm0,0x8(%rax,%rcx,1)
184b: 48 8b 4c 24 60 mov 0x60(%rsp),%rcx
1850: c5 fa 11 11 vmovss %xmm2,(%rcx)
1854: c5 fa 11 59 04 vmovss %xmm3,0x4(%rcx)
1859: c5 fa 11 49 08 vmovss %xmm1,0x8(%rcx)
185e: 48 8d 65 d8 lea -0x28(%rbp),%rsp
1862: 5b pop %rbx
1863: 41 5c pop %r12
1865: 41 5d pop %r13
1867: 41 5e pop %r14
1869: 41 5f pop %r15
186b: 5d pop %rbp
186c: c3 ret
Host-Compute (ARM CPU)
Note that the compiler would usually directly output a .bin file (ELF format). The output below comes from disassembling it with objdump -d.
Also note that this has been compiled for the arm-7 target (ARMv8.6 + FP16 + FP16FML, e.g. Apple M2+/A15+).
nbody_aarch64.bin: file format elf64-littleaarch64
Disassembly of section .text:
0000000000000000 <simplified_nbody>:
0: d104c3ff sub sp, sp, #0x130
4: 90000008 adrp x8, 0 <floor_global_idx>
8: 6d0a33ed stp d13, d12, [sp, #160]
c: 6d0b2beb stp d11, d10, [sp, #176]
10: 6d0c23e9 stp d9, d8, [sp, #192]
14: a90d7bfd stp x29, x30, [sp, #208]
18: 910343fd add x29, sp, #0xd0
1c: a90e6ffc stp x28, x27, [sp, #224]
20: a90f67fa stp x26, x25, [sp, #240]
24: a9105ff8 stp x24, x23, [sp, #256]
28: a91157f6 stp x22, x21, [sp, #272]
2c: a9124ff4 stp x20, x19, [sp, #288]
30: f9400108 ldr x8, [x8]
34: b9400117 ldr w23, [x8]
38: 52800188 mov w8, #0xc // #12
3c: 9b080af6 madd x22, x23, x8, x2
40: 90000008 adrp x8, 0 <floor_global_work_size>
44: aa1603f8 mov x24, x22
48: f9400108 ldr x8, [x8]
4c: fd4002c8 ldr d8, [x22]
50: bc408f09 ldr s9, [x24, #8]!
54: b9400119 ldr w25, [x8]
58: 34000d79 cbz w25, 204 <simplified_nbody+0x204>
5c: 2f00e403 movi d3, #0x0
60: 8b171008 add x8, x0, x23, lsl #4
64: 9000001c adrp x28, 0 <floor_local_idx>
68: a90007e3 stp x3, x1, [sp]
6c: 90000013 adrp x19, 0 <simplified_nbody>
70: 90000014 adrp x20, 0 <host_compute_device_barrier>
74: aa0003f5 mov x21, x0
78: 2a1f03fa mov w26, wzr
7c: f940039c ldr x28, [x28]
80: 3c9a03a3 stur q3, [x29, #-96]
84: 2d400500 ldp s0, s1, [x8]
88: bd400902 ldr s2, [x8, #8]
8c: 5296e2e8 mov w8, #0xb717 // #46871
90: 4f03f603 fmov v3.4s, #1.000000000000000000e+00
94: 72a71a28 movk w8, #0x38d1, lsl #16
98: 2a1f03fb mov w27, wzr
9c: 3d8017e3 str q3, [sp, #80]
a0: 4e040403 dup v3.4s, v0.s[0]
a4: 4e040d00 dup v0.4s, w8
a8: f9400273 ldr x19, [x19]
ac: ad018fe0 stp q0, q3, [sp, #48]
b0: 2f00e400 movi d0, #0x0
b4: 4e040423 dup v3.4s, v1.s[0]
b8: 3d801be0 str q0, [sp, #96]
bc: 2f00e400 movi d0, #0x0
c0: 3c9b03a0 stur q0, [x29, #-80]
c4: 4e040440 dup v0.4s, v2.s[0]
c8: f9400294 ldr x20, [x20]
cc: ad008fe0 stp q0, q3, [sp, #16]
d0: b9400388 ldr w8, [x28]
d4: 0b1b2109 add w9, w8, w27, lsl #8
d8: 3ce95aa0 ldr q0, [x21, w9, uxtw #4]
dc: 3ca87a60 str q0, [x19, x8, lsl #4]
e0: d63f0280 blr x20
e4: 6f00e400 movi v0.2d, #0x0
e8: 3cda03a4 ldur q4, [x29, #-96]
ec: 6f00e402 movi v2.2d, #0x0
f0: aa1f03e8 mov x8, xzr
f4: 6f00e403 movi v3.2d, #0x0
f8: ad41abeb ldp q11, q10, [sp, #48]
fc: 6e040480 mov v0.s[0], v4.s[0]
100: 6f00e401 movi v1.2d, #0x0
104: 6f00e405 movi v5.2d, #0x0
108: ad4293ff ldp q31, q4, [sp, #80]
10c: 6e040482 mov v2.s[0], v4.s[0]
110: 3cdb03a4 ldur q4, [x29, #-80]
114: ad40b3ed ldp q13, q12, [sp, #16]
118: 6e040483 mov v3.s[0], v4.s[0]
11c: 6f00e404 movi v4.2d, #0x0
120: 8b080269 add x9, x19, x8
124: 91020108 add x8, x8, #0x80
128: 4eab1d67 mov v7.16b, v11.16b
12c: f140051f cmp x8, #0x1, lsl #12
130: 4eab1d7b mov v27.16b, v11.16b
134: 4cdf0930 ld4 {v16.4s-v19.4s}, [x9], #64
138: 4eaad606 fsub v6.4s, v16.4s, v10.4s
13c: 4eacd638 fsub v24.4s, v17.4s, v12.4s
140: 4eadd659 fsub v25.4s, v18.4s, v13.4s
144: 4e26ccc7 fmla v7.4s, v6.4s, v6.4s
148: 4e38cf07 fmla v7.4s, v24.4s, v24.4s
14c: 4c400934 ld4 {v20.4s-v23.4s}, [x9]
150: 4e39cf27 fmla v7.4s, v25.4s, v25.4s
154: 6ea1f8e7 fsqrt v7.4s, v7.4s
158: 4eaad69a fsub v26.4s, v20.4s, v10.4s
15c: 4eacd6bc fsub v28.4s, v21.4s, v12.4s
160: 4eadd6dd fsub v29.4s, v22.4s, v13.4s
164: 4e3acf5b fmla v27.4s, v26.4s, v26.4s
168: 6e27ffe7 fdiv v7.4s, v31.4s, v7.4s
16c: 4e3ccf9b fmla v27.4s, v28.4s, v28.4s
170: 4e3dcfbb fmla v27.4s, v29.4s, v29.4s
174: 6ea1fb7b fsqrt v27.4s, v27.4s
178: 6e27dcfe fmul v30.4s, v7.4s, v7.4s
17c: 6e27de67 fmul v7.4s, v19.4s, v7.4s
180: 6e27dfc7 fmul v7.4s, v30.4s, v7.4s
184: 6e3bfffb fdiv v27.4s, v31.4s, v27.4s
188: 4e26cce3 fmla v3.4s, v7.4s, v6.4s
18c: 4e38cce2 fmla v2.4s, v7.4s, v24.4s
190: 4e39cce0 fmla v0.4s, v7.4s, v25.4s
194: 6e3bdf70 fmul v16.4s, v27.4s, v27.4s
198: 6e3bdef1 fmul v17.4s, v23.4s, v27.4s
19c: 6e31de10 fmul v16.4s, v16.4s, v17.4s
1a0: 4e3ace05 fmla v5.4s, v16.4s, v26.4s
1a4: 4e3cce04 fmla v4.4s, v16.4s, v28.4s
1a8: 4e3dce01 fmla v1.4s, v16.4s, v29.4s
1ac: 54fffba1 b.ne 120 <simplified_nbody+0x120> // b.any
1b0: 4e23d4a3 fadd v3.4s, v5.4s, v3.4s
1b4: 4e20d420 fadd v0.4s, v1.4s, v0.4s
1b8: 4e22d482 fadd v2.4s, v4.4s, v2.4s
1bc: 6e20d461 faddp v1.4s, v3.4s, v0.4s
1c0: 6e20d442 faddp v2.4s, v2.4s, v0.4s
1c4: 6e20d400 faddp v0.4s, v0.4s, v0.4s
1c8: 7e30d821 faddp s1, v1.2s
1cc: 7e30d800 faddp s0, v0.2s
1d0: ad3d07a0 stp q0, q1, [x29, #-96]
1d4: 7e30d841 faddp s1, v2.2s
1d8: 3d801be1 str q1, [sp, #96]
1dc: d63f0280 blr x20
1e0: 1104035a add w26, w26, #0x100
1e4: 1100077b add w27, w27, #0x1
1e8: 6b19035f cmp w26, w25
1ec: 54fff723 b.cc d0 <simplified_nbody+0xd0> // b.lo, b.ul, b.last
1f0: ad7d07a2 ldp q2, q1, [x29, #-96]
1f4: 3dc01be0 ldr q0, [sp, #96]
1f8: a94007e3 ldp x3, x1, [sp]
1fc: 6e0c0401 mov v1.s[1], v0.s[0]
200: 14000003 b 20c <simplified_nbody+0x20c>
204: 2f00e401 movi d1, #0x0
208: 2f00e402 movi d2, #0x0
20c: 5297cee8 mov w8, #0xbe77 // #48759
210: bd400060 ldr s0, [x3]
214: 72a7efe8 movk w8, #0x3f7f, lsl #16
218: 8b171029 add x9, x1, x23, lsl #4
21c: a9524ff4 ldp x20, x19, [sp, #288]
220: 0f801028 fmla v8.2s, v1.2s, v0.s[0]
224: 1f022402 fmadd s2, s0, s2, s9
228: 0e040d01 dup v1.2s, w8
22c: 1e270103 fmov s3, w8
230: fd400124 ldr d4, [x9]
234: a94f67fa ldp x26, x25, [sp, #240]
238: 1e230842 fmul s2, s2, s3
23c: 2e21dd01 fmul v1.2s, v8.2s, v1.2s
240: bd400923 ldr s3, [x9, #8]
244: a94e6ffc ldp x28, x27, [sp, #224]
248: bd000302 str s2, [x24]
24c: 0f801024 fmla v4.2s, v1.2s, v0.s[0]
250: 1f000c40 fmadd s0, s2, s0, s3
254: fd0002c1 str d1, [x22]
258: a95157f6 ldp x22, x21, [sp, #272]
25c: a9505ff8 ldp x24, x23, [sp, #256]
260: fd000124 str d4, [x9]
264: a94d7bfd ldp x29, x30, [sp, #208]
268: bd000920 str s0, [x9, #8]
26c: 6d4c23e9 ldp d9, d8, [sp, #192]
270: 6d4b2beb ldp d11, d10, [sp, #176]
274: 6d4a33ed ldp d13, d12, [sp, #160]
278: 9104c3ff add sp, sp, #0x130
27c: d65f03c0 ret
Metal / AIR
Note that the compiler would usually directly output a .metallib file. The output below comes from disassembling it with metallib-dis (provided by the toolchain).
; ModuleID = 'bc_module'
source_filename = "simplified_nbody"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.0.0"
%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }
@_ZZ16simplified_nbodyE20local_body_positions = internal addrspace(3) unnamed_addr global [256 x %class.vector4] undef, align 16
; Function Attrs: nounwind
define void @simplified_nbody(%class.vector4 addrspace(1)* noalias nocapture readonly %0, %class.vector4 addrspace(1)* noalias nocapture %1, %class.vector3 addrspace(1)* noalias nocapture %2, float addrspace(2)* noalias nocapture readonly align 4 dereferenceable(4) %3, <3 x i32> %4, <3 x i32> %5, <3 x i32> %6, <3 x i32> %7, <3 x i32> %8, <3 x i32> %9, i32 %10, i32 %11, i32 %12, i32 %13) local_unnamed_addr #0 !reqd_work_group_size !33 !kernel_dim !34 {
%15 = extractelement <3 x i32> %4, i64 0
%16 = zext i32 %15 to i64
%17 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 0
%18 = bitcast float addrspace(1)* %17 to <3 x float> addrspace(1)*
%19 = load <3 x float>, <3 x float> addrspace(1)* %18, align 4
%20 = extractelement <3 x float> %19, i64 0
%21 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 0
%22 = bitcast float addrspace(1)* %21 to <3 x float> addrspace(1)*
%23 = load <3 x float>, <3 x float> addrspace(1)* %22, align 4
%24 = extractelement <3 x i32> %5, i64 0
%25 = extractelement <3 x i32> %6, i64 0
%26 = zext i32 %25 to i64
%27 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %26, i32 0, i32 0, i32 0
%28 = bitcast float addrspace(3)* %27 to <4 x float> addrspace(3)*
%29 = shufflevector <3 x float> %19, <3 x float> undef, <2 x i32> <i32 1, i32 2>
br label %57
30: ; preds = %68
%31 = extractelement <3 x float> %23, i64 0
%32 = load float, float addrspace(2)* %3, align 4
%33 = fmul fast float %32, %100
%34 = insertelement <2 x float> undef, float %32, i64 0
%35 = shufflevector <2 x float> %34, <2 x float> undef, <2 x i32> zeroinitializer
%36 = fmul fast <2 x float> %35, %101
%37 = fadd fast float %33, %31
%38 = shufflevector <3 x float> %23, <3 x float> undef, <2 x i32> <i32 1, i32 2>
%39 = fadd fast <2 x float> %36, %38
%40 = fmul fast float %37, 0x3FEFF7CEE0000000
%41 = fmul fast <2 x float> %39, <float 0x3FEFF7CEE0000000, float 0x3FEFF7CEE0000000>
%42 = fmul fast float %40, %32
%43 = fmul fast <2 x float> %41, %35
%44 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 0
%45 = bitcast float addrspace(1)* %44 to <3 x float> addrspace(1)*
%46 = load <3 x float>, <3 x float> addrspace(1)* %45, align 4, !tbaa !35
%47 = extractelement <3 x float> %46, i64 0
%48 = fadd fast float %42, %47
%49 = shufflevector <3 x float> %46, <3 x float> undef, <2 x i32> <i32 1, i32 2>
%50 = fadd fast <2 x float> %43, %49
%51 = insertelement <3 x float> undef, float %48, i64 0
%52 = shufflevector <2 x float> %50, <2 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
%53 = shufflevector <3 x float> %51, <3 x float> %52, <3 x i32> <i32 0, i32 3, i32 4>
store <3 x float> %53, <3 x float> addrspace(1)* %45, align 4, !tbaa !35
%54 = insertelement <3 x float> undef, float %40, i64 0
%55 = shufflevector <2 x float> %41, <2 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
%56 = shufflevector <3 x float> %54, <3 x float> %55, <3 x i32> <i32 0, i32 3, i32 4>
store <3 x float> %56, <3 x float> addrspace(1)* %22, align 4, !tbaa !35
ret void
57: ; preds = %68, %14
%58 = phi i32 [ 0, %14 ], [ %69, %68 ]
%59 = phi i32 [ 0, %14 ], [ %70, %68 ]
%60 = phi float [ 0.000000e+00, %14 ], [ %100, %68 ]
%61 = phi <2 x float> [ zeroinitializer, %14 ], [ %101, %68 ]
%62 = shl i32 %59, 8
%63 = add i32 %25, %62
%64 = zext i32 %63 to i64
%65 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %64, i32 0, i32 0, i32 0
%66 = bitcast float addrspace(1)* %65 to <4 x float> addrspace(1)*
%67 = load <4 x float>, <4 x float> addrspace(1)* %66, align 4
store <4 x float> %67, <4 x float> addrspace(3)* %28, align 4, !tbaa !35
tail call void @air.wg.barrier(i32 2, i32 1) #3
br label %72
68: ; preds = %72
tail call void @air.wg.barrier(i32 2, i32 1) #3
%69 = add i32 %58, 256
%70 = add i32 %59, 1
%71 = icmp ult i32 %69, %24
br i1 %71, label %57, label %30, !llvm.loop !38
72: ; preds = %72, %57
%73 = phi i32 [ 0, %57 ], [ %102, %72 ]
%74 = phi float [ %60, %57 ], [ %100, %72 ]
%75 = phi <2 x float> [ %61, %57 ], [ %101, %72 ]
%76 = zext i32 %73 to i64
%77 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %76, i32 0, i32 0, i32 0
%78 = bitcast float addrspace(3)* %77 to <4 x float> addrspace(3)*
%79 = load <4 x float>, <4 x float> addrspace(3)* %78, align 4
%80 = extractelement <4 x float> %79, i64 0
%81 = extractelement <4 x float> %79, i64 3
%82 = fsub fast float %80, %20
%83 = shufflevector <4 x float> %79, <4 x float> undef, <2 x i32> <i32 1, i32 2>
%84 = fsub fast <2 x float> %83, %29
%85 = fmul fast float %82, %82
%86 = fmul fast <2 x float> %84, %84
%87 = extractelement <2 x float> %86, i64 0
%88 = extractelement <2 x float> %86, i64 1
%89 = fadd fast float %85, 0x3F1A36E2E0000000
%90 = fadd fast float %89, %87
%91 = fadd fast float %90, %88
%92 = tail call fast float @air.fast_rsqrt.f32(float %91) #4
%93 = fmul fast float %92, %92
%94 = fmul fast float %93, %92
%95 = fmul fast float %94, %81
%96 = fmul fast float %95, %82
%97 = insertelement <2 x float> undef, float %95, i64 0
%98 = shufflevector <2 x float> %97, <2 x float> undef, <2 x i32> zeroinitializer
%99 = fmul fast <2 x float> %98, %84
%100 = fadd fast float %96, %74
%101 = fadd fast <2 x float> %99, %75
%102 = add nuw nsw i32 %73, 1
%103 = icmp eq i32 %102, 256
br i1 %103, label %68, label %72, !llvm.loop !40
}
; Function Attrs: nounwind readnone
declare float @air.fast_rsqrt.f32(float) local_unnamed_addr #1
; Function Attrs: convergent noduplicate
declare void @air.wg.barrier(i32, i32) local_unnamed_addr #2
attributes #0 = { nounwind "approx-func-fp-math"="true" "frame-pointer"="all" "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="true" }
attributes #1 = { nounwind readnone "approx-func-fp-math"="true" "frame-pointer"="all" "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" }
attributes #2 = { convergent noduplicate "approx-func-fp-math"="true" "frame-pointer"="all" "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" }
attributes #3 = { convergent noduplicate nounwind }
attributes #4 = { nounwind readnone }
!air.kernel = !{!0}
!air.version = !{!18}
!air.language_version = !{!19}
!air.compile_options = !{!20, !21, !22}
!llvm.module.flags = !{!23, !24, !25, !26, !27, !28, !29, !30, !31}
!llvm.ident = !{!32}
!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float addrspace(2)*, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, i32, i32, i32, i32)* @simplified_nbody, !1, !2, !17}
!1 = !{}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16}
!3 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"float4", !"air.arg_name", !"in_positions"}
!4 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"float4", !"air.arg_name", !"out_positions"}
!5 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 12, !"air.arg_type_align_size", i32 12, !"air.arg_type_name", !"float3", !"air.arg_name", !"inout_velocities"}
!6 = !{i32 3, !"air.buffer", !"air.buffer_size", i32 4, !"air.location_index", i32 3, i32 1, !"air.read", !"air.address_space", i32 2, !"air.arg_type_size", i32 4, !"air.arg_type_align_size", i32 4, !"air.arg_type_name", !"float", !"air.arg_name", !"time_delta"}
!7 = !{i32 4, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_id__"}
!8 = !{i32 5, !"air.threads_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_size__"}
!9 = !{i32 6, !"air.thread_position_in_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_id__"}
!10 = !{i32 7, !"air.threads_per_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_size__"}
!11 = !{i32 8, !"air.threadgroup_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_id__"}
!12 = !{i32 9, !"air.threadgroups_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_size__"}
!13 = !{i32 10, !"air.simdgroup_index_in_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_id__"}
!14 = !{i32 11, !"air.thread_index_in_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_local_id__"}
!15 = !{i32 12, !"air.threads_per_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_size__"}
!16 = !{i32 13, !"air.simdgroups_per_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__num_sub_groups__"}
!17 = !{!"air.max_work_group_size", i32 256}
!18 = !{i32 2, i32 6, i32 0}
!19 = !{!"Metal", i32 3, i32 1, i32 0}
!20 = !{!"air.compile.denorms_disable"}
!21 = !{!"air.compile.fast_math_enable"}
!22 = !{!"air.compile.framebuffer_fetch_enable"}
!23 = !{i32 7, !"air.max_device_buffers", i32 31}
!24 = !{i32 7, !"air.max_constant_buffers", i32 31}
!25 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!26 = !{i32 7, !"air.max_textures", i32 128}
!27 = !{i32 7, !"air.max_read_write_textures", i32 8}
!28 = !{i32 7, !"air.max_samplers", i32 16}
!29 = !{i32 1, !"wchar_size", i32 4}
!30 = !{i32 7, !"frame-pointer", i32 2}
!31 = !{i32 2, !"SDK Version", [2 x i32] [i32 14, i32 0]}
!32 = !{!"Apple metal version 32023.155 (metalfe-32023.155)"}
!33 = !{i32 256, i32 1, i32 1}
!34 = !{i32 1}
!35 = !{!36, !36, i64 0}
!36 = !{!"omnipotent char", !37, i64 0}
!37 = !{!"Simple C++ TBAA"}
!38 = distinct !{!38, !39}
!39 = !{!"llvm.loop.mustprogress"}
!40 = distinct !{!40, !39}
OpenCL / SPIR
Note that the compiler would usually directly output a .bc file. The output below comes from disassembling it with llvm-dis
(provided by the toolchain). Also note that the bitcode file is exported in an LLVM 3.2 / SPIR 1.2 compatible format, but the output below uses LLVM 14.0 syntax.
; ModuleID = 'spir.bc'
source_filename = "spir.bc"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
target triple = "spir64-unknown-unknown"
%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }
@simplified_nbody.local_body_positions = internal unnamed_addr addrspace(3) global [256 x %class.vector4] undef, align 4
define floor_kernel void @simplified_nbody(%class.vector4 addrspace(1)* %0, %class.vector4 addrspace(1)* %1, %class.vector3 addrspace(1)* %2, float %3) {
%5 = tail call floor_func i64 @_Z13get_global_idj(i32 0), !range !14
%6 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 0
%7 = load float, float addrspace(1)* %6, align 4
%8 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 1
%9 = load float, float addrspace(1)* %8, align 4
%10 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 2
%11 = load float, float addrspace(1)* %10, align 4
%12 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 0
%13 = load float, float addrspace(1)* %12, align 4
%14 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 1
%15 = load float, float addrspace(1)* %14, align 4
%16 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 2
%17 = load float, float addrspace(1)* %16, align 4
%18 = tail call floor_func i64 @_Z15get_global_sizej(i32 0), !range !15
%19 = trunc i64 %18 to i32, !range !16
%20 = tail call floor_func i64 @_Z12get_local_idj(i32 0), !range !17
%21 = trunc i64 %20 to i32, !range !18
%22 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 0
%23 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 1
%24 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 2
%25 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 3
br label %48
26: ; preds = %65
%27 = fmul float %98, %3
%28 = fmul float %99, %3
%29 = fmul float %100, %3
%30 = fadd float %27, %13
%31 = fadd float %28, %15
%32 = fadd float %29, %17
%33 = fmul float %30, 0x3FEFF7CEE0000000
%34 = fmul float %31, 0x3FEFF7CEE0000000
%35 = fmul float %32, 0x3FEFF7CEE0000000
%36 = fmul float %33, %3
%37 = fmul float %34, %3
%38 = fmul float %35, %3
%39 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 0
%40 = load float, float addrspace(1)* %39, align 4, !tbaa !19
%41 = fadd float %40, %36
store float %41, float addrspace(1)* %39, align 4, !tbaa !19
%42 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 1
%43 = load float, float addrspace(1)* %42, align 4, !tbaa !19
%44 = fadd float %43, %37
store float %44, float addrspace(1)* %42, align 4, !tbaa !19
%45 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 2
%46 = load float, float addrspace(1)* %45, align 4, !tbaa !19
%47 = fadd float %46, %38
store float %47, float addrspace(1)* %45, align 4, !tbaa !19
store float %33, float addrspace(1)* %12, align 4, !tbaa !19
store float %34, float addrspace(1)* %14, align 4, !tbaa !19
store float %35, float addrspace(1)* %16, align 4, !tbaa !19
ret void
48: ; preds = %65, %4
%49 = phi i32 [ 0, %4 ], [ %66, %65 ]
%50 = phi i32 [ 0, %4 ], [ %67, %65 ]
%51 = phi float [ 0.000000e+00, %4 ], [ %100, %65 ]
%52 = phi float [ 0.000000e+00, %4 ], [ %99, %65 ]
%53 = phi float [ 0.000000e+00, %4 ], [ %98, %65 ]
%54 = shl i32 %50, 8
%55 = add i32 %54, %21
%56 = zext i32 %55 to i64
%57 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 0
%58 = load float, float addrspace(1)* %57, align 4
%59 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 1
%60 = load float, float addrspace(1)* %59, align 4
%61 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 2
%62 = load float, float addrspace(1)* %61, align 4
%63 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 3
%64 = load float, float addrspace(1)* %63, align 4
store float %58, float addrspace(3)* %22, align 4, !tbaa !19
store float %60, float addrspace(3)* %23, align 4, !tbaa !19
store float %62, float addrspace(3)* %24, align 4, !tbaa !19
store float %64, float addrspace(3)* %25, align 4, !tbaa !19
tail call floor_func void @_Z7barrierj(i32 1)
br label %69
65: ; preds = %69
tail call floor_func void @_Z7barrierj(i32 1)
%66 = add i32 %49, 256
%67 = add i32 %50, 1
%68 = icmp ult i32 %66, %19
br i1 %68, label %48, label %26, !llvm.loop !22
69: ; preds = %69, %48
%70 = phi i64 [ 0, %48 ], [ %101, %69 ]
%71 = phi float [ %51, %48 ], [ %100, %69 ]
%72 = phi float [ %52, %48 ], [ %99, %69 ]
%73 = phi float [ %53, %48 ], [ %98, %69 ]
%74 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 0
%75 = load float, float addrspace(3)* %74, align 4
%76 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 1
%77 = load float, float addrspace(3)* %76, align 4
%78 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 2
%79 = load float, float addrspace(3)* %78, align 4
%80 = fsub float %75, %7
%81 = fsub float %77, %9
%82 = fsub float %79, %11
%83 = fmul float %80, %80
%84 = fmul float %81, %81
%85 = fmul float %82, %82
%86 = fadd float %83, 0x3F1A36E2E0000000
%87 = fadd float %86, %84
%88 = fadd float %87, %85
%89 = tail call floor_func float @_Z5rsqrtf(float %88)
%90 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 3
%91 = load float, float addrspace(3)* %90, align 4, !tbaa !19
%92 = fmul float %89, %89
%93 = fmul float %92, %89
%94 = fmul float %93, %91
%95 = fmul float %94, %80
%96 = fmul float %94, %81
%97 = fmul float %94, %82
%98 = fadd float %95, %73
%99 = fadd float %96, %72
%100 = fadd float %97, %71
%101 = add nuw nsw i64 %70, 1
%102 = icmp eq i64 %101, 256
br i1 %102, label %65, label %69, !llvm.loop !24
}
declare floor_func i64 @_Z13get_global_idj(i32)
declare floor_func i64 @_Z15get_global_sizej(i32)
declare floor_func i64 @_Z12get_local_idj(i32)
declare floor_func float @_Z5rsqrtf(float)
declare floor_func void @_Z7barrierj(i32)
!opencl.kernels = !{!0}
!llvm.linker.options = !{}
!llvm.module.flags = !{!7, !8}
!opencl.ocl.version = !{!9}
!opencl.spir.version = !{!9}
!opencl.enable.FP_CONTRACT = !{}
!opencl.used.extensions = !{!10}
!opencl.used.optional.core.features = !{!11}
!opencl.compiler.options = !{!12}
!llvm.ident = !{!13}
!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float)* @simplified_nbody, !1, !2, !3, !4, !5, !6}
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0}
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"compute_global_buffer<const float4>", !"compute_global_buffer<float4>", !"compute_global_buffer<float3>", !"param<float>"}
!4 = !{!"kernel_arg_base_type", !"struct __class vector4<float>*", !"struct __class vector4<float>*", !"struct __class vector3<float>*", !"float"}
!5 = !{!"kernel_arg_type_qual", !"restrict const", !"restrict", !"restrict", !"const"}
!6 = !{!"kernel_arg_name", !"in_positions", !"out_positions", !"inout_velocities", !"time_delta"}
!7 = !{i32 1, !"wchar_size", i32 4}
!8 = !{i32 7, !"frame-pointer", i32 2}
!9 = !{i32 1, i32 2}
!10 = !{!"cl_khr_byte_addressable_store", !"cl_khr_global_int32_base_atomics", !"cl_khr_global_int32_extended_atomics", !"cl_khr_local_int32_base_atomics", !"cl_khr_local_int32_extended_atomics", !"cl_khr_fp64", !"cl_khr_fp16", !"cl_khr_gl_msaa_sharing"}
!11 = !{!"cl_doubles"}
!12 = !{!"-cl-kernel-arg-info", !"-cl-mad-enable", !"-cl-denorms-are-zero", !"-cl-unsafe-math-optimizations"}
!13 = !{!"clang version 14.0.6 (https://github.com/a2flo/floor_llvm.git 85a83a4073c340ac03ca1c8fcd131db30339db24)"}
!14 = !{i64 0, i64 4294967295}
!15 = !{i64 1, i64 4294967295}
!16 = !{i32 1, i32 -1}
!17 = !{i64 0, i64 2048}
!18 = !{i32 0, i32 2048}
!19 = !{!20, !20, i64 0}
!20 = !{!"omnipotent char", !21, i64 0}
!21 = !{!"Simple C++ TBAA"}
!22 = distinct !{!22, !23}
!23 = !{!"llvm.loop.mustprogress"}
!24 = distinct !{!24, !23}
OpenCL / SPIR-V
Note that the compiler would usually directly output a .spv file. The output below comes from disassembling it with spirv-dis
(provided by the toolchain).
Also note that the output below has been generated with extended readability (--debug-asm).
; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 153
; Schema: 0
Capability Addresses
Capability Linkage
Capability Kernel
Capability Int64
%1 = ExtInstImport "OpenCL.std"
MemoryModel Physical64 OpenCL
EntryPoint Kernel %simplified_nbody "simplified_nbody" %__spirv_BuiltInGlobalInvocationId %__spirv_BuiltInGlobalSize %__spirv_BuiltInLocalInvocationId
ExecutionMode %simplified_nbody LocalSize 256 1 1
SourceExtension "cl_khr_byte_addressable_store"
SourceExtension "cl_khr_fp16"
SourceExtension "cl_khr_fp64"
SourceExtension "cl_khr_gl_msaa_sharing"
SourceExtension "cl_khr_global_int32_base_atomics"
SourceExtension "cl_khr_global_int32_extended_atomics"
SourceExtension "cl_khr_local_int32_base_atomics"
SourceExtension "cl_khr_local_int32_extended_atomics"
Source OpenCL_C 102000
Decorate %simplified_nbody.local_body_positions Alignment 4
Decorate %19 FuncParamAttr NoAlias
Decorate %19 FuncParamAttr NoCapture
Decorate %19 FuncParamAttr NoWrite
Decorate %20 FuncParamAttr NoAlias
Decorate %20 FuncParamAttr NoCapture
Decorate %21 FuncParamAttr NoAlias
Decorate %21 FuncParamAttr NoCapture
Decorate %__spirv_BuiltInGlobalInvocationId LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
Decorate %__spirv_BuiltInGlobalInvocationId Constant
Decorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
Decorate %__spirv_BuiltInGlobalSize LinkageAttributes "__spirv_BuiltInGlobalSize" Import
Decorate %__spirv_BuiltInGlobalSize Constant
Decorate %__spirv_BuiltInGlobalSize BuiltIn GlobalSize
Decorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
Decorate %__spirv_BuiltInLocalInvocationId Constant
Decorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
Decorate %70 FPFastMathMode Fast
Decorate %72 FPFastMathMode Fast
Decorate %74 FPFastMathMode Fast
Decorate %101 FPFastMathMode Fast
Decorate %102 FPFastMathMode Fast
Decorate %103 FPFastMathMode Fast
Decorate %104 FPFastMathMode Fast
Decorate %105 FPFastMathMode Fast
Decorate %106 FPFastMathMode Fast
Decorate %108 FPFastMathMode Fast
Decorate %109 FPFastMathMode Fast
Decorate %110 FPFastMathMode Fast
Decorate %114 FPFastMathMode Fast
Decorate %115 FPFastMathMode Fast
Decorate %116 FPFastMathMode Fast
Decorate %117 FPFastMathMode Fast
Decorate %118 FPFastMathMode Fast
Decorate %119 FPFastMathMode Fast
Decorate %131 FPFastMathMode Fast
Decorate %132 FPFastMathMode Fast
Decorate %133 FPFastMathMode Fast
Decorate %134 FPFastMathMode Fast
Decorate %135 FPFastMathMode Fast
Decorate %136 FPFastMathMode Fast
Decorate %138 FPFastMathMode Fast
Decorate %139 FPFastMathMode Fast
Decorate %140 FPFastMathMode Fast
Decorate %141 FPFastMathMode Fast
Decorate %142 FPFastMathMode Fast
Decorate %143 FPFastMathMode Fast
Decorate %146 FPFastMathMode Fast
Decorate %149 FPFastMathMode Fast
Decorate %152 FPFastMathMode Fast
%ulong = TypeInt 64 0
%uint = TypeInt 32 0
%256ul = Constant %ulong 256
%0u = Constant %uint 0
%1u = Constant %uint 1
%2u = Constant %uint 2
%0ul = Constant %ulong 0
%3u = Constant %uint 3
%8u = Constant %uint 8
%272u = Constant %uint 272
%0ul_0 = Constant %ulong 0
%1ul = Constant %ulong 1
%256u = Constant %uint 256
%float = TypeFloat 32
%struct.anon = TypeStruct %float %float %float %float
%union.anon = TypeStruct %struct.anon
%class.vector4 = TypeStruct %union.anon
%class.vector4[256ul] = TypeArray %class.vector4 %256ul
%(Workgroup)class.vector4[256ul]* = TypePointer Workgroup %class.vector4[256ul]
%void = TypeVoid
%(CrossWorkgroup)class.vector4* = TypePointer CrossWorkgroup %class.vector4
%struct.anon.9 = TypeStruct %float %float %float
%union.anon.8 = TypeStruct %struct.anon.9
%class.vector3 = TypeStruct %union.anon.8
%(CrossWorkgroup)class.vector3* = TypePointer CrossWorkgroup %class.vector3
%void(#4) = TypeFunction %void %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector3* %float
%<3xulong> = TypeVector %ulong 3
%(Input)<3xulong>* = TypePointer Input %<3xulong>
%(CrossWorkgroup)float* = TypePointer CrossWorkgroup %float
%(Workgroup)float* = TypePointer Workgroup %float
%bool = TypeBool
%simplified_nbody.local_body_positions = Variable %(Workgroup)class.vector4[256ul]* Workgroup
%__spirv_BuiltInGlobalInvocationId = Variable %(Input)<3xulong>* Input
%__spirv_BuiltInGlobalSize = Variable %(Input)<3xulong>* Input
%__spirv_BuiltInLocalInvocationId = Variable %(Input)<3xulong>* Input
%0.0f = Constant %float 0
%9.99999975e-05f = Constant %float 9.99999975e-05
%0.999000013f = Constant %float 0.999000013
function void simplified_nbody ( %void(#4) ) {
%19 = FunctionParameter %(CrossWorkgroup)class.vector4*
%20 = FunctionParameter %(CrossWorkgroup)class.vector4*
%21 = FunctionParameter %(CrossWorkgroup)class.vector3*
%22 = FunctionParameter %float
23:
%31 = Load %<3xulong> %__spirv_BuiltInGlobalInvocationId Aligned 32
%32 = CompositeExtract %ulong %31 0
%36 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %32 %0u %0u %0u
%37 = Load %float %36 Aligned 4
%39 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %32 %0u %0u %1u
%40 = Load %float %39 Aligned 4
%42 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %32 %0u %0u %2u
%43 = Load %float %42 Aligned 4
%44 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %21 %32 %0u %0u %0u
%45 = Load %float %44 Aligned 4
%46 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %21 %32 %0u %0u %1u
%47 = Load %float %46 Aligned 4
%48 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %21 %32 %0u %0u %2u
%49 = Load %float %48 Aligned 4
%51 = Load %<3xulong> %__spirv_BuiltInGlobalSize Aligned 32
%52 = CompositeExtract %ulong %51 0
%53 = UConvert %uint %52
%55 = Load %<3xulong> %__spirv_BuiltInLocalInvocationId Aligned 32
%56 = CompositeExtract %ulong %55 0
%57 = UConvert %uint %56
%60 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %0u
%61 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %1u
%62 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %2u
%64 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %3u
Branch %24
24:
%66 = Phi %uint ( %65 <- %26, %0u <- %23 )
%68 = Phi %uint ( %67 <- %26, %0u <- %23 )
%71 = Phi %float ( %0.0f <- %23, %70 <- %26 )
%73 = Phi %float ( %0.0f <- %23, %72 <- %26 )
%75 = Phi %float ( %0.0f <- %23, %74 <- %26 )
%77 = ShiftLeftLogical %uint %68 %8u
%78 = IAdd %uint %77 %57
%79 = UConvert %ulong %78
%80 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %0u
%81 = Load %float %80 Aligned 4
%82 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %1u
%83 = Load %float %82 Aligned 4
%84 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %2u
%85 = Load %float %84 Aligned 4
%86 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %3u
%87 = Load %float %86 Aligned 4
Store %60 %81 Aligned 4
Store %61 %83 Aligned 4
Store %62 %85 Aligned 4
Store %64 %87 Aligned 4
ControlBarrier %2u %2u %272u
Branch %25
25:
%91 = Phi %ulong ( %89 <- %25, %0ul_0 <- %24 )
%92 = Phi %float ( %71 <- %24, %70 <- %25 )
%93 = Phi %float ( %73 <- %24, %72 <- %25 )
%94 = Phi %float ( %75 <- %24, %74 <- %25 )
%95 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %0u
%96 = Load %float %95 Aligned 4
%97 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %1u
%98 = Load %float %97 Aligned 4
%99 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %2u
%100 = Load %float %99 Aligned 4
%101 = FSub %float %96 %37
%102 = FSub %float %98 %40
%103 = FSub %float %100 %43
%104 = FMul %float %101 %101
%105 = FMul %float %102 %102
%106 = FMul %float %103 %103
%108 = FAdd %float %104 %9.99999975e-05f
%109 = FAdd %float %108 %105
%110 = FAdd %float %109 %106
%111 = ExtInst %float %1 rsqrt %110
%112 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %3u
%113 = Load %float %112 Aligned 4
%114 = FMul %float %111 %111
%115 = FMul %float %114 %111
%116 = FMul %float %115 %113
%117 = FMul %float %116 %101
%118 = FMul %float %116 %102
%119 = FMul %float %116 %103
%74 = FAdd %float %117 %94
%72 = FAdd %float %118 %93
%70 = FAdd %float %119 %92
%89 = IAdd %ulong %91 %1ul
%126 = IEqual %bool %89 %256ul
BranchConditional %126 %26 %25
26:
ControlBarrier %2u %2u %272u
%65 = IAdd %uint %66 %256u
%67 = IAdd %uint %68 %1u
%130 = ULessThan %bool %65 %53
BranchConditional %130 %24 %27
27:
%131 = FMul %float %74 %22
%132 = FMul %float %72 %22
%133 = FMul %float %70 %22
%134 = FAdd %float %131 %45
%135 = FAdd %float %132 %47
%136 = FAdd %float %133 %49
%138 = FMul %float %134 %0.999000013f
%139 = FMul %float %135 %0.999000013f
%140 = FMul %float %136 %0.999000013f
%141 = FMul %float %138 %22
%142 = FMul %float %139 %22
%143 = FMul %float %140 %22
%144 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %20 %32 %0u %0u %0u
%145 = Load %float %144 Aligned 4
%146 = FAdd %float %145 %141
Store %144 %146 Aligned 4
%147 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %20 %32 %0u %0u %1u
%148 = Load %float %147 Aligned 4
%149 = FAdd %float %148 %142
Store %147 %149 Aligned 4
%150 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %20 %32 %0u %0u %2u
%151 = Load %float %150 Aligned 4
%152 = FAdd %float %151 %143
Store %150 %152 Aligned 4
Store %44 %138 Aligned 4
Store %46 %139 Aligned 4
Store %48 %140 Aligned 4
Return
}
Vulkan / SPIR-V
Note that the compiler would usually directly output a .spvc file (a simple container format for multiple SPIR-V binaries). The output below comes from disassembling it with spirv-dis
(provided by the toolchain).
Also note that the output below has been generated with extended readability (--debug-asm).
; SPIR-V
; Version: 1.6
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 210
; Schema: 0
Capability Matrix
Capability Shader
Capability Int64
Capability GroupNonUniform
Capability VariablePointersStorageBuffer
Capability VariablePointers
Capability ShaderNonUniform
Capability UniformBufferArrayNonUniformIndexing
Capability SampledImageArrayNonUniformIndexing
Capability StorageBufferArrayNonUniformIndexing
Capability StorageImageArrayNonUniformIndexing
Capability VulkanMemoryModel
Capability VulkanMemoryModelDeviceScope
Capability PhysicalStorageBufferAddresses
%1 = ExtInstImport "GLSL.std.450"
MemoryModel PhysicalStorageBuffer64 Vulkan
EntryPoint GLCompute %simplified_nbody "simplified_nbody" %simplified_nbody.vulkan_uniform. %simplified_nbody.vulkan_uniform..1 %simplified_nbody.vulkan_uniform..2 %simplified_nbody.vulkan_uniform..3 %simplified_nbody.vulkan_builtin_input. %simplified_nbody.vulkan_builtin_input..4 %simplified_nbody.vulkan_builtin_input..5 %simplified_nbody.vulkan_builtin_input..6 %simplified_nbody.vulkan_builtin_input..7 %simplified_nbody.vulkan_builtin_input..8 %vulkan.immutable_sampler_0 %vulkan.immutable_sampler_1 %vulkan.immutable_sampler_2 %vulkan.immutable_sampler_3 %vulkan.immutable_sampler_4 %vulkan.immutable_sampler_5 %vulkan.immutable_sampler_6 %vulkan.immutable_sampler_7 %vulkan.immutable_sampler_8 %vulkan.immutable_sampler_9 %vulkan.immutable_sampler_10 %vulkan.immutable_sampler_11 %vulkan.immutable_sampler_12 %vulkan.immutable_sampler_13 %vulkan.immutable_sampler_14 %vulkan.immutable_sampler_15 %vulkan.immutable_sampler_16 %vulkan.immutable_sampler_17 %vulkan.immutable_sampler_18 %vulkan.immutable_sampler_19 %vulkan.immutable_sampler_20 %vulkan.immutable_sampler_21 %vulkan.immutable_sampler_22 %vulkan.immutable_sampler_23 %vulkan.immutable_sampler_24 %vulkan.immutable_sampler_25 %vulkan.immutable_sampler_26 %vulkan.immutable_sampler_27 %vulkan.immutable_sampler_28 %vulkan.immutable_sampler_29 %vulkan.immutable_sampler_30 %vulkan.immutable_sampler_31 %vulkan.immutable_sampler_32 %vulkan.immutable_sampler_33 %vulkan.immutable_sampler_34 %vulkan.immutable_sampler_35 %vulkan.immutable_sampler_36 %vulkan.immutable_sampler_37 %vulkan.immutable_sampler_38 %vulkan.immutable_sampler_39 %vulkan.immutable_sampler_40 %vulkan.immutable_sampler_41 %vulkan.immutable_sampler_42 %vulkan.immutable_sampler_43 %vulkan.immutable_sampler_44 %vulkan.immutable_sampler_45 %vulkan.immutable_sampler_46 %vulkan.immutable_sampler_47 %_ZZ16simplified_nbodyE20local_body_positions
ExecutionMode %simplified_nbody LocalSize 256 1 1
SourceExtension "vk_capability_int16"
SourceExtension "vk_capability_int64"
SourceExtension "vk_capability_multiview"
Source GLSL 450
Decorate %vulkan.immutable_sampler_0 DescriptorSet 0
Decorate %vulkan.immutable_sampler_0 Binding 0
Decorate %vulkan.immutable_sampler_1 DescriptorSet 0
Decorate %vulkan.immutable_sampler_1 Binding 1
Decorate %vulkan.immutable_sampler_2 DescriptorSet 0
Decorate %vulkan.immutable_sampler_2 Binding 2
Decorate %vulkan.immutable_sampler_3 DescriptorSet 0
Decorate %vulkan.immutable_sampler_3 Binding 3
Decorate %vulkan.immutable_sampler_4 DescriptorSet 0
Decorate %vulkan.immutable_sampler_4 Binding 4
Decorate %vulkan.immutable_sampler_5 DescriptorSet 0
Decorate %vulkan.immutable_sampler_5 Binding 5
Decorate %vulkan.immutable_sampler_6 DescriptorSet 0
Decorate %vulkan.immutable_sampler_6 Binding 6
Decorate %vulkan.immutable_sampler_7 DescriptorSet 0
Decorate %vulkan.immutable_sampler_7 Binding 7
Decorate %vulkan.immutable_sampler_8 DescriptorSet 0
Decorate %vulkan.immutable_sampler_8 Binding 8
Decorate %vulkan.immutable_sampler_9 DescriptorSet 0
Decorate %vulkan.immutable_sampler_9 Binding 9
Decorate %vulkan.immutable_sampler_10 DescriptorSet 0
Decorate %vulkan.immutable_sampler_10 Binding 10
Decorate %vulkan.immutable_sampler_11 DescriptorSet 0
Decorate %vulkan.immutable_sampler_11 Binding 11
Decorate %vulkan.immutable_sampler_12 DescriptorSet 0
Decorate %vulkan.immutable_sampler_12 Binding 12
Decorate %vulkan.immutable_sampler_13 DescriptorSet 0
Decorate %vulkan.immutable_sampler_13 Binding 13
Decorate %vulkan.immutable_sampler_14 DescriptorSet 0
Decorate %vulkan.immutable_sampler_14 Binding 14
Decorate %vulkan.immutable_sampler_15 DescriptorSet 0
Decorate %vulkan.immutable_sampler_15 Binding 15
Decorate %vulkan.immutable_sampler_16 DescriptorSet 0
Decorate %vulkan.immutable_sampler_16 Binding 16
Decorate %vulkan.immutable_sampler_17 DescriptorSet 0
Decorate %vulkan.immutable_sampler_17 Binding 17
Decorate %vulkan.immutable_sampler_18 DescriptorSet 0
Decorate %vulkan.immutable_sampler_18 Binding 18
Decorate %vulkan.immutable_sampler_19 DescriptorSet 0
Decorate %vulkan.immutable_sampler_19 Binding 19
Decorate %vulkan.immutable_sampler_20 DescriptorSet 0
Decorate %vulkan.immutable_sampler_20 Binding 20
Decorate %vulkan.immutable_sampler_21 DescriptorSet 0
Decorate %vulkan.immutable_sampler_21 Binding 21
Decorate %vulkan.immutable_sampler_22 DescriptorSet 0
Decorate %vulkan.immutable_sampler_22 Binding 22
Decorate %vulkan.immutable_sampler_23 DescriptorSet 0
Decorate %vulkan.immutable_sampler_23 Binding 23
Decorate %vulkan.immutable_sampler_24 DescriptorSet 0
Decorate %vulkan.immutable_sampler_24 Binding 24
Decorate %vulkan.immutable_sampler_25 DescriptorSet 0
Decorate %vulkan.immutable_sampler_25 Binding 25
Decorate %vulkan.immutable_sampler_26 DescriptorSet 0
Decorate %vulkan.immutable_sampler_26 Binding 26
Decorate %vulkan.immutable_sampler_27 DescriptorSet 0
Decorate %vulkan.immutable_sampler_27 Binding 27
Decorate %vulkan.immutable_sampler_28 DescriptorSet 0
Decorate %vulkan.immutable_sampler_28 Binding 28
Decorate %vulkan.immutable_sampler_29 DescriptorSet 0
Decorate %vulkan.immutable_sampler_29 Binding 29
Decorate %vulkan.immutable_sampler_30 DescriptorSet 0
Decorate %vulkan.immutable_sampler_30 Binding 30
Decorate %vulkan.immutable_sampler_31 DescriptorSet 0
Decorate %vulkan.immutable_sampler_31 Binding 31
Decorate %vulkan.immutable_sampler_32 DescriptorSet 0
Decorate %vulkan.immutable_sampler_32 Binding 32
Decorate %vulkan.immutable_sampler_33 DescriptorSet 0
Decorate %vulkan.immutable_sampler_33 Binding 33
Decorate %vulkan.immutable_sampler_34 DescriptorSet 0
Decorate %vulkan.immutable_sampler_34 Binding 34
Decorate %vulkan.immutable_sampler_35 DescriptorSet 0
Decorate %vulkan.immutable_sampler_35 Binding 35
Decorate %vulkan.immutable_sampler_36 DescriptorSet 0
Decorate %vulkan.immutable_sampler_36 Binding 36
Decorate %vulkan.immutable_sampler_37 DescriptorSet 0
Decorate %vulkan.immutable_sampler_37 Binding 37
Decorate %vulkan.immutable_sampler_38 DescriptorSet 0
Decorate %vulkan.immutable_sampler_38 Binding 38
Decorate %vulkan.immutable_sampler_39 DescriptorSet 0
Decorate %vulkan.immutable_sampler_39 Binding 39
Decorate %vulkan.immutable_sampler_40 DescriptorSet 0
Decorate %vulkan.immutable_sampler_40 Binding 40
Decorate %vulkan.immutable_sampler_41 DescriptorSet 0
Decorate %vulkan.immutable_sampler_41 Binding 41
Decorate %vulkan.immutable_sampler_42 DescriptorSet 0
Decorate %vulkan.immutable_sampler_42 Binding 42
Decorate %vulkan.immutable_sampler_43 DescriptorSet 0
Decorate %vulkan.immutable_sampler_43 Binding 43
Decorate %vulkan.immutable_sampler_44 DescriptorSet 0
Decorate %vulkan.immutable_sampler_44 Binding 44
Decorate %vulkan.immutable_sampler_45 DescriptorSet 0
Decorate %vulkan.immutable_sampler_45 Binding 45
Decorate %vulkan.immutable_sampler_46 DescriptorSet 0
Decorate %vulkan.immutable_sampler_46 Binding 46
Decorate %vulkan.immutable_sampler_47 DescriptorSet 0
Decorate %vulkan.immutable_sampler_47 Binding 47
Decorate %class.vector4[256l] ArrayStride 16
MemberDecorate %class.vector4 0 Offset 0
MemberDecorate %union.anon 0 Offset 0
MemberDecorate %struct.anon 0 Offset 0
MemberDecorate %struct.anon 1 Offset 4
MemberDecorate %struct.anon 2 Offset 8
MemberDecorate %struct.anon 3 Offset 12
Decorate %enclose.class.vector4 Block
MemberDecorate %enclose.class.vector4 0 Offset 0
Decorate %class.vector4[] ArrayStride 16
Decorate %(StorageBuffer)enclose.class.vector4* ArrayStride 16
Decorate %simplified_nbody.vulkan_uniform. NonWritable
Decorate %simplified_nbody.vulkan_uniform. DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform. Binding 0
Decorate %enclose.class.vector4_0 Block
MemberDecorate %enclose.class.vector4_0 0 Offset 0
Decorate %class.vector4[]_0 ArrayStride 16
Decorate %(StorageBuffer)enclose.class.vector4_0* ArrayStride 16
Decorate %simplified_nbody.vulkan_uniform..1 DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform..1 Binding 1
Decorate %enclose.class.vector3 Block
MemberDecorate %enclose.class.vector3 0 Offset 0
Decorate %class.vector3[] ArrayStride 12
Decorate %(StorageBuffer)enclose.class.vector3* ArrayStride 12
MemberDecorate %class.vector3 0 Offset 0
MemberDecorate %union.anon.8 0 Offset 0
MemberDecorate %struct.anon.9 0 Offset 0
MemberDecorate %struct.anon.9 1 Offset 4
MemberDecorate %struct.anon.9 2 Offset 8
Decorate %simplified_nbody.vulkan_uniform..2 DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform..2 Binding 2
Decorate %enclose. Block
MemberDecorate %enclose. 0 Offset 0
Decorate %simplified_nbody.vulkan_uniform..3 NonWritable
Decorate %simplified_nbody.vulkan_uniform..3 Uniform
Decorate %simplified_nbody.vulkan_uniform..3 DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform..3 Binding 3
Decorate %simplified_nbody.vulkan_builtin_input. BuiltIn WorkgroupId
Decorate %simplified_nbody.vulkan_builtin_input..4 BuiltIn NumWorkgroups
Decorate %simplified_nbody.vulkan_builtin_input..5 BuiltIn SubgroupId
Decorate %simplified_nbody.vulkan_builtin_input..6 BuiltIn SubgroupLocalInvocationId
Decorate %simplified_nbody.vulkan_builtin_input..7 BuiltIn SubgroupSize
Decorate %simplified_nbody.vulkan_builtin_input..8 BuiltIn NumSubgroups
Decorate %(Workgroup)class.vector4[256l]* ArrayStride 4096
Decorate %155 NoSignedWrap
Decorate %155 NoUnsignedWrap
%ilong = TypeInt 64 1
%iint = TypeInt 32 1
%256l = Constant %ilong 256
%8i = Constant %iint 8
%0i = Constant %iint 0
%1i = Constant %iint 1
%2i = Constant %iint 2
%3i = Constant %iint 3
%2504i = Constant %iint 2504
%0l = Constant %ilong 0
%1l = Constant %ilong 1
%256i = Constant %iint 256
%Sampler = TypeSampler
%(UniformConstant)Sampler* = TypePointer UniformConstant %Sampler
%float = TypeFloat 32
%struct.anon = TypeStruct %float %float %float %float
%union.anon = TypeStruct %struct.anon
%class.vector4 = TypeStruct %union.anon
%class.vector4[256l] = TypeArray %class.vector4 %256l
%(Workgroup)class.vector4[256l]* = TypePointer Workgroup %class.vector4[256l]
%void = TypeVoid
%void() = TypeFunction %void
%class.vector4[] = TypeRuntimeArray %class.vector4
%enclose.class.vector4 = TypeStruct %class.vector4[]
%(StorageBuffer)enclose.class.vector4* = TypePointer StorageBuffer %enclose.class.vector4
%class.vector4[]_0 = TypeRuntimeArray %class.vector4
%enclose.class.vector4_0 = TypeStruct %class.vector4[]_0
%(StorageBuffer)enclose.class.vector4_0* = TypePointer StorageBuffer %enclose.class.vector4_0
%struct.anon.9 = TypeStruct %float %float %float
%union.anon.8 = TypeStruct %struct.anon.9
%class.vector3 = TypeStruct %union.anon.8
%class.vector3[] = TypeRuntimeArray %class.vector3
%enclose.class.vector3 = TypeStruct %class.vector3[]
%(StorageBuffer)enclose.class.vector3* = TypePointer StorageBuffer %enclose.class.vector3
%enclose. = TypeStruct %float
%(Uniform)enclose.* = TypePointer Uniform %enclose.
%<3xiint> = TypeVector %iint 3
%(Input)<3xiint>* = TypePointer Input %<3xiint>
%(Input)iint* = TypePointer Input %iint
%(StorageBuffer)float* = TypePointer StorageBuffer %float
%(Workgroup)float* = TypePointer Workgroup %float
%bool = TypeBool
%(Uniform)float* = TypePointer Uniform %float
%vulkan.immutable_sampler_0 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_1 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_2 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_3 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_4 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_5 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_6 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_7 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_8 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_9 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_10 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_11 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_12 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_13 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_14 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_15 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_16 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_17 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_18 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_19 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_20 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_21 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_22 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_23 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_24 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_25 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_26 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_27 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_28 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_29 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_30 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_31 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_32 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_33 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_34 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_35 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_36 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_37 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_38 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_39 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_40 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_41 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_42 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_43 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_44 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_45 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_46 = Variable %(UniformConstant)Sampler* UniformConstant
%vulkan.immutable_sampler_47 = Variable %(UniformConstant)Sampler* UniformConstant
%_ZZ16simplified_nbodyE20local_body_positions = Variable %(Workgroup)class.vector4[256l]* Workgroup
%simplified_nbody.vulkan_uniform. = Variable %(StorageBuffer)enclose.class.vector4* StorageBuffer
%simplified_nbody.vulkan_uniform..1 = Variable %(StorageBuffer)enclose.class.vector4_0* StorageBuffer
%simplified_nbody.vulkan_uniform..2 = Variable %(StorageBuffer)enclose.class.vector3* StorageBuffer
%simplified_nbody.vulkan_uniform..3 = Variable %(Uniform)enclose.* Uniform
%simplified_nbody.vulkan_builtin_input. = Variable %(Input)<3xiint>* Input
%simplified_nbody.vulkan_builtin_input..4 = Variable %(Input)<3xiint>* Input
%simplified_nbody.vulkan_builtin_input..5 = Variable %(Input)iint* Input
%simplified_nbody.vulkan_builtin_input..6 = Variable %(Input)iint* Input
%simplified_nbody.vulkan_builtin_input..7 = Variable %(Input)iint* Input
%simplified_nbody.vulkan_builtin_input..8 = Variable %(Input)iint* Input
%0.0f = Constant %float 0
%9.99999975e-05f = Constant %float 9.99999975e-05
%0.999000013f = Constant %float 0.999000013
function void simplified_nbody ( %void() ) {
92:
%98 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input. Aligned 16
%99 = CompositeExtract %iint %98 0
%101 = ShiftLeftLogical %iint %99 %8i
%102 = Load %iint %simplified_nbody.vulkan_builtin_input..6 Aligned 4
%103 = Load %iint %simplified_nbody.vulkan_builtin_input..5 Aligned 4
%104 = Load %iint %simplified_nbody.vulkan_builtin_input..7 Aligned 4
%105 = IMul %iint %103 %104
%106 = IAdd %iint %105 %102
%107 = IAdd %iint %101 %106
%108 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input..4 Aligned 16
%109 = CompositeExtract %iint %108 0
%110 = ShiftLeftLogical %iint %109 %8i
%113 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %107 %0i %0i %0i
%115 = Load %float %113 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%116 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %107 %0i %0i %1i
%117 = Load %float %116 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%119 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %107 %0i %0i %2i
%120 = Load %float %119 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%121 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %107 %0i %0i %0i
%122 = Load %float %121 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%123 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %107 %0i %0i %1i
%124 = Load %float %123 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%125 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %107 %0i %0i %2i
%126 = Load %float %125 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%128 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %0i
%129 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %1i
%130 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %2i
%132 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %3i
Branch %93
93:
%134 = Phi %iint ( %133 <- %96, %0i <- %92 )
%136 = Phi %iint ( %135 <- %96, %0i <- %92 )
%139 = Phi %float ( %0.0f <- %92, %138 <- %96 )
%141 = Phi %float ( %0.0f <- %92, %140 <- %96 )
%143 = Phi %float ( %0.0f <- %92, %142 <- %96 )
%144 = ShiftLeftLogical %iint %136 %8i
%145 = IAdd %iint %106 %144
%146 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %0i
%147 = Load %float %146 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%148 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %1i
%149 = Load %float %148 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%150 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %2i
%151 = Load %float %150 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%152 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %3i
%153 = Load %float %152 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
Store %128 %147 Aligned 4
Store %129 %149 Aligned 4
Store %130 %151 Aligned 4
Store %132 %153 Aligned 4
ControlBarrier %2i %2i %2504i
LoopMerge %97 %96 None
Branch %94
94:
%157 = Phi %ilong ( %155 <- %94, %0l <- %93 )
%158 = Phi %float ( %139 <- %93, %138 <- %94 )
%159 = Phi %float ( %141 <- %93, %140 <- %94 )
%160 = Phi %float ( %143 <- %93, %142 <- %94 )
%161 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %0i
%162 = Load %float %161 Aligned 4
%163 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %1i
%164 = Load %float %163 Aligned 4
%165 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %2i
%166 = Load %float %165 Aligned 4
%167 = FSub %float %162 %115
%168 = FSub %float %164 %117
%169 = FSub %float %166 %120
%171 = ExtInst %float %1 Fma %167 %167 %9.99999975e-05f
%172 = ExtInst %float %1 Fma %168 %168 %171
%173 = ExtInst %float %1 Fma %169 %169 %172
%174 = ExtInst %float %1 InverseSqrt %173
%175 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %3i
%176 = Load %float %175 Aligned 4
%177 = FMul %float %174 %174
%178 = FMul %float %177 %174
%179 = FMul %float %178 %176
%142 = ExtInst %float %1 Fma %179 %167 %160
%140 = ExtInst %float %1 Fma %179 %168 %159
%138 = ExtInst %float %1 Fma %179 %169 %158
%155 = IAdd %ilong %157 %1l
%186 = IEqual %bool %155 %256l
LoopMerge %95 %94 None
BranchConditional %186 %95 %94
95:
Branch %96
96:
ControlBarrier %2i %2i %2504i
%133 = IAdd %iint %134 %256i
%135 = IAdd %iint %136 %1i
%190 = ULessThan %bool %133 %110
BranchConditional %190 %93 %97
97:
%192 = InBoundsAccessChain %(Uniform)float* %simplified_nbody.vulkan_uniform..3 %0i
%193 = Load %float %192 Aligned 4
%194 = ExtInst %float %1 Fma %193 %142 %122
%195 = ExtInst %float %1 Fma %193 %140 %124
%196 = ExtInst %float %1 Fma %193 %138 %126
%198 = FMul %float %194 %0.999000013f
%199 = FMul %float %195 %0.999000013f
%200 = FMul %float %196 %0.999000013f
%201 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %107 %0i %0i %0i
%202 = Load %float %201 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%203 = ExtInst %float %1 Fma %198 %193 %202
Store %201 %203 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
%204 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %107 %0i %0i %1i
%205 = Load %float %204 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%206 = ExtInst %float %1 Fma %199 %193 %205
Store %204 %206 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
%207 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %107 %0i %0i %2i
%208 = Load %float %207 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
%209 = ExtInst %float %1 Fma %200 %193 %208
Store %207 %209 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
Store %121 %198 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
Store %123 %199 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
Store %125 %200 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
Return
}
-
OS:
-
only AMD64/Intel64/ARM64 are supported
-
Windows: NT 6.1+
-
macOS: 13.0+
-
iOS: 16.0+
-
Linux: any current x64 distribution
-
other Unix: if other requirements are met
-
-
compiler/toolchain:
-
libraries and optional requirements:
-
SDL3 3.1.3+
-
(opt) OpenCL: requires OpenCL 1.2+ SDK and CPU/GPU drivers (Intel, AMD)
-
(opt) CUDA: requires sm_50+/Maxwell+ GPU and CUDA 12.0+ drivers (CUDA SDK not required!)
-
(opt) Metal: requires iOS 16.0+ or macOS 13.0+, and a Metal 3.0 capable GPU
-
(opt) Host-Compute: requires just the compiler/toolchain that is stated above
-
(opt) Vulkan: requires 1.3.231+ ICD loader / headers / SDK, volk included as submodule
-
(opt) OpenVR: requires OpenVR
-
(opt) OpenXR: requires OpenXR
-
-
ensure git submodules are cloned and up-to-date:
git submodule update --init --recursive
-
run
./build.sh
(use ./build.sh help
to get a list of all options) -
configuration of optional parts:
-
to disable OpenCL: define
FLOOR_NO_OPENCL
or ./build.sh no-opencl
-
to disable CUDA: define
FLOOR_NO_CUDA
or ./build.sh no-cuda
-
to disable Metal (only affects macOS/iOS builds): define
FLOOR_NO_METAL
or ./build.sh no-metal
-
to disable Host Compute: define
FLOOR_NO_HOST_COMPUTE
or ./build.sh no-host-compute
-
to disable Vulkan: define
FLOOR_NO_VULKAN
or ./build.sh no-vulkan
-
to disable OpenVR: define
FLOOR_NO_OPENVR
or ./build.sh no-openvr
-
to disable OpenXR: define
FLOOR_NO_OPENXR
or ./build.sh no-openxr
-
to build with libstdc++ (GCC 13.0+) instead of libc++:
./build.sh libstdc++
-
-
this is provided as an alternative to build.sh and Xcode
-
create a build folder and
cd
into it -
run
cmake -G "Ninja" -S "<path-to-libfloor>" <options>
-
options:
-
to build a static library instead of a shared/dynamic one:
-DBUILD_SHARED_LIBS=OFF
-
to explicitly use libc++:
-DWITH_LIBCXX=ON
-
to build with address sanitizer:
-DWITH_ASAN=ON
-
-
run
ninja
-
open
floor.xcodeproj
and build -
some notes:
-
almost all optional parts of floor are enabled here and you’ll have to install all dependencies or disable them manually
-
Homebrew is the recommended way to install additional dependencies:
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
-
(opt) download OpenVR and manually install it:
-
mkdir -p {/usr/local/include/openvr,/usr/local/lib}
-
cp openvr/headers/* /usr/local/include/openvr/
-
cp openvr/bin/osx32/libopenvr_api.dylib /usr/local/lib/
-
-
command line tools might be necessary, install them with:
xcode-select --install
-
on iOS, either copy dependencies into your iPhoneOS and iPhoneSimulator SDK, or
floor/ios/deps/{include,lib}
-
iOS linker flags for a depending project:
-lSDL3 -lfloor
-
-
install Visual Studio 2022
-
in "Workloads" select "Desktop development with C++", in "Individual components" search for and select all clang packages
-
install and wait
-
install Vulkan SDK
-
install vcpkg (somewhere, not within libfloor):
-
git clone https://github.com/Microsoft/vcpkg.git
-
cd vcpkg
-
.\bootstrap-vcpkg.bat -disableMetrics
-
.\vcpkg integrate install
-
-
install vcpkg packages:
-
.\vcpkg --triplet x64-windows install sdl3 OpenCL vulkan openvr openxr-loader
-
-
add a user (or system) environment variable
VCPKG_ROOT
that points to the vcpkg folder -
in Visual Studio: Tools → Options → search for vcpkg and set the custom vcpkg.exe path
-
in Visual Studio: open folder
floor
(wait a little until build files are generated) -
select
Debug
or Release
configuration and build -
NOTE: all dependencies (optional parts) are enabled here
-
NOTE: having other build environments/systems in
PATH
(e.g. MSYS2/MinGW) may result in install/build issues
-
sudo mkdir -p /opt/floor/include
-
sudo ln -sf /path/to/floor /opt/floor/include/floor
-
sudo ln -sf /path/to/floor/bin /opt/floor/lib
-
alternatively: copy these files/folders there
-
create a
%%ProgramFiles%%/floor
folder (C:/Program Files/floor) -
inside this folder:
-
create a
lib
folder -
VS2022:
-
copy everything from bin/ in there (dlls/lib/exp)
-
-
MinGW/MSYS2:
-
copy libfloor_static.a/libfloord_static.a there
-
-
create an
include
folder and copy the original floor
folder in there (containing all floor source code)
-
-
automated builds for Linux, macOS and Windows can be found at: https://libfloor.org/builds/toolchain
-
NOTE: this requires a Unix environment with all LLVM build dependencies installed - use MSYS2 on Windows
-
NOTE: the absolute build path must not contain spaces
-
compile the toolchain:
-
cd floor/etc/llvm140/ && ./build.sh
-
if successful, package it (in addition to a .zip file, this also creates a folder with all necessary binaries and include files):
./pkg.sh
-
-
install the toolchain:
-
Unix:
-
automatic:
-
development: run
./deploy_dev.sh
from the floor/etc/llvm140/ folder (this will create symlinks to everything in floor and floor/etc/llvm140) -
release: run
./deploy_pkg.sh
from inside the toolchain package folder (floor/etc/llvm140/toolchain_140006_*; this will copy everything)
-
-
manual:
-
copy the toolchain folder as
toolchain
to /opt/floor/
(should then be /opt/floor/toolchain/{bin,clang,libcxx}
) -
inside
/opt/floor/toolchain
, add a symlink to the floor
include folder: sudo ln -sf ../include floor
-
-
-
Windows:
-
copy the toolchain folder as
toolchain
to %%ProgramFiles%%/floor
(should then be %%ProgramFiles%%/floor/toolchain/{bin,clang,libcxx}
) -
inside
%%ProgramFiles%%/floor/toolchain
, copy the floor
folder from the include
folder above it into this folder
-
-
-
NOTE: this is the expected default setup - paths can be changed inside config.json (toolchain.generic.paths)
-
when using X11 forwarding, set these env variables:
-
export SDL_VIDEO_X11_NODIRECTCOLOR=yes
-
-
depending on how your Linux distribution handles OpenCL headers and library, you might need to manually install OpenCL 1.2+ compatible ones
-
Host-Compute device execution requires locked/pinned memory, which may be very limited in default Linux configurations (usually 64KiB)
-
libfloor will try to increase the limit to 32MiB per logical CPU core, but this may fail if the max limit is too low
-
to increase the max limit, /etc/security/limits.conf must be modified
-
as a simple workaround, add the following line to it (replace user_name with your user name) and relog:
-
user_name hard memlock unlimited
-
-
NOTE: when using ssh, PAM must be enabled for this to apply
-
-
depending on your Vulkan implementation, you may also need to increase the max amount of open files (usual default is 1024 files)
-
libfloor will try to increase the limit to 256 files per logical CPU core, but this may fail if the max limit is too low
-
to increase the max limit, /etc/security/limits.conf must be modified
-
as a simple workaround, add the following line to it (replace user_name with your user name) and relog:
-
user_name hard nofile unlimited
-
-
NOTE: when using ssh, PAM must be enabled for this to apply
-
-
floor_examples (dnn, nbody, warp, hlbvh, path tracer, other)
-
libwarp (image-space warping library)
-
obsolete: oclraster (Flexible Rasterizer in OpenCL)
-
obsolete: a2elight (Albion 2 Engine)
-
obsolete: unibot (IRC bot)