-
Notifications
You must be signed in to change notification settings - Fork 15.6k
Closed
Description
const float delx = xtmp - x[j].x;
const float dely = ytmp - x[j].y;
const float delz = ztmp - x[j].z;
const float rsq = delx * delx + dely * dely + delz * delz;
- The kernel body of GCC: VL (vector length) = 4, uses loop vectorization
fsub v30.4s, v22.4s, v30.4s
zip1 v28.4s, v28.4s, v29.4s
zip1 v31.4s, v31.4s, v27.4s
fmul v29.4s, v30.4s, v30.4s
fsub v28.4s, v21.4s, v28.4s
fsub v31.4s, v23.4s, v31.4s
fmla v29.4s, v28.4s, v28.4s
fmla v29.4s, v31.4s, v31.4s
fdiv v27.4s, v16.4s, v29.4s -- float r2inv = 1.0 / rsq;
- The kernel body of Clang: VL (vector length) = 2, uses SLP vectorization
.LBB0_8: // Parent Loop BB0_5 Depth=1
ldrsw x16, [x15], #4
add x16, x6, x16, lsl #4
ldr d16, [x16]
ldr s17, [x16, #8]
fsub v16.2s, v6.2s, v16.2s
fsub s17, s7, s17
fmul v18.2s, v16.2s, v16.2s
faddp s18, v18.2s
fmadd s18, s17, s17, s18
fcmp s18, s0
b.ge .LBB0_7
fdiv s18, s3, s18 -- float r2inv = 1.0 / rsq;