Skip to content

Commit

Permalink
arm: mc: Optimize blend_v
Browse files Browse the repository at this point in the history
Use a register post-increment on the last store of each row, avoiding
a separate add instruction. Avoid processing the last 8 pixels in the
w32 case, since we only output 24 pixels.

Before:
ARM32                Cortex A7      A8      A9     A53     A72     A73
blend_v_w4_8bpc_neon:    450.4   574.7   538.7   374.6   199.3   260.5
blend_v_w8_8bpc_neon:    559.6   351.3   552.5   357.6   214.8   204.3
blend_v_w16_8bpc_neon:   926.3   511.6   787.9   593.0   271.0   246.8
blend_v_w32_8bpc_neon:  1482.5   917.0  1149.5   991.9   354.0   368.9
ARM64
blend_v_w4_8bpc_neon:                            351.1   200.0   224.1
blend_v_w8_8bpc_neon:                            333.0   212.4   203.8
blend_v_w16_8bpc_neon:                           495.2   302.0   247.0
blend_v_w32_8bpc_neon:                           840.0   557.8   514.0

After:
ARM32
blend_v_w4_8bpc_neon:    435.5   575.8   537.6   356.2   198.3   259.5
blend_v_w8_8bpc_neon:    545.2   347.9   553.5   339.1   207.8   204.2
blend_v_w16_8bpc_neon:   913.7   511.0   788.1   573.7   275.4   243.3
blend_v_w32_8bpc_neon:  1445.3   951.2  1079.1   920.4   352.2   361.6
ARM64
blend_v_w4_8bpc_neon:                            333.0   191.3   225.9
blend_v_w8_8bpc_neon:                            314.9   199.3   203.5
blend_v_w16_8bpc_neon:                           476.9   301.3   241.1
blend_v_w32_8bpc_neon:                           766.9   432.8   416.9
  • Loading branch information
mstorsjo authored and barrbrain committed Mar 6, 2020
1 parent 8e41852 commit d858188
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 44 deletions.
28 changes: 11 additions & 17 deletions src/arm/32/mc.S
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
sub r1, r1, #2
4:
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
Expand All @@ -764,10 +764,8 @@ L(blend_v_tbl):
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
vst1.8 {d20[2]}, [r0], r1
vst1.8 {d20[6]}, [r12], r1
bgt 4b
pop {r4-r5,pc}
80:
Expand All @@ -776,7 +774,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
sub r1, r1, #4
8:
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
Expand All @@ -790,10 +788,8 @@ L(blend_v_tbl):
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
vst1.16 {d22[2]}, [r0, :16]!
vst1.16 {d23[2]}, [r12, :16]!
add r0, r0, r1
add r12, r12, r1
vst1.16 {d22[2]}, [r0, :16], r1
vst1.16 {d23[2]}, [r12, :16], r1
bgt 8b
pop {r4-r5,pc}
160:
Expand All @@ -802,7 +798,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q14
sub r1, r1, #12
sub r1, r1, #8
16:
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
Expand All @@ -822,20 +818,18 @@ L(blend_v_tbl):
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
vst1.32 {d19[0]}, [r0, :32]!
vst1.32 {d21[0]}, [r12, :32]!
add r0, r0, r1
add r12, r12, r1
vst1.32 {d19[0]}, [r0, :32], r1
vst1.32 {d21[0]}, [r12, :32], r1
bgt 16b
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
vsub.i8 d24, d20, d6
32:
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
vld1.u8 {d0, d1, d2}, [r0, :64]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
Expand Down
40 changes: 13 additions & 27 deletions src/arm/64/mc.S
Original file line number Diff line number Diff line change
Expand Up @@ -709,8 +709,8 @@ function blend_v_8bpc_neon, export=1
ret
40:
ld1r {v0.2s}, [x5]
sub x1, x1, #2
sub v1.8b, v4.8b, v0.8b
sub x1, x1, #3
4:
ld1 {v2.8b}, [x2], #8
ld1 {v3.s}[0], [x0]
Expand All @@ -721,16 +721,14 @@ function blend_v_8bpc_neon, export=1
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], #2
st1 {v5.h}[2], [x8], #2
st1 {v5.b}[2], [x0], #1
st1 {v5.b}[6], [x8], #1
add x0, x0, x1
add x8, x8, x1
st1 {v5.b}[2], [x0], x1
st1 {v5.b}[6], [x8], x1
b.gt 4b
ret
80:
ld1r {v0.2d}, [x5]
sub x1, x1, #4
sub v1.16b, v4.16b, v0.16b
sub x1, x1, #6
8:
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
Expand All @@ -744,16 +742,14 @@ function blend_v_8bpc_neon, export=1
rshrn2 v7.16b, v6.8h, #6
st1 {v7.s}[0], [x0], #4
st1 {v7.s}[2], [x8], #4
st1 {v7.h}[2], [x0], #2
st1 {v7.h}[6], [x8], #2
add x0, x0, x1
add x8, x8, x1
st1 {v7.h}[2], [x0], x1
st1 {v7.h}[6], [x8], x1
b.gt 8b
ret
160:
ld1 {v0.16b}, [x5]
sub x1, x1, #8
sub v2.16b, v4.16b, v0.16b
sub x1, x1, #12
16:
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v7.16b}, [x0]
Expand All @@ -773,17 +769,15 @@ function blend_v_8bpc_neon, export=1
rshrn2 v22.16b, v21.8h, #6
st1 {v19.8b}, [x0], #8
st1 {v22.8b}, [x8], #8
st1 {v19.s}[2], [x0], #4
st1 {v22.s}[2], [x8], #4
add x0, x0, x1
add x8, x8, x1
st1 {v19.s}[2], [x0], x1
st1 {v22.s}[2], [x8], x1
b.gt 16b
ret
320:
ld1 {v0.16b, v1.16b}, [x5]
sub x1, x1, #16
sub v2.16b, v4.16b, v0.16b
sub v3.16b, v4.16b, v1.16b
sub x1, x1, #24
sub v3.8b, v4.8b, v1.8b
32:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v5.16b, v6.16b}, [x0]
Expand All @@ -795,30 +789,22 @@ function blend_v_8bpc_neon, export=1
umlal2 v23.8h, v5.16b, v2.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v6.8b, v3.8b
umull2 v29.8h, v17.16b, v1.16b
umlal2 v29.8h, v6.16b, v3.16b
umull v30.8h, v18.8b, v0.8b
umlal v30.8h, v20.8b, v2.8b
umull2 v31.8h, v18.16b, v0.16b
umlal2 v31.8h, v20.16b, v2.16b
umull v25.8h, v19.8b, v1.8b
umlal v25.8h, v21.8b, v3.8b
umull2 v26.8h, v19.16b, v1.16b
umlal2 v26.8h, v21.16b, v3.16b
rshrn v24.8b, v22.8h, #6
rshrn2 v24.16b, v23.8h, #6
rshrn v28.8b, v28.8h, #6
rshrn2 v28.16b, v29.8h, #6
rshrn v30.8b, v30.8h, #6
rshrn2 v30.16b, v31.8h, #6
rshrn v27.8b, v25.8h, #6
rshrn2 v27.16b, v26.8h, #6
st1 {v24.16b}, [x0], #16
st1 {v30.16b}, [x8], #16
st1 {v28.8b}, [x0], #8
st1 {v27.8b}, [x8], #8
add x0, x0, x1
add x8, x8, x1
st1 {v28.8b}, [x0], x1
st1 {v27.8b}, [x8], x1
b.gt 32b
ret
L(blend_v_tbl):
Expand Down

0 comments on commit d858188

Please sign in to comment.