diff --git a/mask_amd64.s b/mask_amd64.s
index 73ae59b4..8464440b 100644
--- a/mask_amd64.s
+++ b/mask_amd64.s
@@ -117,10 +117,10 @@ less_than_4:
 
 less_than_2:
 	TESTQ $1, CX
-	JZ done
+	JZ end
 	XORB SI, (AX)
 	ROLL $24, SI
 
-done:
+end:
 	MOVL SI, ret+24(FP)
 	RET
diff --git a/mask_arm64.s b/mask_arm64.s
index 8fd49aa9..42a1211f 100644
--- a/mask_arm64.s
+++ b/mask_arm64.s
@@ -15,7 +15,7 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28
 	CMP $64, R1
 	BLT less_than_64
 
-// TODO: allign memory like amd64
+// TODO: align memory like amd64
 
loop_64:
 	VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
@@ -29,41 +29,39 @@ loop_64:
 	BGE loop_64
 
 less_than_64:
-	// quick end
-	CBZ R1, end
-	TBZ $5, R1, less_than32
+	TBZ $5, R1, less_than_32
 	VLD1 (R0), [V1.B16, V2.B16]
 	VEOR V1.B16, V0.B16, V1.B16
 	VEOR V2.B16, V0.B16, V2.B16
 	VST1.P [V1.B16, V2.B16], 32(R0)
 
-less_than32:
-	TBZ $4, R1, less_than16
+less_than_32:
+	TBZ $4, R1, less_than_16
 	LDP (R0), (R11, R12)
 	EOR R11, R2, R11
 	EOR R12, R2, R12
 	STP.P (R11, R12), 16(R0)
 
-less_than16:
-	TBZ $3, R1, less_than8
+less_than_16:
+	TBZ $3, R1, less_than_8
 	MOVD (R0), R11
 	EOR R2, R11, R11
 	MOVD.P R11, 8(R0)
 
-less_than8:
-	TBZ $2, R1, less_than4
+less_than_8:
+	TBZ $2, R1, less_than_4
 	MOVWU (R0), R11
 	EORW R2, R11, R11
 	MOVWU.P R11, 4(R0)
 
-less_than4:
-	TBZ $1, R1, less_than2
+less_than_4:
+	TBZ $1, R1, less_than_2
 	MOVHU (R0), R11
 	EORW R3, R11, R11
 	MOVHU.P R11, 2(R0)
 	RORW $16, R3
 
-less_than2:
+less_than_2:
 	TBZ $0, R1, end
 	MOVBU (R0), R11
 	EORW R3, R11, R11
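
Note: the tail blocks above implement plain byte-at-a-time XOR masking: each payload byte is XORed with the low byte of the key, and the key is then rotated so its next byte lines up with the next payload byte (the amd64 "ROLL $24, SI" and the arm64 RORW path). Below is a minimal pure-Go sketch of that behaviour for reference only; the $0-28 frame and "MOVL SI, ret+24(FP)" suggest a signature along the lines of func maskAsm(b *byte, len int, key uint32) uint32, and the maskRef name and demo here are made up for illustration, not code from this repository.

package main

import (
	"fmt"
	"math/bits"
)

// maskRef is a hypothetical pure-Go reference for the assembly above: XOR
// each byte of b with the low byte of key, then rotate the key right by 8
// bits so the next key byte lines up with the next payload byte (the same
// effect as the amd64 "ROLL $24, SI"). Returning the rotated key lets a
// caller keep masking a payload that arrives in pieces.
func maskRef(b []byte, key uint32) uint32 {
	for i := range b {
		b[i] ^= byte(key)
		key = bits.RotateLeft32(key, -8)
	}
	return key
}

func main() {
	payload := []byte("hello")
	key := maskRef(payload, 0x01020304)
	fmt.Printf("masked=% x, continued key=%08x\n", payload, key)
}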