From 78fb1d03d39e8357e4790a9f0788ef0a8e7d8ae1 Mon Sep 17 00:00:00 2001 From: Archana R Date: Wed, 10 Nov 2021 01:18:42 -0600 Subject: [PATCH] internal/bytealg: optimize cmpbody for ppc64le/ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vectorize the cmpbody loop for bytes of size greater than or equal to 32 on both POWER8 (LE and BE) and POWER9 (LE and BE) and improve performance of smaller size compares. Performance improves for most sizes with this change on POWER8, POWER9 and POWER10. For the very small sizes (up to 8) the overhead of the function call starts to impact performance. POWER9: name old time/op new time/op delta BytesCompare/1 4.60ns ± 0% 5.49ns ± 0% +19.27% BytesCompare/2 4.68ns ± 0% 5.46ns ± 0% +16.71% BytesCompare/4 6.58ns ± 0% 5.49ns ± 0% -16.58% BytesCompare/8 4.89ns ± 0% 5.46ns ± 0% +11.64% BytesCompare/16 5.21ns ± 0% 4.96ns ± 0% -4.70% BytesCompare/32 5.09ns ± 0% 4.98ns ± 0% -2.14% BytesCompare/64 6.40ns ± 0% 5.96ns ± 0% -6.84% BytesCompare/128 11.3ns ± 0% 8.1ns ± 0% -28.09% BytesCompare/256 15.1ns ± 0% 12.8ns ± 0% -15.16% BytesCompare/512 26.5ns ± 0% 23.3ns ± 5% -12.03% BytesCompare/1024 50.2ns ± 0% 41.6ns ± 2% -17.01% BytesCompare/2048 99.3ns ± 0% 86.5ns ± 0% -12.88% Change-Id: I24f93b2910591e6829ddd8509aa6eeaa6355c609 Reviewed-on: https://go-review.googlesource.com/c/go/+/362797 Reviewed-by: Lynn Boger Run-TryBot: Archana Ravindar TryBot-Result: Gopher Robot Reviewed-by: Ian Lance Taylor Reviewed-by: Than McIntosh --- src/internal/bytealg/compare_ppc64x.s | 552 ++++++++++++++++++-------- 1 file changed, 388 insertions(+), 164 deletions(-) diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s index fc6f170ca81e2c..cbe0525af55d40 100644 --- a/src/internal/bytealg/compare_ppc64x.s +++ b/src/internal/bytealg/compare_ppc64x.s @@ -21,11 +21,12 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56 CMP R5,R6,CR7 CMP R3,R4,CR6 BEQ CR7,equal -#ifdef GOARCH_ppc64le - BR cmpbodyLE<>(SB) 
-#else - BR cmpbodyBE<>(SB) -#endif + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + CMP R16,$1 + BNE power8 + BR cmpbodyp9<>(SB) +power8: + BR cmpbody<>(SB) equal: BEQ CR6,done MOVD $1, R8 @@ -52,11 +53,12 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 CMP R5,R6,CR7 CMP R3,R4,CR6 BEQ CR7,equal -#ifdef GOARCH_ppc64le - BR cmpbodyLE<>(SB) -#else - BR cmpbodyBE<>(SB) -#endif + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + CMP R16,$1 + BNE power8 + BR cmpbodyp9<>(SB) +power8: + BR cmpbody<>(SB) equal: BEQ CR6,done MOVD $1, R8 @@ -70,209 +72,431 @@ done: MOVD $0, R3 RET -// Do an efficient memcmp for ppc64le +#ifdef GOARCH_ppc64le +DATA byteswap<>+0(SB)/8, $0x0706050403020100 +DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 +GLOBL byteswap<>+0(SB), RODATA, $16 +#define SWAP V21 +#endif + +// Do an efficient memcmp for ppc64le/ppc64/POWER8 // R3 = a len // R4 = b len // R5 = a addr // R6 = b addr // On exit: // R3 = return value -TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0 +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R3,R8 // set up length CMP R3,R4,CR2 // unequal? - BC 12,8,setuplen // BLT CR2 + BLT CR2,setuplen // BLT CR2 MOVD R4,R8 // use R4 for comparison len setuplen: - MOVD R8,CTR // set up loop counter - CMP R8,$8 // only optimize >=8 - BLT simplecheck - DCBT (R5) // cache hint - DCBT (R6) CMP R8,$32 // optimize >= 32 MOVD R8,R9 - BLT setup8a // 8 byte moves only -setup32a: - SRADCC $5,R8,R9 // number of 32 byte chunks - MOVD R9,CTR - - // Special processing for 32 bytes or longer. - // Loading this way is faster and correct as long as the - // doublewords being compared are equal. Once they - // are found unequal, reload them in proper byte order - // to determine greater or less than. -loop32a: - MOVD 0(R5),R9 // doublewords to compare - MOVD 0(R6),R10 // get 4 doublewords - MOVD 8(R5),R14 - MOVD 8(R6),R15 - CMPU R9,R10 // bytes equal? 
- MOVD $0,R16 // set up for cmpne - BNE cmpne // further compare for LT or GT - MOVD 16(R5),R9 // get next pair of doublewords - MOVD 16(R6),R10 - CMPU R14,R15 // bytes match? - MOVD $8,R16 // set up for cmpne - BNE cmpne // further compare for LT or GT - MOVD 24(R5),R14 // get next pair of doublewords - MOVD 24(R6),R15 - CMPU R9,R10 // bytes match? - MOVD $16,R16 // set up for cmpne - BNE cmpne // further compare for LT or GT - MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32 - ADD $32,R5 // bump up to next 32 - ADD $32,R6 - CMPU R14,R15 // bytes match? - BC 8,2,loop32a // br ctr and cr - BNE cmpne + BLT setup8a // optimize < 32 + MOVD $16,R10 // set offsets to load into vectors + CMP R8,$64 + BLT cmp32 // process size 32-63 + + DCBT (R5) // optimize >= 64 + DCBT (R6) // cache hint + MOVD $32,R11 // set offsets to load into vector + MOVD $48,R12 // set offsets to load into vector + +loop64a:// process size 64 and greater + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different // jump out if it's different + + LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector + LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector + LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector + LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $-64,R9,R9 // reduce remaining size by 64 + ADD $64,R5,R5 // increment to next 64 bytes of A + ADD $64,R6,R6 // increment to next 64 bytes of B + CMPU R9,$64 + BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining + + CMPU R9,$32 + BGE cmp32 // loop to cmp32 if there are 32-63 bytes remaining + 
CMPU R9,$0 + BNE rem // loop to rem if the remainder is not 0 + + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)+00(SB), R16 + LXVD2X (R16)(R0),SWAP // Set up swap string + + VPERM V3,V3,SWAP,V3 + VPERM V4,V4,SWAP,V4 +#endif + MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison + MFVSRD VS36,R10 + + CMPU R16,R10 + BEQ lower + BGT greater + MOVD $-1,R3 // return value if A < B + RET +lower: + VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison + MFVSRD VS35,R16 + VSLDOI $8,V4,V4,V4 + MFVSRD VS36,R10 + + CMPU R16,R10 + BGT greater + MOVD $-1,R3 // return value if A < B + RET setup8a: - SRADCC $3,R9,R9 // get the 8 byte count + SRADCC $3,R8,R9 // get the 8 byte count BEQ leftover // shifted value is 0 + CMPU R8,$8 // optimize 8byte move + BEQ size8 + CMPU R8,$16 + BEQ size16 MOVD R9,CTR // loop count for doublewords loop8: - MOVDBR (R5+R0),R9 // doublewords to compare +#ifdef GOARCH_ppc64le + MOVDBR (R5+R0),R16 // doublewords to compare MOVDBR (R6+R0),R10 // LE compare order +#else + MOVD (R5+R0),R16 // doublewords to compare + MOVD (R6+R0),R10 // BE compare order +#endif ADD $8,R5 ADD $8,R6 - CMPU R9,R10 // match? + CMPU R16,R10 // match? 
BC 8,2,loop8 // bt ctr <> 0 && cr BGT greater BLT less leftover: ANDCC $7,R8,R9 // check for leftover bytes - MOVD R9,CTR // save the ctr - BNE simple // leftover bytes - BC 12,10,equal // test CR2 for length comparison - BC 12,8,less - BR greater + BEQ zeroremainder simplecheck: - CMP R8,$0 // remaining compare length 0 - BNE simple // do simple compare - BC 12,10,equal // test CR2 for length comparison - BC 12,8,less // 1st len < 2nd len, result less - BR greater // 1st len > 2nd len must be greater -simple: - MOVBZ 0(R5), R9 // get byte from 1st operand - ADD $1,R5 - MOVBZ 0(R6), R10 // get byte from 2nd operand - ADD $1,R6 - CMPU R9, R10 - BC 8,2,simple // bc ctr <> 0 && cr - BGT greater // 1st > 2nd - BLT less // 1st < 2nd - BC 12,10,equal // test CR2 for length comparison - BC 12,9,greater // 2nd len > 1st len - BR less // must be less -cmpne: // only here is not equal - MOVDBR (R5+R16),R8 // reload in reverse order - MOVDBR (R6+R16),R9 - CMPU R8,R9 // compare correct endianness - BGT greater // here only if NE -less: - MOVD $-1, R3 // return value if A < B + MOVD R0,R14 + CMP R9,$4 // process 4 bytes + BLT halfword +#ifdef GOARCH_ppc64le + MOVWBR (R5)(R14),R10 + MOVWBR (R6)(R14),R11 +#else + MOVWZ (R5)(R14),R10 + MOVWZ (R6)(R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $-4,R9 + ADD $4,R14 + PCALIGN $16 + +halfword: + CMP R9,$2 // process 2 bytes + BLT byte +#ifdef GOARCH_ppc64le + MOVHBR (R5)(R14),R10 + MOVHBR (R6)(R14),R11 +#else + MOVHZ (R5)(R14),R10 + MOVHZ (R6)(R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $-2,R9 + ADD $2,R14 + PCALIGN $16 +byte: + CMP R9,$0 // process 1 byte + BEQ skip + MOVBZ (R5)(R14),R10 + MOVBZ (R6)(R14),R11 + CMPU R10,R11 + BGT greater + BLT less + PCALIGN $16 +skip: + BEQ CR2,equal + BGT CR2,greater + +less: MOVD $-1,R3 // return value if A < B RET +size16: + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC 
V3,V4,V1 + BGE CR6,different +zeroremainder: + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A) 1st len + BLT CR2,less // 2nd len < 1st len equal: MOVD $0, R3 // return value if A == B RET greater: - MOVD $1, R3 // return value if A > B + MOVD $1,R3 // return value if A > B RET -// Do an efficient memcmp for ppc64 (BE) +// Do an efficient memcmp for ppc64le/ppc64/POWER9 // R3 = a len // R4 = b len // R5 = a addr // R6 = b addr // On exit: // R3 = return value -TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0 +TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R3,R8 // set up length CMP R3,R4,CR2 // unequal? - BC 12,8,setuplen // BLT CR2 + BLT CR2,setuplen // BLT CR2 MOVD R4,R8 // use R4 for comparison len setuplen: - MOVD R8,CTR // set up loop counter - CMP R8,$8 // only optimize >=8 - BLT simplecheck - DCBT (R5) // cache hint - DCBT (R6) - CMP R8,$32 // optimize >= 32 + CMP R8,$16 // optimize for size<16 MOVD R8,R9 - BLT setup8a // 8 byte moves only - -setup32a: - SRADCC $5,R8,R9 // number of 32 byte chunks - MOVD R9,CTR -loop32a: - MOVD 0(R5),R9 // doublewords to compare - MOVD 0(R6),R10 // get 4 doublewords - MOVD 8(R5),R14 - MOVD 8(R6),R15 - CMPU R9,R10 // bytes equal? - BLT less // found to be less - BGT greater // found to be greater - MOVD 16(R5),R9 // get next pair of doublewords - MOVD 16(R6),R10 - CMPU R14,R15 // bytes match? - BLT less // found less - BGT greater // found greater - MOVD 24(R5),R14 // get next pair of doublewords - MOVD 24(R6),R15 - CMPU R9,R10 // bytes match? - BLT less // found to be less - BGT greater // found to be greater - ADD $32,R5 // bump up to next 32 - ADD $32,R6 - CMPU R14,R15 // bytes match? - BC 8,2,loop32a // br ctr and cr - BLT less // with BE, byte ordering is - BGT greater // good for compare - ANDCC $24,R8,R9 // Any 8 byte chunks? 
- BEQ leftover // and result is 0 -setup8a: - SRADCC $3,R9,R9 // get the 8 byte count - BEQ leftover // shifted value is 0 - MOVD R9,CTR // loop count for doublewords -loop8: - MOVD (R5),R9 - MOVD (R6),R10 - ADD $8,R5 - ADD $8,R6 - CMPU R9,R10 // match? - BC 8,2,loop8 // bt ctr <> 0 && cr + BLT simplecheck + MOVD $16,R10 // set offsets to load into vectors + CMP R8,$32 // optimize for size 16-31 + BLT cmp16 + CMP R8,$64 + BLT cmp32 // optimize for size 32-63 + DCBT (R5) // optimize for size>=64 + DCBT (R6) // cache hint + + MOVD $32,R11 // set offsets to load into vector + MOVD $48,R12 // set offsets to load into vector + +loop64a:// process size 64 and greater + LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector + VCMPNEBCC V3,V4,V1 // record comparison into V1 + BNE CR6,different // jump out if its different + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector + LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector + LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + ADD $-64,R9,R9 // reduce remaining size by 64 + ADD $64,R5,R5 // increment to next 64 bytes of A + ADD $64,R6,R6 // increment to next 64 bytes of B + CMPU R9,$64 + BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining + + CMPU R9,$32 + BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining + CMPU R9,$16 + BGE cmp16 // loop to cmp16 if there are 16-31 bytes left + CMPU R9,$0 + BNE simplecheck // loop to simplecheck for remaining bytes + + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to 
less if len(A) B + RET +cmp16: + ANDCC $16,R9,R31 + BEQ tail + + LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $16,R5 + ADD $16,R6 +tail: + ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b) + BEQ end + + ADD R9,R5 + ADD R9,R6 + MOVD $-16,R10 + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different +end: + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if BLT CR2 that is, len(A) 0 && cr - BGT greater // 1st > 2nd - BLT less // 1st < 2nd - BC 12,10,equal // test CR2 for length comparison - BC 12,9,greater // 2nd len > 1st len + MOVD $0,R14 // process 8 bytes + CMP R9,$8 + BLT word +#ifdef GOARCH_ppc64le + MOVDBR (R5+R14),R10 + MOVDBR (R6+R14),R11 +#else + MOVD (R5+R14),R10 + MOVD (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $8,R14 + ADD $-8,R9 + PCALIGN $16 +word: + CMP R9,$4 // process 4 bytes + BLT halfword +#ifdef GOARCH_ppc64le + MOVWBR (R5+R14),R10 + MOVWBR (R6+R14),R11 +#else + MOVWZ (R5+R14),R10 + MOVWZ (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $4,R14 + ADD $-4,R9 + PCALIGN $16 +halfword: + CMP R9,$2 // process 2 bytes + BLT byte +#ifdef GOARCH_ppc64le + MOVHBR (R5+R14),R10 + MOVHBR (R6+R14),R11 +#else + MOVHZ (R5+R14),R10 + MOVHZ (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $2,R14 + ADD $-2,R9 + PCALIGN $16 +byte: + CMP R9,$0 // process 1 byte + BEQ skip + MOVBZ (R5+R14),R10 + MOVBZ (R6+R14),R11 + CMPU R10,R11 + BGT greater + BLT less + PCALIGN $16 +skip: + BEQ CR2,equal + BGT CR2,greater less: - MOVD $-1, R3 // return value if A < B + MOVD $-1,R3 // return value if A < B RET equal: MOVD $0, R3 // return value if A == B RET -greater: - MOVD $1, R3 // return value if A 
> B - RET