From 261fe25c83a94fc3defe064baed3944cd3d16959 Mon Sep 17 00:00:00 2001
From: Joel Sing
Date: Sun, 18 Sep 2022 01:43:20 +1000
Subject: [PATCH] internal/bytealg: simplify and improve compare on riscv64

Remove some unnecessary loops and pull the comparison code out from
the compare/loop code. Add an unaligned 8 byte comparison, which reads
8 bytes from each input before comparing them. This gives a reasonable
gain in performance for the large unaligned case.

Updates #50615

name                                 old time/op    new time/op    delta
CompareBytesEqual-4                    116ns ± 0%     111ns ± 0%    -4.10%  (p=0.000 n=5+5)
CompareBytesToNil-4                   34.9ns ± 0%    35.0ns ± 0%    +0.45%  (p=0.002 n=5+5)
CompareBytesEmpty-4                   29.6ns ± 1%    29.8ns ± 0%    +0.71%  (p=0.016 n=5+5)
CompareBytesIdentical-4               29.8ns ± 0%    29.9ns ± 1%    +0.50%  (p=0.036 n=5+5)
CompareBytesSameLength-4              66.1ns ± 0%    60.4ns ± 0%    -8.59%  (p=0.000 n=5+5)
CompareBytesDifferentLength-4         63.1ns ± 0%    60.5ns ± 0%    -4.20%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=1-4   6.84ms ± 3%    6.04ms ± 5%   -11.70%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=2-4   6.99ms ± 4%    5.93ms ± 6%   -15.22%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=3-4   6.74ms ± 1%    6.00ms ± 5%   -10.94%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=4-4   7.20ms ± 6%    5.97ms ± 6%   -17.05%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=5-4   6.75ms ± 1%    5.81ms ± 8%   -13.93%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=6-4   6.89ms ± 5%    5.75ms ± 2%   -16.58%  (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4   6.91ms ± 6%    6.13ms ± 6%   -11.27%  (p=0.001 n=5+5)
CompareBytesBig-4                     2.75ms ± 5%    2.71ms ± 8%      ~      (p=0.651 n=5+5)
CompareBytesBigIdentical-4            29.9ns ± 1%    29.8ns ± 0%      ~      (p=0.751 n=5+5)

name                                 old speed      new speed      delta
CompareBytesBigUnaligned/offset=1-4   153MB/s ± 3%   174MB/s ± 6%  +13.40%  (p=0.003 n=5+5)
CompareBytesBigUnaligned/offset=2-4   150MB/s ± 4%   177MB/s ± 6%  +18.06%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=3-4   156MB/s ± 1%   175MB/s ± 5%  +12.39%  (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=4-4   146MB/s ± 6%   176MB/s ± 6%  +20.67%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=5-4   155MB/s ± 1%   181MB/s ± 7%  +16.35%  (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=6-4   152MB/s ± 5%   182MB/s ± 2%  +19.74%  (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4   152MB/s ± 6%   171MB/s ± 6%  +12.70%  (p=0.001 n=5+5)
CompareBytesBig-4                     382MB/s ± 5%   388MB/s ± 9%     ~      (p=0.616 n=5+5)
CompareBytesBigIdentical-4           35.1TB/s ± 1%  35.1TB/s ± 0%     ~      (p=0.800 n=5+5)

Change-Id: I127edc376e62a2c529719a4ab172f481e0a81357
Reviewed-on: https://go-review.googlesource.com/c/go/+/431100
Reviewed-by: Cherry Mui
Reviewed-by: Meng Zhuo
Reviewed-by: Bryan Mills
TryBot-Result: Gopher Robot
Reviewed-by: Joedian Reid
Run-TryBot: Joel Sing
---
Two illustrative Go sketches of the techniques used in this change
follow the patch; they are editorial notes, not part of the commit.

 src/internal/bytealg/compare_riscv64.s | 173 +++++++++++++++----------
 1 file changed, 103 insertions(+), 70 deletions(-)

diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
index 44a743d3af57aa..68cba2a37f1852 100644
--- a/src/internal/bytealg/compare_riscv64.s
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -40,13 +40,13 @@ use_a_len:
 	BEQZ	X5, cmp_len
 
 	MOV	$32, X6
-	BLT	X5, X6, loop4_check
+	BLT	X5, X6, check8_unaligned
 
 	// Check alignment - if alignment differs we have to do one byte at a time.
 	AND	$7, X10, X7
 	AND	$7, X12, X8
-	BNE	X7, X8, loop4_check
-	BEQZ	X7, loop32_check
+	BNE	X7, X8, check8_unaligned
+	BEQZ	X7, compare32
 
 	// Check one byte at a time until we reach 8 byte alignment.
 	SUB	X7, X5, X5
@@ -59,94 +59,99 @@ align:
 	ADD	$1, X12
 	BNEZ	X7, align
 
-loop32_check:
-	MOV	$32, X7
-	BLT	X5, X7, loop16_check
-loop32:
+check32:
+	MOV	$32, X6
+	BLT	X5, X6, compare16
+compare32:
 	MOV	0(X10), X15
 	MOV	0(X12), X16
 	MOV	8(X10), X17
 	MOV	8(X12), X18
-	BEQ	X15, X16, loop32a
-	JMP	cmp8a
-loop32a:
-	BEQ	X17, X18, loop32b
-	JMP	cmp8b
-loop32b:
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
 	MOV	16(X10), X15
 	MOV	16(X12), X16
 	MOV	24(X10), X17
 	MOV	24(X12), X18
-	BEQ	X15, X16, loop32c
-	JMP	cmp8a
-loop32c:
-	BEQ	X17, X18, loop32d
-	JMP	cmp8b
-loop32d:
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
 	ADD	$32, X10
 	ADD	$32, X12
 	ADD	$-32, X5
-	BGE	X5, X7, loop32
+	BGE	X5, X6, compare32
 	BEQZ	X5, cmp_len
 
-loop16_check:
+check16:
 	MOV	$16, X6
-	BLT	X5, X6, loop4_check
-loop16:
+	BLT	X5, X6, check8_unaligned
+compare16:
 	MOV	0(X10), X15
 	MOV	0(X12), X16
 	MOV	8(X10), X17
 	MOV	8(X12), X18
-	BEQ	X15, X16, loop16a
-	JMP	cmp8a
-loop16a:
-	BEQ	X17, X18, loop16b
-	JMP	cmp8b
-loop16b:
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
 	ADD	$16, X10
 	ADD	$16, X12
 	ADD	$-16, X5
-	BGE	X5, X6, loop16
 	BEQZ	X5, cmp_len
 
-loop4_check:
-	MOV	$4, X6
-	BLT	X5, X6, loop1
-loop4:
+check8_unaligned:
+	MOV	$8, X6
+	BLT	X5, X6, check4_unaligned
+compare8_unaligned:
 	MOVBU	0(X10), X8
+	MOVBU	1(X10), X15
+	MOVBU	2(X10), X17
+	MOVBU	3(X10), X19
+	MOVBU	4(X10), X21
+	MOVBU	5(X10), X23
+	MOVBU	6(X10), X25
+	MOVBU	7(X10), X29
 	MOVBU	0(X12), X9
+	MOVBU	1(X12), X16
+	MOVBU	2(X12), X18
+	MOVBU	3(X12), X20
+	MOVBU	4(X12), X22
+	MOVBU	5(X12), X24
+	MOVBU	6(X12), X28
+	MOVBU	7(X12), X30
+	BNE	X8, X9, cmp1a
+	BNE	X15, X16, cmp1b
+	BNE	X17, X18, cmp1c
+	BNE	X19, X20, cmp1d
+	BNE	X21, X22, cmp1e
+	BNE	X23, X24, cmp1f
+	BNE	X25, X28, cmp1g
+	BNE	X29, X30, cmp1h
+	ADD	$8, X10
+	ADD	$8, X12
+	ADD	$-8, X5
+	BGE	X5, X6, compare8_unaligned
+	BEQZ	X5, cmp_len
+
+check4_unaligned:
+	MOV	$4, X6
+	BLT	X5, X6, compare1
+compare4_unaligned:
+	MOVBU	0(X10), X8
 	MOVBU	1(X10), X15
+	MOVBU	2(X10), X17
+	MOVBU	3(X10), X19
+	MOVBU	0(X12), X9
 	MOVBU	1(X12), X16
-	BEQ	X8, X9, loop4a
-	SLTU	X9, X8, X5
-	SLTU	X8, X9, X6
-	JMP	cmp_ret
-loop4a:
-	BEQ	X15, X16, loop4b
-	SLTU	X16, X15, X5
-	SLTU	X15, X16, X6
-	JMP	cmp_ret
-loop4b:
-	MOVBU	2(X10), X21
-	MOVBU	2(X12), X22
-	MOVBU	3(X10), X23
-	MOVBU	3(X12), X24
-	BEQ	X21, X22, loop4c
-	SLTU	X22, X21, X5
-	SLTU	X21, X22, X6
-	JMP	cmp_ret
-loop4c:
-	BEQ	X23, X24, loop4d
-	SLTU	X24, X23, X5
-	SLTU	X23, X24, X6
-	JMP	cmp_ret
-loop4d:
+	MOVBU	2(X12), X18
+	MOVBU	3(X12), X20
+	BNE	X8, X9, cmp1a
+	BNE	X15, X16, cmp1b
+	BNE	X17, X18, cmp1c
+	BNE	X19, X20, cmp1d
 	ADD	$4, X10
 	ADD	$4, X12
 	ADD	$-4, X5
-	BGE	X5, X6, loop4
+	BGE	X5, X6, compare4_unaligned
 
-loop1:
+compare1:
 	BEQZ	X5, cmp_len
 	MOVBU	0(X10), X8
 	MOVBU	0(X12), X9
@@ -154,27 +159,55 @@ loop1:
 	ADD	$1, X10
 	ADD	$1, X12
 	ADD	$-1, X5
-	JMP	loop1
+	JMP	compare1
 
 // Compare 8 bytes of memory in X15/X16 that are known to differ.
 cmp8a:
-	MOV	$0xff, X19
-cmp8a_loop:
-	AND	X15, X19, X8
-	AND	X16, X19, X9
-	BNE	X8, X9, cmp
-	SLLI	$8, X19
-	JMP	cmp8a_loop
+	MOV	X15, X17
+	MOV	X16, X18
 
 // Compare 8 bytes of memory in X17/X18 that are known to differ.
 cmp8b:
 	MOV	$0xff, X19
-cmp8b_loop:
+cmp8_loop:
 	AND	X17, X19, X8
 	AND	X18, X19, X9
 	BNE	X8, X9, cmp
 	SLLI	$8, X19
-	JMP	cmp8b_loop
+	JMP	cmp8_loop
+
+cmp1a:
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
+	JMP	cmp_ret
+cmp1b:
+	SLTU	X16, X15, X5
+	SLTU	X15, X16, X6
+	JMP	cmp_ret
+cmp1c:
+	SLTU	X18, X17, X5
+	SLTU	X17, X18, X6
+	JMP	cmp_ret
+cmp1d:
+	SLTU	X20, X19, X5
+	SLTU	X19, X20, X6
+	JMP	cmp_ret
+cmp1e:
+	SLTU	X22, X21, X5
+	SLTU	X21, X22, X6
+	JMP	cmp_ret
+cmp1f:
+	SLTU	X24, X23, X5
+	SLTU	X23, X24, X6
+	JMP	cmp_ret
+cmp1g:
+	SLTU	X28, X25, X5
+	SLTU	X25, X28, X6
+	JMP	cmp_ret
+cmp1h:
+	SLTU	X30, X29, X5
+	SLTU	X29, X30, X6
+	JMP	cmp_ret
 
 cmp_len:
 	MOV	X11, X8
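
Editorial note (not part of the commit). First, the batched unaligned
comparison: compare8_unaligned issues all sixteen MOVBU loads (eight
from each input) before the first BNE, so the loads can be in flight
together and each mismatch check is a single branch. A minimal Go
sketch of that load-then-compare shape; the names compare8Unaligned and
cmpByte are invented for illustration, and the length handling is
simplified relative to the real runtime code:

package main

import "fmt"

// cmpByte orders two bytes that are known to differ.
func cmpByte(x, y byte) int {
	if x < y {
		return -1
	}
	return 1
}

// compare8Unaligned compares a and b like bytes.Compare, eight bytes
// per iteration. All byte loads for a batch are performed before any
// comparison, mirroring the MOVBU block followed by the BNE block in
// compare8_unaligned above.
func compare8Unaligned(a, b []byte) int {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	i := 0
	for ; n-i >= 8; i += 8 {
		var x, y [8]byte
		copy(x[:], a[i:i+8]) // eight loads from a
		copy(y[:], b[i:i+8]) // eight loads from b
		if x != y {
			// Locate the first differing byte only on the rare
			// mismatch path, as the assembly does via cmp1a..cmp1h.
			for j := range x {
				if x[j] != y[j] {
					return cmpByte(x[j], y[j])
				}
			}
		}
	}
	for ; i < n; i++ { // remaining tail, one byte at a time
		if a[i] != b[i] {
			return cmpByte(a[i], b[i])
		}
	}
	// Equal up to the shorter length: the shorter input sorts first.
	switch {
	case len(a) < len(b):
		return -1
	case len(b) < len(a):
		return 1
	}
	return 0
}

func main() {
	fmt.Println(compare8Unaligned([]byte("abcdefgh1"), []byte("abcdefgh2"))) // -1
}

Keeping the common all-equal path to one array comparison per batch is
the same design choice as the one-BNE-per-pair ordering in the assembly.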
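Second, the differing-word scan: cmp8a and cmp8b now funnel into a
single cmp8_loop, which slides a one-byte mask (0xff, 0xff00, ...) up
two 64-bit words already known to be unequal. Because MOV on riscv64
loads little-endian, the least significant differing lane is the first
differing byte in memory order, so comparing the masked values gives
the bytes.Compare result. A hedged Go sketch; firstDiffCompare is an
invented name, and the caller must guarantee x != y or the loop will
not terminate, just as cmp8_loop assumes:

package main

import "fmt"

// firstDiffCompare orders two little-endian-loaded words known to
// differ, by scanning byte lanes from the least significant end with a
// sliding 0xff mask, as cmp8_loop does.
func firstDiffCompare(x, y uint64) int {
	mask := uint64(0xff)
	for {
		a, b := x&mask, y&mask
		if a != b {
			// Both values are the same lane shifted identically,
			// so an integer comparison orders the bytes.
			if a < b {
				return -1
			}
			return 1
		}
		mask <<= 8
	}
}

func main() {
	// The words differ in the third byte of memory (little-endian).
	fmt.Println(firstDiffCompare(0x0000000000010000, 0x0000000000020000)) // -1
}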