Skip to content

Commit

Permalink
internal/bytealg: simplify and improve compare on riscv64
Browse files Browse the repository at this point in the history
Remove some unnecessary loops and pull the comparison code out from the
compare/loop code. Add an unaligned 8 byte comparison, which reads 8 bytes
from each input before comparing them. This gives a reasonable gain in
performance for the large unaligned case.

Updates #50615

name                                 old time/op    new time/op    delta
CompareBytesEqual-4                     116ns ± 0%     111ns ± 0%   -4.10%  (p=0.000 n=5+5)
CompareBytesToNil-4                    34.9ns ± 0%    35.0ns ± 0%   +0.45%  (p=0.002 n=5+5)
CompareBytesEmpty-4                    29.6ns ± 1%    29.8ns ± 0%   +0.71%  (p=0.016 n=5+5)
CompareBytesIdentical-4                29.8ns ± 0%    29.9ns ± 1%   +0.50%  (p=0.036 n=5+5)
CompareBytesSameLength-4               66.1ns ± 0%    60.4ns ± 0%   -8.59%  (p=0.000 n=5+5)
CompareBytesDifferentLength-4          63.1ns ± 0%    60.5ns ± 0%   -4.20%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=1-4    6.84ms ± 3%    6.04ms ± 5%  -11.70%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=2-4    6.99ms ± 4%    5.93ms ± 6%  -15.22%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=3-4    6.74ms ± 1%    6.00ms ± 5%  -10.94%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=4-4    7.20ms ± 6%    5.97ms ± 6%  -17.05%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=5-4    6.75ms ± 1%    5.81ms ± 8%  -13.93%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=6-4    6.89ms ± 5%    5.75ms ± 2%  -16.58%  (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4    6.91ms ± 6%    6.13ms ± 6%  -11.27%  (p=0.001 n=5+5)
CompareBytesBig-4                      2.75ms ± 5%    2.71ms ± 8%     ~     (p=0.651 n=5+5)
CompareBytesBigIdentical-4             29.9ns ± 1%    29.8ns ± 0%     ~     (p=0.751 n=5+5)

name                                 old speed      new speed      delta
CompareBytesBigUnaligned/offset=1-4   153MB/s ± 3%   174MB/s ± 6%  +13.40%  (p=0.003 n=5+5)
CompareBytesBigUnaligned/offset=2-4   150MB/s ± 4%   177MB/s ± 6%  +18.06%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=3-4   156MB/s ± 1%   175MB/s ± 5%  +12.39%  (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=4-4   146MB/s ± 6%   176MB/s ± 6%  +20.67%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=5-4   155MB/s ± 1%   181MB/s ± 7%  +16.35%  (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=6-4   152MB/s ± 5%   182MB/s ± 2%  +19.74%  (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4   152MB/s ± 6%   171MB/s ± 6%  +12.70%  (p=0.001 n=5+5)
CompareBytesBig-4                     382MB/s ± 5%   388MB/s ± 9%     ~     (p=0.616 n=5+5)
CompareBytesBigIdentical-4           35.1TB/s ± 1%  35.1TB/s ± 0%     ~     (p=0.800 n=5+5)

Change-Id: I127edc376e62a2c529719a4ab172f481e0a81357
Reviewed-on: https://go-review.googlesource.com/c/go/+/431100
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Meng Zhuo <mzh@golangcn.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joedian Reid <joedian@golang.org>
Run-TryBot: Joel Sing <joel@sing.id.au>
  • Loading branch information
4a6f656c committed Feb 11, 2023
1 parent e03ee85 commit 261fe25
Showing 1 changed file with 103 additions and 70 deletions.
173 changes: 103 additions & 70 deletions src/internal/bytealg/compare_riscv64.s
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ use_a_len:
BEQZ X5, cmp_len

MOV $32, X6
BLT X5, X6, loop4_check
BLT X5, X6, check8_unaligned

// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X7
AND $7, X12, X8
BNE X7, X8, loop4_check
BEQZ X7, loop32_check
BNE X7, X8, check8_unaligned
BEQZ X7, compare32

// Check one byte at a time until we reach 8 byte alignment.
SUB X7, X5, X5
Expand All @@ -59,122 +59,155 @@ align:
ADD $1, X12
BNEZ X7, align

loop32_check:
MOV $32, X7
BLT X5, X7, loop16_check
loop32:
check32:
MOV $32, X6
BLT X5, X6, compare16
compare32:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
BEQ X15, X16, loop32a
JMP cmp8a
loop32a:
BEQ X17, X18, loop32b
JMP cmp8b
loop32b:
BNE X15, X16, cmp8a
BNE X17, X18, cmp8b
MOV 16(X10), X15
MOV 16(X12), X16
MOV 24(X10), X17
MOV 24(X12), X18
BEQ X15, X16, loop32c
JMP cmp8a
loop32c:
BEQ X17, X18, loop32d
JMP cmp8b
loop32d:
BNE X15, X16, cmp8a
BNE X17, X18, cmp8b
ADD $32, X10
ADD $32, X12
ADD $-32, X5
BGE X5, X7, loop32
BGE X5, X6, compare32
BEQZ X5, cmp_len

loop16_check:
check16:
MOV $16, X6
BLT X5, X6, loop4_check
loop16:
BLT X5, X6, check8_unaligned
compare16:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
BEQ X15, X16, loop16a
JMP cmp8a
loop16a:
BEQ X17, X18, loop16b
JMP cmp8b
loop16b:
BNE X15, X16, cmp8a
BNE X17, X18, cmp8b
ADD $16, X10
ADD $16, X12
ADD $-16, X5
BGE X5, X6, loop16
BEQZ X5, cmp_len

loop4_check:
MOV $4, X6
BLT X5, X6, loop1
loop4:
check8_unaligned:
MOV $8, X6
BLT X5, X6, check4_unaligned
compare8_unaligned:
MOVBU 0(X10), X8
MOVBU 1(X10), X15
MOVBU 2(X10), X17
MOVBU 3(X10), X19
MOVBU 4(X10), X21
MOVBU 5(X10), X23
MOVBU 6(X10), X25
MOVBU 7(X10), X29
MOVBU 0(X12), X9
MOVBU 1(X12), X16
MOVBU 2(X12), X18
MOVBU 3(X12), X20
MOVBU 4(X12), X22
MOVBU 5(X12), X24
MOVBU 6(X12), X28
MOVBU 7(X12), X30
BNE X8, X9, cmp1a
BNE X15, X16, cmp1b
BNE X17, X18, cmp1c
BNE X19, X20, cmp1d
BNE X21, X22, cmp1e
BNE X23, X24, cmp1f
BNE X25, X28, cmp1g
BNE X29, X30, cmp1h
ADD $8, X10
ADD $8, X12
ADD $-8, X5
BGE X5, X6, compare8_unaligned
BEQZ X5, cmp_len

check4_unaligned:
MOV $4, X6
BLT X5, X6, compare1
compare4_unaligned:
MOVBU 0(X10), X8
MOVBU 1(X10), X15
MOVBU 2(X10), X17
MOVBU 3(X10), X19
MOVBU 0(X12), X9
MOVBU 1(X12), X16
BEQ X8, X9, loop4a
SLTU X9, X8, X5
SLTU X8, X9, X6
JMP cmp_ret
loop4a:
BEQ X15, X16, loop4b
SLTU X16, X15, X5
SLTU X15, X16, X6
JMP cmp_ret
loop4b:
MOVBU 2(X10), X21
MOVBU 2(X12), X22
MOVBU 3(X10), X23
MOVBU 3(X12), X24
BEQ X21, X22, loop4c
SLTU X22, X21, X5
SLTU X21, X22, X6
JMP cmp_ret
loop4c:
BEQ X23, X24, loop4d
SLTU X24, X23, X5
SLTU X23, X24, X6
JMP cmp_ret
loop4d:
MOVBU 2(X12), X18
MOVBU 3(X12), X20
BNE X8, X9, cmp1a
BNE X15, X16, cmp1b
BNE X17, X18, cmp1c
BNE X19, X20, cmp1d
ADD $4, X10
ADD $4, X12
ADD $-4, X5
BGE X5, X6, loop4
BGE X5, X6, compare4_unaligned

loop1:
compare1:
BEQZ X5, cmp_len
MOVBU 0(X10), X8
MOVBU 0(X12), X9
BNE X8, X9, cmp
ADD $1, X10
ADD $1, X12
ADD $-1, X5
JMP loop1
JMP compare1

// Compare 8 bytes of memory in X15/X16 that are known to differ.
cmp8a:
MOV $0xff, X19
cmp8a_loop:
AND X15, X19, X8
AND X16, X19, X9
BNE X8, X9, cmp
SLLI $8, X19
JMP cmp8a_loop
MOV X15, X17
MOV X16, X18

// Compare 8 bytes of memory in X17/X18 that are known to differ.
cmp8b:
MOV $0xff, X19
cmp8b_loop:
cmp8_loop:
AND X17, X19, X8
AND X18, X19, X9
BNE X8, X9, cmp
SLLI $8, X19
JMP cmp8b_loop
JMP cmp8_loop

cmp1a:
SLTU X9, X8, X5
SLTU X8, X9, X6
JMP cmp_ret
cmp1b:
SLTU X16, X15, X5
SLTU X15, X16, X6
JMP cmp_ret
cmp1c:
SLTU X18, X17, X5
SLTU X17, X18, X6
JMP cmp_ret
cmp1d:
SLTU X20, X19, X5
SLTU X19, X20, X6
JMP cmp_ret
cmp1e:
SLTU X22, X21, X5
SLTU X21, X22, X6
JMP cmp_ret
cmp1f:
SLTU X24, X23, X5
SLTU X23, X24, X6
JMP cmp_ret
cmp1g:
SLTU X28, X25, X5
SLTU X25, X28, X6
JMP cmp_ret
cmp1h:
SLTU X30, X29, X5
SLTU X29, X30, X6
JMP cmp_ret

cmp_len:
MOV X11, X8
Expand Down

0 comments on commit 261fe25

Please sign in to comment.