Skip to content

Commit

Permalink
internal/lz4block: arm64 decoder improvements
Browse files Browse the repository at this point in the history
Use the fast loop after the dictionary copy. Checking whether the fast
loop can be used costs as many instructions as jumping over it.

Move SUBS close to conditional branches for CPUs that fuse these
instructions.

Shave one instruction off the remainder-handling code after the
doubleword copy loop (copyMatchLoop8). A load from a register base plus
a register offset has the same latency and throughput as a load from a
register base plus a constant offset, at least on Cortex-A72.
  • Loading branch information
greatroar committed Feb 16, 2022
1 parent f9f4ce2 commit 24303cf
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 10 deletions.
17 changes: 7 additions & 10 deletions internal/lz4block/decode_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,9 @@ copyDict:
CBZ len, copyMatchDone

// If the match extends beyond the dictionary, the rest is at dstorig.
// Recompute the offset for the next check.
MOVD dstorig, match

// The code up to copyMatchLoop1 assumes len >= minMatch.
CMP $const_minMatch, len
BLO copyMatchLoop1
SUB dstorig, dst, offset

copyMatchTry8:
// Copy doublewords if both len and offset are at least eight.
Expand All @@ -190,23 +188,22 @@ copyMatchTry8:
AND $7, len, lenRem
SUB $8, len
copyMatchLoop8:
SUBS $8, len
MOVD.P 8(match), tmp1
MOVD.P tmp1, 8(dst)
SUBS $8, len
BPL copyMatchLoop8

ADD lenRem, match
MOVD (match)(len), tmp2 // match+len == match+lenRem-8.
ADD lenRem, dst
MOVD -8(match), tmp2
MOVD tmp2, -8(dst)
B copyMatchDone

copyMatchLoop1:
// Finish with a byte-at-a-time copy.
SUB $1, len
// Byte-at-a-time copy for small offsets.
MOVBU.P 1(match), tmp2
MOVB.P tmp2, 1(dst)
CBNZ len, copyMatchLoop1
SUBS $1, len
BNE copyMatchLoop1

copyMatchDone:
CMP src, srcend
Expand Down
1 change: 1 addition & 0 deletions internal/lz4block/decode_asm.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//go:build (amd64 || arm || arm64) && !appengine && gc && !noasm
// +build amd64 arm arm64
// +build !appengine
// +build gc
Expand Down

0 comments on commit 24303cf

Please sign in to comment.