Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zstd: Improve zstd best efficiency #784

Merged
merged 4 commits into from
Mar 23, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 70 additions & 99 deletions zstd/enc_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
)

const (
bestLongTableBits = 22 // Bits used in the long match table
bestLongTableBits = 23 // Bits used in the long match table
bestLongTableSize = 1 << bestLongTableBits // Size of the table
bestLongLen = 8 // Bytes used for table hash

Expand Down Expand Up @@ -188,7 +188,7 @@ encodeLoop:
panic("offset0 was 0")
}

const goodEnough = 100
const goodEnough = 250

nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
Expand All @@ -205,7 +205,37 @@ encodeLoop:
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
}
}
cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
// Try to quick reject if we already have a long match.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this help? I had tried checking for overlap between the best match and the candidate before the first load3232, but found that it slowed compression down. From memory, what I did was:

overlaps := m.rep > 0 && offset >= m.offset && offset < m.offset+m.length
if s-offset >= e.maxMatchOff || overlaps || load3232(src, offset) != first {
    return
}

Copy link
Owner Author

@klauspost klauspost Mar 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cheap version of your check is to test whether the offset is the same. I had that at some point, but it got very few hits. I also tried keeping all previously tested offsets, but that was horrible.

This piece of code does something else. It is a bit easier to explain with an example.

Say we already have a match of 50 bytes.
When we check whether a new match is better, we start by checking whether bytes 42 through 45 of the candidate (the 4 bytes starting 8 bytes before the end of the current match length) match. If they don't, the candidate can be at most 45 bytes long, which will always be worse than the 50-byte match, so we can reject it without further tests.

In most cases we can reject the candidate on this check alone. Some implementations run the entire match-length comparison in reverse first and forward afterwards, but that is too tedious for me.

We could check 8 bytes - but I don't think that will make much difference.

if m.length > 16 {
left := len(src) - int(m.s+m.length)
// If we are too close to the end, keep as is.
if left <= 0 {
return
}
if left > 2 {
// Check 4 bytes, 4 bytes from the end of the current match.
a := load3232(src, offset+m.length-8)
b := load3232(src, s+m.length-8)
if a != b {
return
}
}
}
l := 4 + e.matchlen(s+4, offset+4, src)
if rep < 0 {
// Extend candidate match backwards as far as possible.
tMin := s - e.maxMatchOff
if tMin < 0 {
tMin = 0
}
for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxMatchLength {
s--
offset--
l++
}
}

cand := match{offset: offset, s: s, length: l, rep: rep}
cand.estBits(bitsPerByte)
if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
Expand All @@ -219,17 +249,29 @@ encodeLoop:
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)

if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
if s == nextEmit {
// Check repeats straight after a match.
improve(&best, s-offset2, s, uint32(cv), 1|4)
improve(&best, s-offset3, s, uint32(cv), 2|4)
if offset1 > 1 {
improve(&best, s-(offset1-1), s, uint32(cv), 3|4)
}
}

// If either no match or a non-repeat match, check at + 1
if best.rep <= 0 {
cv32 := uint32(cv >> 8)
spp := s + 1
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.rep < 0 {
cv32 = uint32(cv >> 24)
spp += 2
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
}
}
}
// Load next and check...
Expand All @@ -248,7 +290,7 @@ encodeLoop:
continue
}

s++
s := s + 1
candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
cv = load6432(src, s)
cv2 := load6432(src, s+1)
Expand Down Expand Up @@ -292,38 +334,22 @@ encodeLoop:

// We have a match, we can store the forward value
if best.rep > 0 {
s = best.s
var seq seq
seq.matchLen = uint32(best.length - zstdMinMatch)

// We might be able to match backwards.
// Extend as long as we can.
start := best.s
// We end the search early, so we don't risk 0 literals
// and have to do special offset treatment.
startLimit := nextEmit + 1

tMin := s - e.maxMatchOff
if tMin < 0 {
tMin = 0
}
repIndex := best.offset
for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
repIndex--
start--
seq.matchLen++
if debugAsserts && s <= nextEmit {
panic("s <= nextEmit")
}
addLiterals(&seq, start)
addLiterals(&seq, best.s)

// rep 0
seq.offset = uint32(best.rep)
// Repeat. If the flag value 4 (rep&4) is set, this repeat was found straight after a match, i.e. with no literals before it.
seq.offset = uint32(best.rep & 3)
if debugSequences {
println("repeat sequence", seq, "next s:", s)
}
blk.sequences = append(blk.sequences, seq)

// Index match start+1 (long) -> s - 1
index0 := s
// Index old s + 1 -> s - 1
index0 := s + 1
s = best.s + best.length

nextEmit = s
Expand All @@ -336,7 +362,7 @@ encodeLoop:
}
// Index skipped...
off := index0 + e.cur
for index0 < s-1 {
for index0 < s {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
Expand All @@ -346,17 +372,20 @@ encodeLoop:
index0++
}
switch best.rep {
case 2:
case 2, 4 | 1:
offset1, offset2 = offset2, offset1
case 3:
case 3, 4 | 2:
offset1, offset2, offset3 = offset3, offset1, offset2
case 4 | 3:
offset1, offset2, offset3 = offset1-1, offset1, offset2
}
cv = load6432(src, s)
continue
}

// A 4-byte match has been found. Update recent offsets.
// We'll later see if more than 4 bytes.
index0 := s + 1
s = best.s
t := best.offset
offset1, offset2, offset3 = s-t, offset1, offset2
Expand All @@ -369,22 +398,9 @@ encodeLoop:
panic("invalid offset")
}

// Extend the n-byte match as long as possible.
l := best.length

// Extend backwards
tMin := s - e.maxMatchOff
if tMin < 0 {
tMin = 0
}
for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
s--
t--
l++
}

// Write our sequence
var seq seq
l := best.length
seq.litLen = uint32(s - nextEmit)
seq.matchLen = uint32(l - zstdMinMatch)
if seq.litLen > 0 {
Expand All @@ -401,10 +417,8 @@ encodeLoop:
break encodeLoop
}

// Index match start+1 (long) -> s - 1
index0 := s - l + 1
// every entry
for index0 < s-1 {
// Index old s + 1 -> s - 1
for index0 < s {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
Expand All @@ -413,50 +427,7 @@ encodeLoop:
e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
index0++
}

cv = load6432(src, s)
if !canRepeat {
continue
}

// Check offset 2
for {
o2 := s - offset2
if load3232(src, o2) != uint32(cv) {
// Do regular search
break
}

// Store this, since we have it.
nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)

// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
l := 4 + e.matchlen(s+4, o2+4, src)

e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: e.table[nextHashS].offset}
seq.matchLen = uint32(l) - zstdMinMatch
seq.litLen = 0

// Since litlen is always 0, this is offset 1.
seq.offset = 1
s += l
nextEmit = s
if debugSequences {
println("sequence", seq, "next s:", s)
}
blk.sequences = append(blk.sequences, seq)

// Swap offset 1 and 2.
offset1, offset2 = offset2, offset1
if s >= sLimit {
// Finished
break encodeLoop
}
cv = load6432(src, s)
}
}

if int(nextEmit) < len(src) {
Expand Down