Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

s2: Search at match end in best mode #358

Merged
merged 1 commit into from
Apr 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions s2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -543,27 +543,27 @@ Some examples compared on 16 core CPU, amd64 assembly used:
* enwik10
Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
Best... 10000000000 -> 3649340179 [36.49%]; 40.05s, 238.1MB/s
Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s

* github-june-2days-2019.json
Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
Best... 6273951764 -> 845168908 [13.47%]; 8.878s, 673.9MB/s
Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s

* nyc-taxi-data-10M.csv
Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
Best... 3325605752 -> 786648492 [23.65%]; 7.628s, 415.8MB/s
Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s

* 10gb.tar
Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
Best... 10065157632 -> 5215462149 [51.82%]; 29.977s, 320.2MB/s
Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s

* consensus.db.10gb
Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
Best... 10737418240 -> 4280128613 [39.86%]; 41.758s, 245.2MB/s
Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
```

Decompression speed should be around the same as using the 'better' compression mode.
Expand Down
65 changes: 41 additions & 24 deletions s2/encode_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
offset int
s int
length int
score int
rep bool
}
var best match
Expand All @@ -82,6 +83,20 @@ func encodeBlockBest(dst, src []byte) (d int) {
candidateL := lTable[hashL]
candidateS := sTable[hashS]

score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
}
return score - emitCopySize(offset, m.length)
}

matchAt := func(offset, s int, first uint32, rep bool) match {
if best.length != 0 && best.s-best.offset == s-offset {
// Don't retest if we have the same offset.
Expand All @@ -101,41 +116,26 @@ func encodeBlockBest(dst, src []byte) (d int) {
m.length += 8
}
m.length -= offset
return m
}
score := func(m match, otherS int) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - (m.s - otherS)
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
m.score = score(m)
if m.score <= -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return score - emitCopySize(offset, m.length)
return m
}

bestOf := func(a, b match) match {
if b.length == 0 {
return a
}
if a.length == 0 {
return b
}
as := score(a, b.s)
bs := score(b, a.s)
as := a.score + b.s
bs := b.score + a.s
if as >= bs {
if as <= 0 {
// Eliminate if no savings, we might find a better one.
a.length = 0
}
return a
}
if bs <= 0 {
// Eliminate if no savings, we might find a better one.
b.length = 0
}
return b
}

Expand All @@ -159,7 +159,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))

// s+2
if best.length < 100 {
if true {
nextShort = sTable[hash4(cv>>8, sTableBits)]
s++
cv = load64(src, s)
Expand All @@ -169,6 +169,23 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
}
// Search for a match at best match end, see if that is better.
if sAt := best.s + best.length; sAt < sLimit {
sBack := best.s
backL := best.length
// Load initial values
cv = load64(src, sBack)
// Search for mismatch
next := lTable[hash8(load64(src, sAt), lTableBits)]
//next := sTable[hash4(load64(src, sAt), sTableBits)]

if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
}
}
}

Expand Down