Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions s2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -543,27 +543,27 @@ Some examples compared on 16 core CPU, amd64 assembly used:
* enwik10
Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
Best... 10000000000 -> 3649340179 [36.49%]; 40.05s, 238.1MB/s
Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s

* github-june-2days-2019.json
Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
Best... 6273951764 -> 845168908 [13.47%]; 8.878s, 673.9MB/s
Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s

* nyc-taxi-data-10M.csv
Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
Best... 3325605752 -> 786648492 [23.65%]; 7.628s, 415.8MB/s
Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s

* 10gb.tar
Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
Best... 10065157632 -> 5215462149 [51.82%]; 29.977s, 320.2MB/s
Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s

* consensus.db.10gb
Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
Best... 10737418240 -> 4280128613 [39.86%]; 41.758s, 245.2MB/s
Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
```

Decompression speed should be around the same as using the 'better' compression mode.
Expand Down
65 changes: 41 additions & 24 deletions s2/encode_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
offset int
s int
length int
score int
rep bool
}
var best match
Expand All @@ -82,6 +83,20 @@ func encodeBlockBest(dst, src []byte) (d int) {
candidateL := lTable[hashL]
candidateS := sTable[hashS]

score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
}
return score - emitCopySize(offset, m.length)
}

matchAt := func(offset, s int, first uint32, rep bool) match {
if best.length != 0 && best.s-best.offset == s-offset {
// Don't retest if we have the same offset.
Expand All @@ -101,41 +116,26 @@ func encodeBlockBest(dst, src []byte) (d int) {
m.length += 8
}
m.length -= offset
return m
}
score := func(m match, otherS int) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - (m.s - otherS)
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
m.score = score(m)
if m.score <= -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return score - emitCopySize(offset, m.length)
return m
}

bestOf := func(a, b match) match {
if b.length == 0 {
return a
}
if a.length == 0 {
return b
}
as := score(a, b.s)
bs := score(b, a.s)
as := a.score + b.s
bs := b.score + a.s
if as >= bs {
if as <= 0 {
// Eliminate if no savings, we might find a better one.
a.length = 0
}
return a
}
if bs <= 0 {
// Eliminate if no savings, we might find a better one.
b.length = 0
}
return b
}

Expand All @@ -159,7 +159,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))

// s+2
if best.length < 100 {
if true {
nextShort = sTable[hash4(cv>>8, sTableBits)]
s++
cv = load64(src, s)
Expand All @@ -169,6 +169,23 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
}
// Search for a match at best match end, see if that is better.
if sAt := best.s + best.length; sAt < sLimit {
sBack := best.s
backL := best.length
// Load initial values
cv = load64(src, sBack)
// Search for mismatch
next := lTable[hash8(load64(src, sAt), lTableBits)]
//next := sTable[hash4(load64(src, sAt), sTableBits)]

if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
}
}
}

Expand Down