zstd: Improve best encoder by extending backwards
The SpeedBestCompression encoder now extends matches backwards before
estimating their encoded size, rather than doing this after selecting
the best match. This is a bit slower, but produces smaller output.
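
To illustrate the idea in isolation: backward extension grows a candidate match while the bytes just before it also match, so the cost estimate that follows already reflects the longer match and the correspondingly shorter literal run. The sketch below is a simplified stand-alone version, not the encoder's code; the function name and the omission of the nextEmit, maxMatchOff and maxMatchLength limits are assumptions made for brevity.

package main

import "fmt"

// extendBackwards grows the match between src[offset:] and src[s:] backwards
// while the preceding bytes are equal. Simplified sketch: the real encoder
// also stops at nextEmit, respects maxMatchOff and caps the length at
// maxMatchLength.
func extendBackwards(src []byte, offset, s, length int) (int, int, int) {
	for offset > 0 && s > 0 && src[offset-1] == src[s-1] {
		offset--
		s--
		length++
	}
	return offset, s, length
}

func main() {
	src := []byte("abcXYZ...abcXYZ")
	// A 4-byte match "cXYZ" at s=11 against offset=2 extends back over "ab".
	fmt.Println(extendBackwards(src, 2, 11, 4)) // prints: 0 9 6
}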

Benchmarks on amd64:

name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    20.7MB/s ± 3%  19.0MB/s ± 1%  -8.04%  (p=0.000 n=19+18)
Encoder_EncodeAllSimple4K/best-8  19.2MB/s ± 6%  17.9MB/s ± 1%  -6.86%  (p=0.000 n=20+20)
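
The benchmark names and the ±/p-value columns suggest go test benchmarks compared with benchstat. As a rough sketch of how EncodeAll throughput at this level can be measured (the input path and the benchmark body are assumptions, not the repository's own Encoder_EncodeAllSimple benchmarks):

package zstd_test

import (
	"os"
	"testing"

	"github.com/klauspost/compress/zstd"
)

// BenchmarkEncodeAllBest measures EncodeAll throughput with the
// SpeedBestCompression level, roughly mirroring the benchmarks above.
func BenchmarkEncodeAllBest(b *testing.B) {
	src, err := os.ReadFile("testdata/sample.bin") // hypothetical input file
	if err != nil {
		b.Skip(err)
	}
	enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBestCompression))
	if err != nil {
		b.Fatal(err)
	}
	defer enc.Close()
	b.SetBytes(int64(len(src)))
	b.ReportAllocs()
	dst := make([]byte, 0, len(src))
	for i := 0; i < b.N; i++ {
		dst = enc.EncodeAll(src, dst[:0])
	}
}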

Output sizes on Silesia and enwik9:

dickens    3220994    3179697 (× 0.987179)
enwik9   259846164  257481474 (× 0.990900)
mozilla   16912437   16895142 (× 0.998977)
mr         3502823    3473770 (× 0.991706)
nci        2306320    2300580 (× 0.997511)
ooffice    2896907    2888715 (× 0.997172)
osdb       3390548    3368411 (× 0.993471)
reymont    1657380    1639490 (× 0.989206)
samba      4329898    4315020 (× 0.996564)
sao        5416648    5383855 (× 0.993946)
webster    9972808    9887560 (× 0.991452)
xml         542277     541018 (× 0.997678)
x-ray      5733121    5681186 (× 0.990941)
total    319728325  317035918 (× 0.991579)

Wall-clock time for compressing enwik9 goes up a bit, but is still close
to what it was before klauspost#776.
greatroar committed Mar 19, 2023
1 parent 7633d62 commit 194a8db
53 changes: 20 additions & 33 deletions zstd/enc_best.go
@@ -205,7 +205,22 @@ encodeLoop:
 					panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
 				}
 			}
-			cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
+
+			l := 4 + e.matchlen(s+4, offset+4, src)
+			if rep < 0 {
+				// Extend candidate match backwards as far as possible.
+				tMin := s - e.maxMatchOff
+				if tMin < 0 {
+					tMin = 0
+				}
+				for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxMatchLength {
+					s--
+					offset--
+					l++
+				}
+			}
+
+			cand := match{offset: offset, s: s, length: l, rep: rep}
 			cand.estBits(bitsPerByte)
 			if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
 				*m = cand
@@ -295,25 +310,10 @@ encodeLoop:
 			s = best.s
 			var seq seq
 			seq.matchLen = uint32(best.length - zstdMinMatch)
-
-			// We might be able to match backwards.
-			// Extend as long as we can.
-			start := best.s
-			// We end the search early, so we don't risk 0 literals
-			// and have to do special offset treatment.
-			startLimit := nextEmit + 1
-
-			tMin := s - e.maxMatchOff
-			if tMin < 0 {
-				tMin = 0
+			if debugAsserts && s <= nextEmit {
+				panic("s <= nextEmit")
 			}
-			repIndex := best.offset
-			for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-				repIndex--
-				start--
-				seq.matchLen++
-			}
-			addLiterals(&seq, start)
+			addLiterals(&seq, s)
 
 			// rep 0
 			seq.offset = uint32(best.rep)
@@ -369,22 +369,9 @@ encodeLoop:
 			panic("invalid offset")
 		}
 
-		// Extend the n-byte match as long as possible.
-		l := best.length
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
 		// Write our sequence
 		var seq seq
+		l := best.length
 		seq.litLen = uint32(s - nextEmit)
 		seq.matchLen = uint32(l - zstdMinMatch)
 		if seq.litLen > 0 {
