fse: Optimize compression (#745)

* fse: Optimize table building

  Skipping the loop body when v == 0 helps endzerobits and normcount2.
  Not writing to s.symbolLen in every iteration helps the other benchmarks.

  name                      old speed      new speed      delta
  Compress/gettysburg-8     181MB/s ± 1%   183MB/s ± 0%    +1.15%  (p=0.002 n=10+8)
  Compress/digits-8         241MB/s ± 0%   241MB/s ± 1%      ~     (p=0.434 n=9+10)
  Compress/twain-8          218MB/s ± 0%   218MB/s ± 0%      ~     (p=0.755 n=10+10)
  Compress/low-ent-8        239MB/s ± 0%   239MB/s ± 1%      ~     (p=0.853 n=10+10)
  Compress/superlow-ent-8   208MB/s ± 1%   208MB/s ± 0%      ~     (p=0.408 n=9+7)
  Compress/endzerobits-8    11.5MB/s ± 1%  13.3MB/s ± 1%  +16.35%  (p=0.000 n=10+9)
  Compress/pngdata.001-8    224MB/s ± 0%   224MB/s ± 1%    +0.38%  (p=0.004 n=8+10)
  Compress/normcount2-8     35.7MB/s ± 1%  36.6MB/s ± 1%   +2.66%  (p=0.000 n=10+9)

* fse: Skip bounds checks

  Each occurrence of

      v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]

  now incurs three bounds checks instead of four. I haven't found a way to
  eliminate the remaining three.

  name                      old speed      new speed      delta
  Compress/gettysburg-8     183MB/s ± 0%   189MB/s ± 0%    +3.32%  (p=0.000 n=8+9)
  Compress/digits-8         241MB/s ± 1%   251MB/s ± 1%    +4.14%  (p=0.000 n=10+9)
  Compress/twain-8          218MB/s ± 0%   228MB/s ± 0%    +4.36%  (p=0.000 n=10+10)
  Compress/low-ent-8        239MB/s ± 1%   244MB/s ± 1%    +1.90%  (p=0.000 n=10+10)
  Compress/superlow-ent-8   208MB/s ± 0%   210MB/s ± 0%    +0.89%  (p=0.000 n=7+8)
  Compress/endzerobits-8    13.3MB/s ± 1%  13.4MB/s ± 1%   +0.40%  (p=0.019 n=9+10)
  Compress/pngdata.001-8    224MB/s ± 1%   225MB/s ± 1%    +0.41%  (p=0.006 n=10+9)
  Compress/normcount2-8     36.6MB/s ± 1%  36.4MB/s ± 1%   -0.62%  (p=0.012 n=9+10)
klauspost · Jan 21, 2023 · e766bf7 · e766bf7
1 parent 5f40643
commit e766bf7
Showing 1 changed file with 15 additions and 16 deletions.
diff --git a/fse/compress.go b/fse/compress.go
@@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error {
 		c1.encodeZero(tt[src[ip-2]])
 		ip -= 2
 	}
+	src = src[:ip]
 
 	// Main compression loop.
 	switch {
 	case !s.zeroBits && s.actualTableLog <= 8:
 		// We can encode 4 symbols without requiring a flush.
 		// We do not need to check if any output is 0 bits.
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encode(tt[v0])
 			c1.encode(tt[v1])
 			c2.encode(tt[v2])
 			c1.encode(tt[v3])
-			ip -= 4
 		}
 	case !s.zeroBits:
 		// We do not need to check if any output is 0 bits.
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encode(tt[v0])
 			c1.encode(tt[v1])
 			s.bw.flush32()
 			c2.encode(tt[v2])
 			c1.encode(tt[v3])
-			ip -= 4
 		}
 	case s.actualTableLog <= 8:
 		// We can encode 4 symbols without requiring a flush
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encodeZero(tt[v0])
 			c1.encodeZero(tt[v1])
 			c2.encodeZero(tt[v2])
 			c1.encodeZero(tt[v3])
-			ip -= 4
 		}
 	default:
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encodeZero(tt[v0])
 			c1.encodeZero(tt[v1])
 			s.bw.flush32()
 			c2.encodeZero(tt[v2])
 			c1.encodeZero(tt[v3])
-			ip -= 4
 		}
 	}
 
@@ -459,15 +456,17 @@ func (s *Scratch) countSimple(in []byte) (max int) {
 	for _, v := range in {
 		s.count[v]++
 	}
-	m := uint32(0)
+	m, symlen := uint32(0), s.symbolLen
 	for i, v := range s.count[:] {
+		if v == 0 {
+			continue
+		}
 		if v > m {
 			m = v
 		}
-		if v > 0 {
-			s.symbolLen = uint16(i) + 1
-		}
+		symlen = uint16(i) + 1
 	}
+	s.symbolLen = symlen
 	return int(m)
 }