From f2a79e9072ea31dc66224ebed86749291eb26fdf Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sun, 19 Nov 2023 15:38:08 +0100
Subject: [PATCH] huff0: Speed up symbol counting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The compiler inserts a nil check instruction into the first loop in
countSimple. Lift that out of the loop for some extra throughput:

goos: linux
goarch: amd64
pkg: github.com/klauspost/compress/huff0
cpu: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
                                           │  huff0/old   │           huff0/nilcheck           │
                                           │     B/s      │     B/s       vs base              │
Compress1XReuseNone/digits-8               438.9Mi ± 0%   467.7Mi ± 1%   +6.55% (p=0.002 n=6)
Compress1XReuseNone/gettysburg-8           247.3Mi ± 2%   257.4Mi ± 1%   +4.08% (p=0.002 n=6)
Compress1XReuseNone/twain-8                349.2Mi ± 1%   367.9Mi ± 1%   +5.35% (p=0.002 n=6)
Compress1XReuseNone/low-ent.10k-8          474.3Mi ± 1%   488.4Mi ± 0%   +2.98% (p=0.002 n=6)
Compress1XReuseNone/superlow-ent-10k-8     303.4Mi ± 1%   341.6Mi ± 0%  +12.57% (p=0.002 n=6)
Compress1XReuseNone/crash2-8               10.89Mi ± 3%   10.90Mi ± 1%        ~ (p=0.794 n=6)
Compress1XReuseNone/endzerobits-8          15.40Mi ± 5%   15.54Mi ± 0%   +0.87% (p=0.006 n=6)
Compress1XReuseNone/endnonzero-8           7.548Mi ± 2%   7.539Mi ± 5%        ~ (p=0.855 n=6)
Compress1XReuseNone/case1-8                14.29Mi ± 1%   14.39Mi ± 1%        ~ (p=0.082 n=6)
Compress1XReuseNone/case2-8                11.89Mi ± 1%   11.92Mi ± 0%        ~ (p=0.121 n=6)
Compress1XReuseNone/case3-8                12.96Mi ± 0%   12.92Mi ± 0%        ~ (p=0.201 n=6)
Compress1XReuseNone/pngdata.001-8          289.2Mi ± 1%   318.2Mi ± 0%  +10.03% (p=0.002 n=6)
Compress1XReuseNone/normcount2-8           34.46Mi ± 1%   34.30Mi ± 1%        ~ (p=0.485 n=6)
Compress1XReuseAllow/digits-8              458.0Mi ± 1%   490.6Mi ± 1%   +7.11% (p=0.002 n=6)
Compress1XReuseAllow/gettysburg-8          279.5Mi ± 3%   293.1Mi ± 1%   +4.85% (p=0.002 n=6)
Compress1XReuseAllow/twain-8               348.2Mi ± 1%   367.2Mi ± 0%   +5.45% (p=0.002 n=6)
Compress1XReuseAllow/low-ent.10k-8         478.0Mi ± 1%   490.4Mi ± 1%   +2.58% (p=0.002 n=6)
Compress1XReuseAllow/superlow-ent-10k-8    307.2Mi ± 0%   345.6Mi ± 0%  +12.49% (p=0.002 n=6)
Compress1XReuseAllow/crash2-8              16.92Mi ± 1%   17.09Mi ± 1%   +0.99% (p=0.006 n=6)
Compress1XReuseAllow/endzerobits-8         16.75Mi ± 2%   16.84Mi ± 0%   +0.54% (p=0.002 n=6)
Compress1XReuseAllow/endnonzero-8          12.58Mi ± 1%   12.65Mi ± 0%   +0.57% (p=0.002 n=6)
Compress1XReuseAllow/case1-8               19.77Mi ± 1%   19.81Mi ± 1%        ~ (p=0.589 n=6)
Compress1XReuseAllow/case2-8               16.96Mi ± 3%   16.58Mi ± 3%        ~ (p=0.288 n=6)
Compress1XReuseAllow/case3-8               18.04Mi ± 2%   17.90Mi ± 2%        ~ (p=0.818 n=6)
Compress1XReuseAllow/pngdata.001-8         291.6Mi ± 0%   322.0Mi ± 0%  +10.44% (p=0.002 n=6)
Compress1XReuseAllow/normcount2-8          48.58Mi ± 1%   48.38Mi ± 1%        ~ (p=0.258 n=6)
Compress1XReusePrefer/digits-8             460.6Mi ± 0%   493.0Mi ± 0%   +7.04% (p=0.002 n=6)
Compress1XReusePrefer/gettysburg-8         412.8Mi ± 1%   436.7Mi ± 2%   +5.77% (p=0.002 n=6)
Compress1XReusePrefer/twain-8              350.4Mi ± 0%   369.4Mi ± 0%   +5.41% (p=0.002 n=6)
Compress1XReusePrefer/low-ent.10k-8        481.8Mi ± 0%   493.6Mi ± 0%   +2.44% (p=0.002 n=6)
Compress1XReusePrefer/superlow-ent-10k-8   311.3Mi ± 1%   351.8Mi ± 0%  +12.99% (p=0.002 n=6)
Compress1XReusePrefer/crash2-8             63.51Mi ± 1%   65.02Mi ± 1%   +2.38% (p=0.002 n=6)
Compress1XReusePrefer/endzerobits-8        24.28Mi ± 0%   24.38Mi ± 0%   +0.43% (p=0.004 n=6)
Compress1XReusePrefer/endnonzero-8         33.18Mi ± 0%   33.35Mi ± 0%   +0.49% (p=0.017 n=6)
Compress1XReusePrefer/case1-8              148.9Mi ± 1%   165.1Mi ± 0%  +10.88% (p=0.002 n=6)
Compress1XReusePrefer/case2-8              141.4Mi ± 0%   142.9Mi ± 0%   +1.07% (p=0.002 n=6)
Compress1XReusePrefer/case3-8              152.1Mi ± 0%   154.3Mi ± 0%   +1.42% (p=0.002 n=6)
Compress1XReusePrefer/pngdata.001-8        299.3Mi ± 1%   331.3Mi ± 0%  +10.70% (p=0.002 n=6)
Compress1XReusePrefer/normcount2-8         210.7Mi ± 1%   215.1Mi ± 1%   +2.07% (p=0.002 n=6)
Compress4XReuseNone/digits-8               457.9Mi ± 1%   490.0Mi ± 0%   +7.01% (p=0.002 n=6)
Compress4XReuseNone/gettysburg-8           245.4Mi ± 0%   255.5Mi ± 0%   +4.11% (p=0.002 n=6)
Compress4XReuseNone/twain-8                348.3Mi ± 0%   367.9Mi ± 0%   +5.63% (p=0.002 n=6)
Compress4XReuseNone/low-ent.10k-8          475.1Mi ± 1%   487.0Mi ± 0%   +2.50% (p=0.002 n=6)
Compress4XReuseNone/superlow-ent-10k-8     302.4Mi ± 0%   339.3Mi ± 3%  +12.19% (p=0.002 n=6)
Compress4XReuseNone/case1-8                14.31Mi ± 0%   14.24Mi ± 1%        ~ (p=0.119 n=6)
Compress4XReuseNone/case2-8                11.69Mi ± 1%   11.66Mi ± 1%        ~ (p=0.502 n=6)
Compress4XReuseNone/case3-8                12.72Mi ± 0%   12.67Mi ± 1%        ~ (p=0.102 n=6)
Compress4XReuseNone/pngdata.001-8          289.0Mi ± 1%   317.7Mi ± 0%   +9.92% (p=0.002 n=6)
Compress4XReuseNone/normcount2-8           33.35Mi ± 1%   33.45Mi ± 3%        ~ (p=0.909 n=6)
Compress4XReuseAllow/digits-8              458.1Mi ± 2%   491.1Mi ± 0%   +7.21% (p=0.002 n=6)
Compress4XReuseAllow/gettysburg-8          281.0Mi ± 1%   292.5Mi ± 0%   +4.09% (p=0.002 n=6)
Compress4XReuseAllow/twain-8               348.8Mi ± 0%   368.4Mi ± 1%   +5.63% (p=0.002 n=6)
Compress4XReuseAllow/low-ent.10k-8         477.3Mi ± 0%   488.7Mi ± 2%        ~ (p=0.065 n=6)
Compress4XReuseAllow/superlow-ent-10k-8    305.8Mi ± 0%   344.4Mi ± 0%  +12.63% (p=0.002 n=6)
Compress4XReuseAllow/case1-8               19.34Mi ± 1%   19.54Mi ± 2%   +1.01% (p=0.039 n=6)
Compress4XReuseAllow/case2-8               16.57Mi ± 0%   15.89Mi ± 5%        ~ (p=0.061 n=6)
Compress4XReuseAllow/case3-8               17.68Mi ± 0%   17.17Mi ± 8%        ~ (p=0.061 n=6)
Compress4XReuseAllow/pngdata.001-8         291.2Mi ± 0%   319.6Mi ± 1%   +9.75% (p=0.002 n=6)
Compress4XReuseAllow/normcount2-8          47.46Mi ± 1%   47.57Mi ± 1%        ~ (p=1.000 n=6)
Compress4XReusePrefer/digits-8             460.0Mi ± 0%   492.9Mi ± 0%   +7.14% (p=0.002 n=6)
Compress4XReusePrefer/gettysburg-8         408.3Mi ± 1%   432.6Mi ± 0%   +5.95% (p=0.002 n=6)
Compress4XReusePrefer/twain-8              350.0Mi ± 0%   370.0Mi ± 1%   +5.70% (p=0.002 n=6)
Compress4XReusePrefer/low-ent.10k-8        481.1Mi ± 0%   492.7Mi ± 0%   +2.41% (p=0.002 n=6)
Compress4XReusePrefer/superlow-ent-10k-8   309.3Mi ± 1%   351.0Mi ± 0%  +13.50% (p=0.002 n=6)
Compress4XReusePrefer/case1-8              130.5Mi ± 0%   140.2Mi ± 1%   +7.44% (p=0.002 n=6)
Compress4XReusePrefer/case2-8              120.0Mi ± 0%   120.8Mi ± 1%   +0.69% (p=0.004 n=6)
Compress4XReusePrefer/case3-8              126.3Mi ± 2%   129.6Mi ± 0%   +2.64% (p=0.002 n=6)
Compress4XReusePrefer/pngdata.001-8        300.2Mi ± 1%   330.6Mi ± 0%  +10.13% (p=0.002 n=6)
Compress4XReusePrefer/normcount2-8         183.7Mi ± 1%   187.2Mi ± 1%   +1.88% (p=0.009 n=6)
geomean                                    111.6Mi        116.1Mi        +3.99%
---
 huff0/compress.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huff0/compress.go b/huff0/compress.go
index 518436cf3d..e3731d260e 100644
--- a/huff0/compress.go
+++ b/huff0/compress.go
@@ -350,6 +350,7 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
 // Does not update s.clearCount.
 func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
 	reuse = true
+	_ = s.count // Assert that s != nil to speed up the following loop.
 	for _, v := range in {
 		s.count[v]++
 	}