Skip to content

Commit

Permalink
json: improve performance by using a pool of scanners (#535)
Browse files Browse the repository at this point in the history
* add benchmark for each detector with its corresponding file

* json/ndjson: use a pool of scanners for detection

it helps by reducing memory allocations to amortized 0
➜  mimetype git:(bench) ✗ benchstat master sync_pool
goos: linux
goarch: amd64
pkg: github.com/gabriel-vasile/mimetype
cpu: Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz
                                              │   master    │             sync_pool              │
                                              │   sec/op    │   sec/op     vs base               │
Files/json.json/application/json-8              1.403µ ± 0%   1.355µ ± 1%  -3.39% (p=0.000 n=20)
Files/ndjson.xl.ndjson/application/x-ndjson-8   8.655µ ± 1%   8.504µ ± 1%  -1.76% (p=0.001 n=20)
Files/ndjson.ndjson/application/x-ndjson-8      3.069µ ± 1%   2.958µ ± 5%       ~ (p=0.129 n=20)
geomean                                         3.340µ        3.242µ       -2.92%

                                              │   master   │               sync_pool               │
                                              │    B/op    │   B/op    vs base                     │
Files/json.json/application/json-8              120.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=20)
Files/ndjson.xl.ndjson/application/x-ndjson-8   720.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=20)
Files/ndjson.ndjson/application/x-ndjson-8      240.0 ± 0%   0.0 ± 0%  -100.00% (p=0.000 n=20)
geomean                                         274.7                  ?                       ¹ ²
¹ summaries must be >0 to compute geomean
² ratios must be >0 to compute geomean

                                              │   master   │                sync_pool                │
                                              │ allocs/op  │ allocs/op   vs base                     │
Files/json.json/application/json-8              4.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=20)
Files/ndjson.xl.ndjson/application/x-ndjson-8   24.00 ± 0%    0.00 ± 0%  -100.00% (p=0.000 n=20)
Files/ndjson.ndjson/application/x-ndjson-8      8.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=20)
geomean                                         9.158                    ?                       ¹ ²
¹ summaries must be >0 to compute geomean
² ratios must be >0 to compute geomean

* benchmark: use sequential instead of concurrent benchmarks

there is no reason to make it concurrent
  • Loading branch information
gabriel-vasile authored May 26, 2024
1 parent 43192c8 commit 09ff708
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 7 deletions.
27 changes: 25 additions & 2 deletions internal/json/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ package json

import (
"fmt"
"sync"
)

type (
Expand Down Expand Up @@ -73,18 +74,38 @@ type (
}
)

// scannerPool recycles scanner instances so repeated detections
// perform zero allocations in the steady state.
var scannerPool = sync.Pool{
	// New is invoked only when the pool is empty; the zero value
	// is made usable by newScanner calling reset before handing it out.
	New: func() any { return new(scanner) },
}

// newScanner fetches a scanner from the pool, restored to its initial
// state and ready for a fresh Scan. Callers must hand it back via
// freeScanner when finished.
func newScanner() *scanner {
	sc := scannerPool.Get().(*scanner)
	sc.reset()
	return sc
}

// freeScanner returns s to the pool for reuse by a later newScanner call.
func freeScanner(s *scanner) {
	// Drop an oversized parse stack before pooling so pathological
	// inputs cannot pin large buffers in memory indefinitely.
	const maxRetainedStates = 1024
	if len(s.parseState) > maxRetainedStates {
		s.parseState = nil
	}
	scannerPool.Put(s)
}

// Scan returns the number of bytes scanned and if there was any error
// in trying to reach the end of data.
func Scan(data []byte) (int, error) {
s := &scanner{}
s := newScanner()
defer freeScanner(s)
_ = checkValid(data, s)
return s.index, s.err
}

// checkValid verifies that data is valid JSON-encoded data.
// scan is passed in for use by checkValid to avoid an allocation.
func checkValid(data []byte, scan *scanner) error {
scan.reset()
for _, c := range data {
scan.index++
if scan.step(scan, c) == scanError {
Expand All @@ -105,6 +126,8 @@ func (s *scanner) reset() {
s.step = stateBeginValue
s.parseState = s.parseState[0:0]
s.err = nil
s.endTop = false
s.index = 0
}

// eof tells the scanner that the end of input has been reached.
Expand Down
32 changes: 27 additions & 5 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,11 +488,9 @@ func BenchmarkSliceRand(b *testing.B) {
b.ResetTimer()
b.ReportAllocs()

b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
Detect(data)
}
})
for n := 0; n < b.N; n++ {
Detect(data)
}
}

func BenchmarkText(b *testing.B) {
Expand All @@ -513,6 +511,30 @@ func BenchmarkText(b *testing.B) {
}
}

// BenchmarkFiles benchmarks each detector with its corresponding file.
func BenchmarkFiles(b *testing.B) {
	for f, m := range files {
		data, err := os.ReadFile(filepath.Join(testDataDir, f))
		if err != nil {
			b.Fatal(err)
		}
		// Truncate to the same read limit detectors normally see.
		if uint32(len(data)) > defaultLimit {
			data = data[:defaultLimit]
		}
		b.Run(f+"/"+m, func(b *testing.B) {
			// Resolve the detector BEFORE ResetTimer so setup cost
			// (media-type parsing and MIME lookup) is excluded from
			// the timed region.
			parsed, _, _ := mime.ParseMediaType(m)
			mType := Lookup(parsed)
			b.ReportAllocs()
			b.ResetTimer()
			for n := 0; n < b.N; n++ {
				if !mType.detector(data, uint32(len(data))) {
					b.Fatal("detection should never fail")
				}
			}
		})
	}
}

func BenchmarkAll(b *testing.B) {
r := rand.New(rand.NewSource(0))
data := make([]byte, defaultLimit)
Expand Down

0 comments on commit 09ff708

Please sign in to comment.