From 675c46eabe11025bba141c8ff1a769cbbf7e9e5a Mon Sep 17 00:00:00 2001 From: Gabriel Vasile Date: Mon, 13 May 2024 02:02:00 +0900 Subject: [PATCH 1/2] Add benchmark func for all detector functions --- mimetype_test.go | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/mimetype_test.go b/mimetype_test.go index dab76d85..0c9a63c0 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -491,6 +491,7 @@ func BenchmarkSliceRand(b *testing.B) { } b.ResetTimer() + b.ReportAllocs() b.RunParallel(func(pb *testing.PB) { for pb.Next() { @@ -499,6 +500,24 @@ func BenchmarkSliceRand(b *testing.B) { }) } +func BenchmarkAll(b *testing.B) { + r := rand.New(rand.NewSource(0)) + data := make([]byte, 3072) + if _, err := io.ReadFull(r, data); err != io.ErrUnexpectedEOF && err != nil { + b.Fatal(err) + } + for _, m := range root.flatten() { + b.Run(m.String(), func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for n := 0; n < b.N; n++ { + m.detector(data, uint32(len(data))) + } + }) + } + +} + func BenchmarkCommon(b *testing.B) { commonFiles := []string{ "xlsx.xlsx", @@ -512,13 +531,15 @@ func BenchmarkCommon(b *testing.B) { "gif.gif", "xls.xls", "webm.webm", + "csv.csv", } for _, file := range commonFiles { + f, err := os.ReadFile(filepath.Join(testDataDir, file)) + if err != nil { + b.Fatal(err) + } b.Run(filepath.Ext(file), func(b *testing.B) { - f, err := os.ReadFile(testDataDir + file) - if err != nil { - b.Fatal(err) - } + b.ReportAllocs() b.ResetTimer() for n := 0; n < b.N; n++ { Detect(f) From 7c53342ac0ab7dc9cc7d187ea002bb511b407036 Mon Sep 17 00:00:00 2001 From: Gabriel Vasile Date: Mon, 13 May 2024 03:42:13 +0900 Subject: [PATCH 2/2] srt: improve performance by using bytes.Cut instead of bufio.Scanner it helps with mem allocs Before: BenchmarkSrt-8 946042 1089 ns/op 4240 B/op 5 allocs/op After: BenchmarkSrt-8 3235448 368.8 ns/op 64 B/op 2 allocs/op --- internal/magic/text.go | 34 ++++++++++++++++++++++++---------- mimetype_test.go | 19 +++++++------------ testdata/not.srt.2.txt | 20 -------------------- testdata/not.srt.txt | 20 -------------------- 4 files changed, 31 insertions(+), 62 deletions(-) delete mode 100644 testdata/not.srt.2.txt delete mode 100644 testdata/not.srt.txt diff --git a/internal/magic/text.go b/internal/magic/text.go index e2a03caf..fedb5c20 100644 --- a/internal/magic/text.go +++ b/internal/magic/text.go @@ -302,19 +302,21 @@ func Svg(raw []byte, limit uint32) bool { // Srt matches a SubRip file. func Srt(in []byte, _ uint32) bool { - s := bufio.NewScanner(bytes.NewReader(in)) - if !s.Scan() { + line, in, found := scanLine(in) + if !found { return false } + // First line must be 1. - if s.Text() != "1" { + if string(line) != "1" { return false } - - if !s.Scan() { + line, in, found = scanLine(in) + if !found { return false } - secondLine := s.Text() + + secondLine := string(line) // Timestamp format (e.g: 00:02:16,612 --> 00:02:19,376) limits secondLine // length to exactly 29 characters. if len(secondLine) != 29 { @@ -325,14 +327,12 @@ func Srt(in []byte, _ uint32) bool { if strings.Contains(secondLine, ".") { return false } - // For Go <1.17, comma is not recognised as a decimal separator by `time.Parse`. - secondLine = strings.ReplaceAll(secondLine, ",", ".") // Second line must be a time range. ts := strings.Split(secondLine, " --> ") if len(ts) != 2 { return false } - const layout = "15:04:05.000" + const layout = "15:04:05,000" t0, err := time.Parse(layout, ts[0]) if err != nil { return false @@ -345,8 +345,9 @@ func Srt(in []byte, _ uint32) bool { return false } + line, _, found = scanLine(in) // A third line must exist and not be empty. This is the actual subtitle text. - return s.Scan() && len(s.Bytes()) != 0 + return found && len(line) != 0 } // Vtt matches a Web Video Text Tracks (WebVTT) file. See @@ -373,3 +374,16 @@ func Vtt(raw []byte, limit uint32) bool { return bytes.Equal(raw, []byte{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) || // UTF-8 BOM and "WEBVTT" bytes.Equal(raw, []byte{0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) // "WEBVTT" } + +func scanLine(in []byte) (line, remainder []byte, found bool) { + line, remainder, found = bytes.Cut(in, []byte("\n")) + if !found { + return + } + + // Drop off any \r before \n. + if lenLine := len(line); lenLine > 0 && line[lenLine-1] == '\r' { + line = line[:lenLine-1] + } + return +} diff --git a/mimetype_test.go b/mimetype_test.go index 0c9a63c0..0673f7e8 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -176,18 +176,13 @@ var files = map[string]string{ "so.so": "application/x-sharedlib", "sqlite.sqlite": "application/vnd.sqlite3", "srt.srt": "application/x-subrip", - // not.srt.txt uses periods instead of commas for the decimal separators of - // the timestamps. - "not.srt.txt": "text/plain; charset=utf-8", - // not.srt.2.txt does not specify milliseconds. - "not.srt.2.txt": "text/plain; charset=utf-8", - "svg.1.svg": "image/svg+xml", - "svg.svg": "image/svg+xml", - "swf.swf": "application/x-shockwave-flash", - "tar.tar": "application/x-tar", - "tar.gnu.tar": "application/x-tar", - "tar.oldgnu.tar": "application/x-tar", - "tar.posix.tar": "application/x-tar", + "svg.1.svg": "image/svg+xml", + "svg.svg": "image/svg+xml", + "swf.swf": "application/x-shockwave-flash", + "tar.tar": "application/x-tar", + "tar.gnu.tar": "application/x-tar", + "tar.oldgnu.tar": "application/x-tar", + "tar.posix.tar": "application/x-tar", // tar.star.tar was generated with star 1.6. "tar.star.tar": "application/x-tar", "tar.ustar.tar": "application/x-tar", diff --git a/testdata/not.srt.2.txt b/testdata/not.srt.2.txt deleted file mode 100644 index 78bebbc3..00000000 --- a/testdata/not.srt.2.txt +++ /dev/null @@ -1,20 +0,0 @@ -1 -00:02:16 --> 00:02:19 -Senator, we're making -our final approach into Coruscant. - -2 -00:02:19 --> 00:02:21 -Very good, Lieutenant. - -3 -00:03:13 --> 00:03:15 -We made it. - -4 -00:03:18 --> 00:03:20 -I guess I was wrong. - -5 -00:03:20 --> 00:03:22 -There was no danger at all. diff --git a/testdata/not.srt.txt b/testdata/not.srt.txt deleted file mode 100644 index 338c9ee5..00000000 --- a/testdata/not.srt.txt +++ /dev/null @@ -1,20 +0,0 @@ -1 -00:02:16.612 --> 00:02:19.376 -Senator, we're making -our final approach into Coruscant. - -2 -00:02:19.482 --> 00:02:21.609 -Very good, Lieutenant. - -3 -00:03:13.336 --> 00:03:15.167 -We made it. - -4 -00:03:18.608 --> 00:03:20.371 -I guess I was wrong. - -5 -00:03:20.476 --> 00:03:22.671 -There was no danger at all.