Skip to content

Commit

Permalink
Add support for SubRip (#232)
Browse files Browse the repository at this point in the history
* Add support for SubRip file format

* Handle Windows newlines

In Windows, a newline is represented as `\r\n`.

* Add benchmark for regex vs time.Parse .srt detection

* Make Srt timestamp detection more restrictive

* Remove unused Srt functions

* Use a constant for Srt timestamp layout

* Remove redundant comments in Srt

* Make `Srt` work with Go <1.17

* Ensure correct Srt timestamp length before processing

Co-authored-by: Gabriel Vasile <gabriel.vasile@email.com>

* Remove `application/srt` alias from Srt

Co-authored-by: Gabriel Vasile <gabriel.vasile@email.com>
  • Loading branch information
joksas and gabriel-vasile committed Jan 17, 2022
1 parent db2464c commit e7d7726
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 45 deletions.
12 changes: 12 additions & 0 deletions internal/magic/magic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,15 @@ func TestDropLastLine(t *testing.T) {
}
}
}

// BenchmarkSrt measures the cost of running Srt over a minimal SubRip cue:
// a counter line, a timing line and two lines of subtitle text.
func BenchmarkSrt(b *testing.B) {
	const subtitle = `1
00:02:16,612 --> 00:02:19,376
Senator, we're making
our final approach into Coruscant.
`
	// Convert the fixture once, outside the timed loop, so that the
	// string-to-bytes copy is not attributed to Srt on every iteration.
	in := []byte(subtitle)
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		Srt(in, 0)
	}
}
51 changes: 51 additions & 0 deletions internal/magic/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package magic
import (
"bufio"
"bytes"
"strings"
"time"

"github.com/gabriel-vasile/mimetype/internal/charset"
"github.com/gabriel-vasile/mimetype/internal/json"
Expand Down Expand Up @@ -297,3 +299,52 @@ func HAR(raw []byte, limit uint32) bool {
func Svg(raw []byte, limit uint32) bool {
	// The input is reported as SVG when the byte sequence "<svg" occurs
	// anywhere in raw. limit is not consulted.
	svgTag := []byte("<svg")
	return bytes.Index(raw, svgTag) != -1
}

// Srt reports whether in begins like a SubRip (.srt) subtitle file: a cue
// counter line holding "1", a timing line such as
// "00:02:16,612 --> 00:02:19,376" whose start is not after its end, and a
// non-empty line of subtitle text. The second parameter is ignored.
func Srt(in []byte, _ uint32) bool {
	const (
		// Timestamps are parsed with a period as decimal separator because
		// Go <1.17 does not recognise a comma there in time.Parse.
		stampLayout = "15:04:05.000"
		arrow       = " --> "
		// Length of a timing line like "00:02:16,612 --> 00:02:19,376".
		timingLen = 29
	)

	lines := bufio.NewScanner(bytes.NewReader(in))

	// The counter of the first cue must be exactly "1".
	if !lines.Scan() || lines.Text() != "1" {
		return false
	}

	// The timing line must exist, have the fixed length, and use commas
	// (never periods) as the decimal separator of fractional seconds.
	if !lines.Scan() {
		return false
	}
	timing := lines.Text()
	if len(timing) != timingLen || strings.IndexByte(timing, '.') >= 0 {
		return false
	}

	// Swap commas for periods so the layout above applies, then split the
	// range into its start and end timestamps.
	stamps := strings.Split(strings.Replace(timing, ",", ".", -1), arrow)
	if len(stamps) != 2 {
		return false
	}
	begin, err := time.Parse(stampLayout, stamps[0])
	if err != nil {
		return false
	}
	end, err := time.Parse(stampLayout, stamps[1])
	if err != nil || begin.After(end) {
		return false
	}

	// The cue must carry at least one non-empty line of subtitle text.
	if !lines.Scan() {
		return false
	}
	return len(lines.Bytes()) > 0
}
88 changes: 47 additions & 41 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,47 +171,53 @@ var files = map[string]string{
"shx.shx": "application/octet-stream",
"so.so": "application/x-sharedlib",
"sqlite.sqlite": "application/vnd.sqlite3",
"svg.1.svg": "image/svg+xml",
"svg.svg": "image/svg+xml",
"swf.swf": "application/x-shockwave-flash",
"tar.tar": "application/x-tar",
"tcl.tcl": "text/x-tcl",
"tcx.tcx": "application/vnd.garmin.tcx+xml",
"tiff.tiff": "image/tiff",
"torrent.torrent": "application/x-bittorrent",
"tsv.tsv": "text/tab-separated-values",
"ttf.ttf": "font/ttf",
"tzfile": "application/tzif",
"utf16bebom.txt": "text/plain; charset=utf-16be",
"utf16lebom.txt": "text/plain; charset=utf-16le",
"utf32bebom.txt": "text/plain; charset=utf-32be",
"utf32lebom.txt": "text/plain; charset=utf-32le",
"utf8.txt": "text/plain; charset=utf-8",
"utf8ctrlchars": "application/octet-stream",
"vcf.dos.vcf": "text/vcard",
"vcf.vcf": "text/vcard",
"voc.voc": "audio/x-unknown",
"warc.warc": "application/warc",
"wasm.wasm": "application/wasm",
"wav.wav": "audio/wav",
"webm.webm": "video/webm",
"webp.webp": "image/webp",
"woff.woff": "font/woff",
"woff2.woff2": "font/woff2",
"x3d.x3d": "model/x3d+xml",
"xar.xar": "application/x-xar",
"xcf.xcf": "image/x-xcf",
"xfdf.xfdf": "application/vnd.adobe.xfdf",
"xlf.xlf": "application/x-xliff+xml",
"xls.xls": "application/vnd.ms-excel",
"xlsx.1.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xlsx.2.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xlsx.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xml.xml": "text/xml; charset=utf-8",
"xml.withbr.xml": "text/xml; charset=utf-8",
"xz.xz": "application/x-xz",
"zip.zip": "application/zip",
"zst.zst": "application/zstd",
"srt.srt": "text/x-subrip",
// not.srt.txt uses periods instead of commas for the decimal separators of
// the timestamps.
"not.srt.txt": "text/plain; charset=utf-8",
// not.srt.2.txt does not specify milliseconds.
"not.srt.2.txt": "text/plain; charset=utf-8",
"svg.1.svg": "image/svg+xml",
"svg.svg": "image/svg+xml",
"swf.swf": "application/x-shockwave-flash",
"tar.tar": "application/x-tar",
"tcl.tcl": "text/x-tcl",
"tcx.tcx": "application/vnd.garmin.tcx+xml",
"tiff.tiff": "image/tiff",
"torrent.torrent": "application/x-bittorrent",
"tsv.tsv": "text/tab-separated-values",
"ttf.ttf": "font/ttf",
"tzfile": "application/tzif",
"utf16bebom.txt": "text/plain; charset=utf-16be",
"utf16lebom.txt": "text/plain; charset=utf-16le",
"utf32bebom.txt": "text/plain; charset=utf-32be",
"utf32lebom.txt": "text/plain; charset=utf-32le",
"utf8.txt": "text/plain; charset=utf-8",
"utf8ctrlchars": "application/octet-stream",
"vcf.dos.vcf": "text/vcard",
"vcf.vcf": "text/vcard",
"voc.voc": "audio/x-unknown",
"warc.warc": "application/warc",
"wasm.wasm": "application/wasm",
"wav.wav": "audio/wav",
"webm.webm": "video/webm",
"webp.webp": "image/webp",
"woff.woff": "font/woff",
"woff2.woff2": "font/woff2",
"x3d.x3d": "model/x3d+xml",
"xar.xar": "application/x-xar",
"xcf.xcf": "image/x-xcf",
"xfdf.xfdf": "application/vnd.adobe.xfdf",
"xlf.xlf": "application/x-xliff+xml",
"xls.xls": "application/vnd.ms-excel",
"xlsx.1.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xlsx.2.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xlsx.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xml.xml": "text/xml; charset=utf-8",
"xml.withbr.xml": "text/xml; charset=utf-8",
"xz.xz": "application/x-xz",
"zip.zip": "application/zip",
"zst.zst": "application/zstd",
}

func TestDetect(t *testing.T) {
Expand Down
3 changes: 2 additions & 1 deletion supported_mimes.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 166 Supported MIME types
## 167 Supported MIME types
This file is automatically generated when running tests. Do not edit manually.

Extension | MIME type | Aliases
Expand Down Expand Up @@ -163,6 +163,7 @@ Extension | MIME type | Aliases
**.har** | application/json | -
**.ndjson** | application/x-ndjson | -
**.rtf** | text/rtf | -
**.srt** | text/x-subrip | text/x-srt
**.tcl** | text/x-tcl | application/x-tcl
**.csv** | text/csv | -
**.tsv** | text/tab-separated-values | -
Expand Down
20 changes: 20 additions & 0 deletions testdata/not.srt.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
1
00:02:16 --> 00:02:19
Senator, we're making
our final approach into Coruscant.

2
00:02:19 --> 00:02:21
Very good, Lieutenant.

3
00:03:13 --> 00:03:15
We made it.

4
00:03:18 --> 00:03:20
I guess I was wrong.

5
00:03:20 --> 00:03:22
There was no danger at all.
20 changes: 20 additions & 0 deletions testdata/not.srt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
1
00:02:16.612 --> 00:02:19.376
Senator, we're making
our final approach into Coruscant.

2
00:02:19.482 --> 00:02:21.609
Very good, Lieutenant.

3
00:03:13.336 --> 00:03:15.167
We made it.

4
00:03:18.608 --> 00:03:20.371
I guess I was wrong.

5
00:03:20.476 --> 00:03:22.671
There was no danger at all.
20 changes: 20 additions & 0 deletions testdata/srt.srt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
1
00:02:16,612 --> 00:02:19,376
Senator, we're making
our final approach into Coruscant.

2
00:02:19,482 --> 00:02:21,609
Very good, Lieutenant.

3
00:03:13,336 --> 00:03:15,167
We made it.

4
00:03:18,608 --> 00:03:20,371
I guess I was wrong.

5
00:03:20,476 --> 00:03:22,671
There was no danger at all.
8 changes: 5 additions & 3 deletions tree.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ var (
alias("application/x-ogg")
oggAudio = newMIME("audio/ogg", ".oga", magic.OggAudio)
oggVideo = newMIME("video/ogg", ".ogv", magic.OggVideo)
text = newMIME("text/plain", ".txt", magic.Text, html, svg, xml, php, js, lua, perl, python, json, ndJSON, rtf, tcl, csv, tsv, vCard, iCalendar, warc)
text = newMIME("text/plain", ".txt", magic.Text, html, svg, xml, php, js, lua, perl, python, json, ndJSON, rtf, srt, tcl, csv, tsv, vCard, iCalendar, warc)
xml = newMIME("text/xml", ".xml", magic.XML, rss, atom, x3d, kml, xliff, collada, gml, gpx, tcx, amf, threemf, xfdf, owl2)
json = newMIME("application/json", ".json", magic.JSON, geoJSON, har)
har = newMIME("application/json", ".har", magic.HAR)
Expand All @@ -87,8 +87,10 @@ var (
html = newMIME("text/html", ".html", magic.HTML)
php = newMIME("text/x-php", ".php", magic.Php)
rtf = newMIME("text/rtf", ".rtf", magic.Rtf)
js = newMIME("application/javascript", ".js", magic.Js).
alias("application/x-javascript", "text/javascript")
srt = newMIME("text/x-subrip", ".srt", magic.Srt).
alias("text/x-srt")
js = newMIME("application/javascript", ".js", magic.Js).
alias("application/x-javascript", "text/javascript")
lua = newMIME("text/x-lua", ".lua", magic.Lua)
perl = newMIME("text/x-perl", ".pl", magic.Perl)
python = newMIME("application/x-python", ".py", magic.Python)
Expand Down

0 comments on commit e7d7726

Please sign in to comment.