Skip to content

Commit

Permalink
Merge pull request #176 from gabriel-vasile/jsonvalues
Browse files Browse the repository at this point in the history
Fix detection for files containing a single JSON value
  • Loading branch information
gabriel-vasile authored Sep 21, 2021
2 parents a9b9b9d + 7a7d2a6 commit 220897d
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 61 deletions.
72 changes: 29 additions & 43 deletions internal/magic/text.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package magic

import (
"bufio"
"bytes"

"github.com/gabriel-vasile/mimetype/internal/charset"
Expand Down Expand Up @@ -142,27 +143,33 @@ func Php(raw []byte, limit uint32) bool {
return phpScriptF(raw, limit)
}

// Json matches a JavaScript Object Notation file.
func Json(raw []byte, limit uint32) bool {
// JSON matches a JavaScript Object Notation file.
func JSON(raw []byte, limit uint32) bool {
raw = trimLWS(raw)
if len(raw) == 0 || (raw[0] != '[' && raw[0] != '{') {
return false
}
parsed, err := json.Scan(raw)
// If the full file content was provided, check there is no error.
if len(raw) < int(limit) {
return err == nil
}

// If a section of the file was provided, check if all of it was parsed.
return parsed == len(raw) && len(raw) > 0
}

// GeoJson matches a RFC 7946 GeoJSON file.
// GeoJSON matches an RFC 7946 GeoJSON file.
//
// GeoJson detection implies searching for key:value pairs like: `"type": "Feature"`
// GeoJSON detection implies searching for key:value pairs like: `"type": "Feature"`
// in the input.
// BUG(gabriel-vasile): The "type" key should be searched for in the root object.
func GeoJson(raw []byte, limit uint32) bool {
func GeoJSON(raw []byte, limit uint32) bool {
raw = trimLWS(raw)
if len(raw) == 0 {
return false
}
// GeoJSON is always a JSON object, not a JSON array.
// GeoJSON is always a JSON object, not a JSON array or any other JSON value.
if raw[0] != '{' {
return false
}
Expand Down Expand Up @@ -190,7 +197,7 @@ func GeoJson(raw []byte, limit uint32) bool {
// Skip any whitespace after the colon.
raw = trimLWS(raw[1:])

geoJsonTypes := [][]byte{
geoJSONTypes := [][]byte{
[]byte(`"Feature"`),
[]byte(`"FeatureCollection"`),
[]byte(`"Point"`),
Expand All @@ -201,7 +208,7 @@ func GeoJson(raw []byte, limit uint32) bool {
[]byte(`"MultiPolygon"`),
[]byte(`"GeometryCollection"`),
}
for _, t := range geoJsonTypes {
for _, t := range geoJSONTypes {
if bytes.HasPrefix(raw, t) {
return true
}
Expand All @@ -210,45 +217,24 @@ func GeoJson(raw []byte, limit uint32) bool {
return false
}

// NdJson matches a Newline delimited JSON file.
func NdJson(raw []byte, limit uint32) bool {
// Separator with carriage return and new line `\r\n`.
srn := []byte{0x0D, 0x0A}

// Separator with only new line `\n`.
sn := []byte{0x0A}

// Total bytes scanned.
parsed := 0

// Split by `srn`.
for rni, insrn := range bytes.Split(raw, srn) {
// Separator byte count should be added only after the first split.
if rni != 0 {
// Add two as `\r\n` is used for split.
parsed += 2
// NdJSON matches a Newline delimited JSON file.
func NdJSON(raw []byte, limit uint32) bool {
lCount := 0
sc := bufio.NewScanner(dropLastLine(raw, limit))
for sc.Scan() {
l := sc.Bytes()
// Empty lines are allowed in NDJSON.
if l = trimRWS(trimLWS(l)); len(l) == 0 {
continue
}
// Split again by `sn`.
for ni, insn := range bytes.Split(insrn, sn) {
// Separator byte count should be added only after the first split.
if ni != 0 {
// Add one as `\n` is used for split.
parsed++
}
// Empty line is valid.
if len(insn) == 0 {
continue
}
p, err := json.Scan(insn)
parsed += p
if parsed < int(limit) && err != nil {
return false
}
_, err := json.Scan(l)
if err != nil {
return false
}
lCount++
}

// Empty inputs should not pass as valid NDJSON with 0 lines.
return parsed > 2 && parsed == len(raw)
return lCount > 1
}

// Har matches a HAR Spec file.
Expand Down
33 changes: 19 additions & 14 deletions internal/magic/text_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ import (

// Csv matches a comma-separated values file.
func Csv(raw []byte, limit uint32) bool {
return sv(raw, ',')
return sv(raw, ',', limit)
}

// Tsv matches a tab-separated values file.
func Tsv(raw []byte, limit uint32) bool {
return sv(raw, '\t')
return sv(raw, '\t', limit)
}

func sv(in []byte, comma rune) bool {
r := csv.NewReader(butLastLineReader(in, len(in)))
func sv(in []byte, comma rune, limit uint32) bool {
r := csv.NewReader(dropLastLine(in, limit))
r.Comma = comma
r.TrimLeadingSpace = true
r.LazyQuotes = true
Expand All @@ -27,20 +27,25 @@ func sv(in []byte, comma rune) bool {
return err == nil && r.FieldsPerRecord > 1 && len(lines) > 1
}

// butLastLineReader returns a reader to the provided byte slice.
// The reader is guaranteed to reach EOF before it reads `cutAt` bytes.
// Bytes after the last newline are dropped from the input.
func butLastLineReader(in []byte, cutAt int) io.Reader {
if len(in) >= cutAt {
// dropLastLine drops the last incomplete line from b.
//
// mimetype limits itself to ReadLimit bytes when performing a detection.
// This means, for file formats like CSV or NDJSON, the last line of the input
// can be an incomplete line.
func dropLastLine(b []byte, cutAt uint32) io.Reader {
if cutAt == 0 {
return bytes.NewReader(b)
}
if uint32(len(b)) >= cutAt {
for i := cutAt - 1; i > 0; i-- {
if in[i] == '\n' {
return bytes.NewReader(in[:i])
if b[i] == '\n' {
return bytes.NewReader(b[:i])
}
}

// no newline was found between the 0 index and cutAt
return bytes.NewReader(in[:cutAt])
// No newline was found between the 0 index and cutAt.
return bytes.NewReader(b[:cutAt])
}

return bytes.NewReader(in)
return bytes.NewReader(b)
}
7 changes: 7 additions & 0 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ var files = map[string]string{
"xpm.xpm": "image/x-xpixmap",
"js.js": "application/javascript",
"json.json": "application/json",
// json.{int,float,string}.txt contain a single JSON value. They are valid JSON
// documents, but they should not be detected as application/json. This mimics
// the behaviour of the file utility and seems to be the correct thing to do.
"json.int.txt": "text/plain; charset=utf-8",
"json.float.txt": "text/plain; charset=utf-8",
"json.string.txt": "text/plain; charset=utf-8",
"kml.kml": "application/vnd.google-earth.kml+xml",
"lit.lit": "application/x-ms-reader",
"ln": "application/x-executable",
Expand All @@ -117,6 +123,7 @@ var files = map[string]string{
"mqv.mqv": "video/quicktime",
"mrc.mrc": "application/marc",
"msg.msg": "application/vnd.ms-outlook",
"ndjson.xl.ndjson": "application/x-ndjson",
"ndjson.ndjson": "application/x-ndjson",
"nes.nes": "application/vnd.nintendo.snes.rom",
"elfobject": "application/x-object",
Expand Down
1 change: 1 addition & 0 deletions testdata/json.float.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-1.23456789
1 change: 1 addition & 0 deletions testdata/json.int.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1
1 change: 1 addition & 0 deletions testdata/json.string.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"json \" string value"
Loading

0 comments on commit 220897d

Please sign in to comment.