diff --git a/internal/charset/charset.go b/internal/charset/charset.go index b0e1388b..0647f730 100644 --- a/internal/charset/charset.go +++ b/internal/charset/charset.go @@ -157,9 +157,14 @@ func fromXML(content []byte) string { return strings.ToLower(xmlEncoding(string(t.Inst))) } -// FromHTML returns the charset of an HTML document. It relies on the meta tag -// and falls back on the plain text content. +// FromHTML returns the charset of an HTML document. It first looks if a BOM is +// present and if so uses it to determine the charset. If no BOM is present, +// it relies on the meta tag and falls back on the +// plain text content. func FromHTML(content []byte) string { + if cset := FromBOM(content); cset != "" { + return cset + } if cset := fromHTML(content); cset != "" { return cset } diff --git a/internal/charset/charset_test.go b/internal/charset/charset_test.go index d9b8603e..387c1ec2 100644 --- a/internal/charset/charset_test.go +++ b/internal/charset/charset_test.go @@ -24,6 +24,29 @@ const htmlDoc = ` ` +const htmlDocWithIncorrectCharset = ` + + + + + + + test + + + + + + + + + +
+ +` func TestFromXML(t *testing.T) { charset := FromXML([]byte(xmlDoc)) @@ -39,6 +62,13 @@ func TestFromHTML(t *testing.T) { } } +func TestFromHTMLWithBOM(t *testing.T) { + charset := FromHTML(append([]byte{0xEF, 0xBB, 0xBF}, []byte(htmlDocWithIncorrectCharset)...)) + if charset != "utf-8" { + t.Errorf("expected: utf-8; got: %s", charset) + } +} + func TestFromPlain(t *testing.T) { tcases := []struct { raw []byte diff --git a/internal/magic/magic.go b/internal/magic/magic.go index abab040b..466058fb 100644 --- a/internal/magic/magic.go +++ b/internal/magic/magic.go @@ -104,7 +104,14 @@ func xmlCheck(sig xmlSig, raw []byte) bool { // matches the raw input. func markup(sigs ...[]byte) Detector { return func(raw []byte, limit uint32) bool { - raw = trimLWS(raw) + if bytes.HasPrefix(raw, []byte{0xEF, 0xBB, 0xBF}) { + // We skip the UTF-8 BOM if present to ensure we correctly + // process any leading whitespace. The presence of the BOM + // is taken into account during charset detection in charset.go. + raw = trimLWS(raw[3:]) + } else { + raw = trimLWS(raw) + } if len(raw) == 0 { return false } diff --git a/mimetype_test.go b/mimetype_test.go index 2b926932..06325362 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -18,83 +18,86 @@ const testDataDir = "testdata" // test files sorted by the file name in alphabetical order. var files = map[string]string{ - "3g2.3g2": "video/3gpp2", - "3gp.3gp": "video/3gpp", - "3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", - "7z.7z": "application/x-7z-compressed", - "a.a": "application/x-archive", - "aac.aac": "audio/aac", - "aaf.aaf": "application/octet-stream", - "accdb.accdb": "application/x-msaccess", - "aiff.aiff": "audio/aiff", - "amf.amf": "application/x-amf", - "amr.amr": "audio/amr", - "ape.ape": "audio/ape", - "apng.png": "image/vnd.mozilla.apng", - "asf.asf": "video/x-ms-asf", - "atom.atom": "application/atom+xml", - "au.au": "audio/basic", - "avi.avi": "video/x-msvideo", - "avif.avif": "image/avif", - "avifsequence.avif": "image/avif", - "bmp.bmp": "image/bmp", - "bpg.bpg": "image/bpg", - "bz2.bz2": "application/x-bzip2", - "cab.cab": "application/vnd.ms-cab-compressed", - "class.class": "application/x-java-applet", - "crx.crx": "application/x-chrome-extension", - "csv.csv": "text/csv", - "cpio.cpio": "application/x-cpio", - "dae.dae": "model/vnd.collada+xml", - "dbf.dbf": "application/x-dbf", - "dcm.dcm": "application/dicom", - "deb.deb": "application/vnd.debian.binary-package", - "djvu.djvu": "image/vnd.djvu", - "doc.doc": "application/msword", - "docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "drpm.rpm": "application/x-rpm", - "dwg.1.dwg": "image/vnd.dwg", - "dwg.dwg": "image/vnd.dwg", - "eot.eot": "application/vnd.ms-fontobject", - "epub.epub": "application/epub+zip", - "exe.exe": "application/vnd.microsoft.portable-executable", - "fdf.fdf": "application/vnd.fdf", - "fits.fits": "application/fits", - "flac.flac": "audio/flac", - "flv.flv": "video/x-flv", - "gbr.gbr": "image/x-gimp-gbr", - "geojson.1.geojson": "application/geo+json", - "geojson.geojson": "application/geo+json", - "gif.gif": "image/gif", - "glb.glb": "model/gltf-binary", - "gml.gml": "application/gml+xml", - "gpx.gpx": "application/gpx+xml", - "gz.gz": "application/gzip", - "har.har": "application/json", - "hdr.hdr": "image/vnd.radiance", - "heic.single.heic": "image/heic", - "heif.heif": "image/heif", - "html.html": "text/html; charset=utf-8", - "html.iso88591.html": "text/html; charset=iso-8859-1", - "html.svg.html": "text/html; charset=utf-8", - "html.usascii.html": "text/html; charset=us-ascii", - "html.utf8.html": "text/html; charset=utf-8", - "html.withbr.html": "text/html; charset=utf-8", - "ico.ico": "image/x-icon", - "ics.dos.ics": "text/calendar", - "ics.ics": "text/calendar", - "iso88591.txt": "text/plain; charset=iso-8859-1", - "jar.jar": "application/jar", - "jp2.jp2": "image/jp2", - "jpf.jpf": "image/jpx", - "jpg.jpg": "image/jpeg", - "jpm.jpm": "image/jpm", - "jxl.jxl": "image/jxl", - "xpm.xpm": "image/x-xpixmap", - "js.js": "application/javascript", - "json.json": "application/json", - "json.lowascii.json": "application/json", + "3g2.3g2": "video/3gpp2", + "3gp.3gp": "video/3gpp", + "3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", + "7z.7z": "application/x-7z-compressed", + "a.a": "application/x-archive", + "aac.aac": "audio/aac", + "aaf.aaf": "application/octet-stream", + "accdb.accdb": "application/x-msaccess", + "aiff.aiff": "audio/aiff", + "amf.amf": "application/x-amf", + "amr.amr": "audio/amr", + "ape.ape": "audio/ape", + "apng.png": "image/vnd.mozilla.apng", + "asf.asf": "video/x-ms-asf", + "atom.atom": "application/atom+xml", + "au.au": "audio/basic", + "avi.avi": "video/x-msvideo", + "avif.avif": "image/avif", + "avifsequence.avif": "image/avif", + "bmp.bmp": "image/bmp", + "bpg.bpg": "image/bpg", + "bz2.bz2": "application/x-bzip2", + "cab.cab": "application/vnd.ms-cab-compressed", + "class.class": "application/x-java-applet", + "crx.crx": "application/x-chrome-extension", + "csv.csv": "text/csv", + "cpio.cpio": "application/x-cpio", + "dae.dae": "model/vnd.collada+xml", + "dbf.dbf": "application/x-dbf", + "dcm.dcm": "application/dicom", + "deb.deb": "application/vnd.debian.binary-package", + "djvu.djvu": "image/vnd.djvu", + "doc.doc": "application/msword", + "docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "drpm.rpm": "application/x-rpm", + "dwg.1.dwg": "image/vnd.dwg", + "dwg.dwg": "image/vnd.dwg", + "eot.eot": "application/vnd.ms-fontobject", + "epub.epub": "application/epub+zip", + "exe.exe": "application/vnd.microsoft.portable-executable", + "fdf.fdf": "application/vnd.fdf", + "fits.fits": "application/fits", + "flac.flac": "audio/flac", + "flv.flv": "video/x-flv", + "gbr.gbr": "image/x-gimp-gbr", + "geojson.1.geojson": "application/geo+json", + "geojson.geojson": "application/geo+json", + "gif.gif": "image/gif", + "glb.glb": "model/gltf-binary", + "gml.gml": "application/gml+xml", + "gpx.gpx": "application/gpx+xml", + "gz.gz": "application/gzip", + "har.har": "application/json", + "hdr.hdr": "image/vnd.radiance", + "heic.single.heic": "image/heic", + "heif.heif": "image/heif", + "html.html": "text/html; charset=utf-8", + "html.iso88591.html": "text/html; charset=iso-8859-1", + "html.svg.html": "text/html; charset=utf-8", + "html.usascii.html": "text/html; charset=us-ascii", + "html.utf8.html": "text/html; charset=utf-8", + "html.utf8bom.html": "text/html; charset=utf-8", + "html.utf8bomws.html": "text/html; charset=utf-8", + "html.utf8bomdetect.html": "text/html; charset=utf-8", + "html.withbr.html": "text/html; charset=utf-8", + "ico.ico": "image/x-icon", + "ics.dos.ics": "text/calendar", + "ics.ics": "text/calendar", + "iso88591.txt": "text/plain; charset=iso-8859-1", + "jar.jar": "application/jar", + "jp2.jp2": "image/jp2", + "jpf.jpf": "image/jpx", + "jpg.jpg": "image/jpeg", + "jpm.jpm": "image/jpm", + "jxl.jxl": "image/jxl", + "xpm.xpm": "image/x-xpixmap", + "js.js": "application/javascript", + "json.json": "application/json", + "json.lowascii.json": "application/json", // json.{int,float,string}.txt contain a single JSON value. They are valid JSON // documents, but they should not be detected as application/json. This mimics // the behaviour of the file utility and seems the correct thing to do. diff --git a/testdata/html.utf8bom.html b/testdata/html.utf8bom.html new file mode 100644 index 00000000..3c47ba38 --- /dev/null +++ b/testdata/html.utf8bom.html @@ -0,0 +1,23 @@ + + + + + + + + test + + + + + + + + + +
+ + diff --git a/testdata/html.utf8bomdetect.html b/testdata/html.utf8bomdetect.html new file mode 100644 index 00000000..c7767727 --- /dev/null +++ b/testdata/html.utf8bomdetect.html @@ -0,0 +1,24 @@ + + + + + + + + + test + + + + + + + + + +
+ + diff --git a/testdata/html.utf8bomws.html b/testdata/html.utf8bomws.html new file mode 100644 index 00000000..69fa3f3f --- /dev/null +++ b/testdata/html.utf8bomws.html @@ -0,0 +1,24 @@ + + + + + + + + + test + + + + + + + + + +
+ +