From 8ae06f3a7a399bddd2eac2690e94040dbb600eb3 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Thu, 31 Mar 2022 12:46:04 +0200 Subject: [PATCH 1/7] add html utf8 bom signature closes #267 --- internal/magic/text.go | 1 + mimetype_test.go | 1 + testdata/html.utf8bom.html | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 testdata/html.utf8bom.html diff --git a/internal/magic/text.go b/internal/magic/text.go index e2a03caf..f4d50465 100644 --- a/internal/magic/text.go +++ b/internal/magic/text.go @@ -13,6 +13,7 @@ import ( var ( // HTML matches a Hypertext Markup Language file. HTML = markup( + append([]byte{0xEF, 0xBB, 0xBF}, []byte(" + + + + + + + test + + + + + + + + + +
+ + From bdb9f8f19cf19bb893a6b19bad45abe8f8c38fd9 Mon Sep 17 00:00:00 2001 From: Gabriel Vasile Date: Sun, 3 Apr 2022 19:27:10 +0300 Subject: [PATCH 2/7] Add utf8 BOM+whitespace testcase for html detection --- mimetype_test.go | 1 + testdata/html.utf8bomws.html | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 testdata/html.utf8bomws.html diff --git a/mimetype_test.go b/mimetype_test.go index 442cac84..79d319c0 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -81,6 +81,7 @@ var files = map[string]string{ "html.usascii.html": "text/html; charset=us-ascii", "html.utf8.html": "text/html; charset=utf-8", "html.utf8bom.html": "text/html; charset=utf-8", + "html.bomws.html": "text/html; charset=utf-8", "html.withbr.html": "text/html; charset=utf-8", "ico.ico": "image/x-icon", "ics.dos.ics": "text/calendar", diff --git a/testdata/html.utf8bomws.html b/testdata/html.utf8bomws.html new file mode 100644 index 00000000..69fa3f3f --- /dev/null +++ b/testdata/html.utf8bomws.html @@ -0,0 +1,24 @@ + + + + + + + + + test + + + + + + + + + +
+ + From bc9575b24291d25c44a76e54d1db100faa85c952 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Tue, 5 Apr 2022 10:35:14 +0200 Subject: [PATCH 3/7] account for whitespace after BOM in html --- internal/magic/magic.go | 8 +- mimetype_test.go | 158 ++++++++++++++++++++-------------------- 2 files changed, 86 insertions(+), 80 deletions(-) diff --git a/internal/magic/magic.go b/internal/magic/magic.go index abab040b..2788803b 100644 --- a/internal/magic/magic.go +++ b/internal/magic/magic.go @@ -104,7 +104,13 @@ func xmlCheck(sig xmlSig, raw []byte) bool { // matches the raw input. func markup(sigs ...[]byte) Detector { return func(raw []byte, limit uint32) bool { - raw = trimLWS(raw) + if bytes.HasPrefix(raw, []byte{0xEF, 0xBB, 0xBF}) { + // we restore BOM after stripping WS so its presence can be used + // in subsequent functions + raw = append([]byte{0xEF, 0xBB, 0xBF}, trimLWS(raw[3:])...) + } else { + raw = trimLWS(raw) + } if len(raw) == 0 { return false } diff --git a/mimetype_test.go b/mimetype_test.go index 79d319c0..2b6c4565 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -18,85 +18,85 @@ const testDataDir = "testdata" // test files sorted by the file name in alphabetical order. var files = map[string]string{ - "3g2.3g2": "video/3gpp2", - "3gp.3gp": "video/3gpp", - "3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", - "7z.7z": "application/x-7z-compressed", - "a.a": "application/x-archive", - "aac.aac": "audio/aac", - "aaf.aaf": "application/octet-stream", - "accdb.accdb": "application/x-msaccess", - "aiff.aiff": "audio/aiff", - "amf.amf": "application/x-amf", - "amr.amr": "audio/amr", - "ape.ape": "audio/ape", - "apng.png": "image/vnd.mozilla.apng", - "asf.asf": "video/x-ms-asf", - "atom.atom": "application/atom+xml", - "au.au": "audio/basic", - "avi.avi": "video/x-msvideo", - "avif.avif": "image/avif", - "avifsequence.avif": "image/avif", - "bmp.bmp": "image/bmp", - "bpg.bpg": "image/bpg", - "bz2.bz2": "application/x-bzip2", - "cab.cab": "application/vnd.ms-cab-compressed", - "class.class": "application/x-java-applet", - "crx.crx": "application/x-chrome-extension", - "csv.csv": "text/csv", - "cpio.cpio": "application/x-cpio", - "dae.dae": "model/vnd.collada+xml", - "dbf.dbf": "application/x-dbf", - "dcm.dcm": "application/dicom", - "deb.deb": "application/vnd.debian.binary-package", - "djvu.djvu": "image/vnd.djvu", - "doc.doc": "application/msword", - "docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "drpm.rpm": "application/x-rpm", - "dwg.1.dwg": "image/vnd.dwg", - "dwg.dwg": "image/vnd.dwg", - "eot.eot": "application/vnd.ms-fontobject", - "epub.epub": "application/epub+zip", - "exe.exe": "application/vnd.microsoft.portable-executable", - "fdf.fdf": "application/vnd.fdf", - "fits.fits": "application/fits", - "flac.flac": "audio/flac", - "flv.flv": "video/x-flv", - "gbr.gbr": "image/x-gimp-gbr", - "geojson.1.geojson": "application/geo+json", - "geojson.geojson": "application/geo+json", - "gif.gif": "image/gif", - "glb.glb": "model/gltf-binary", - "gml.gml": "application/gml+xml", - "gpx.gpx": "application/gpx+xml", - "gz.gz": "application/gzip", - "har.har": "application/json", - "hdr.hdr": "image/vnd.radiance", - "heic.single.heic": "image/heic", - "heif.heif": "image/heif", - "html.html": "text/html; charset=utf-8", - "html.iso88591.html": "text/html; charset=iso-8859-1", - "html.svg.html": "text/html; charset=utf-8", - "html.usascii.html": "text/html; charset=us-ascii", - "html.utf8.html": "text/html; charset=utf-8", - "html.utf8bom.html": "text/html; charset=utf-8", - "html.bomws.html": "text/html; charset=utf-8", - "html.withbr.html": "text/html; charset=utf-8", - "ico.ico": "image/x-icon", - "ics.dos.ics": "text/calendar", - "ics.ics": "text/calendar", - "iso88591.txt": "text/plain; charset=iso-8859-1", - "jar.jar": "application/jar", - "jp2.jp2": "image/jp2", - "jpf.jpf": "image/jpx", - "jpg.jpg": "image/jpeg", - "jpm.jpm": "image/jpm", - "jxl.jxl": "image/jxl", - "xpm.xpm": "image/x-xpixmap", - "js.js": "application/javascript", - "json.json": "application/json", - "json.lowascii.json": "application/json", + "3g2.3g2": "video/3gpp2", + "3gp.3gp": "video/3gpp", + "3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", + "7z.7z": "application/x-7z-compressed", + "a.a": "application/x-archive", + "aac.aac": "audio/aac", + "aaf.aaf": "application/octet-stream", + "accdb.accdb": "application/x-msaccess", + "aiff.aiff": "audio/aiff", + "amf.amf": "application/x-amf", + "amr.amr": "audio/amr", + "ape.ape": "audio/ape", + "apng.png": "image/vnd.mozilla.apng", + "asf.asf": "video/x-ms-asf", + "atom.atom": "application/atom+xml", + "au.au": "audio/basic", + "avi.avi": "video/x-msvideo", + "avif.avif": "image/avif", + "avifsequence.avif": "image/avif", + "bmp.bmp": "image/bmp", + "bpg.bpg": "image/bpg", + "bz2.bz2": "application/x-bzip2", + "cab.cab": "application/vnd.ms-cab-compressed", + "class.class": "application/x-java-applet", + "crx.crx": "application/x-chrome-extension", + "csv.csv": "text/csv", + "cpio.cpio": "application/x-cpio", + "dae.dae": "model/vnd.collada+xml", + "dbf.dbf": "application/x-dbf", + "dcm.dcm": "application/dicom", + "deb.deb": "application/vnd.debian.binary-package", + "djvu.djvu": "image/vnd.djvu", + "doc.doc": "application/msword", + "docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "drpm.rpm": "application/x-rpm", + "dwg.1.dwg": "image/vnd.dwg", + "dwg.dwg": "image/vnd.dwg", + "eot.eot": "application/vnd.ms-fontobject", + "epub.epub": "application/epub+zip", + "exe.exe": "application/vnd.microsoft.portable-executable", + "fdf.fdf": "application/vnd.fdf", + "fits.fits": "application/fits", + "flac.flac": "audio/flac", + "flv.flv": "video/x-flv", + "gbr.gbr": "image/x-gimp-gbr", + "geojson.1.geojson": "application/geo+json", + "geojson.geojson": "application/geo+json", + "gif.gif": "image/gif", + "glb.glb": "model/gltf-binary", + "gml.gml": "application/gml+xml", + "gpx.gpx": "application/gpx+xml", + "gz.gz": "application/gzip", + "har.har": "application/json", + "hdr.hdr": "image/vnd.radiance", + "heic.single.heic": "image/heic", + "heif.heif": "image/heif", + "html.html": "text/html; charset=utf-8", + "html.iso88591.html": "text/html; charset=iso-8859-1", + "html.svg.html": "text/html; charset=utf-8", + "html.usascii.html": "text/html; charset=us-ascii", + "html.utf8.html": "text/html; charset=utf-8", + "html.utf8bom.html": "text/html; charset=utf-8", + "html.utf8bomws.html": "text/html; charset=utf-8", + "html.withbr.html": "text/html; charset=utf-8", + "ico.ico": "image/x-icon", + "ics.dos.ics": "text/calendar", + "ics.ics": "text/calendar", + "iso88591.txt": "text/plain; charset=iso-8859-1", + "jar.jar": "application/jar", + "jp2.jp2": "image/jp2", + "jpf.jpf": "image/jpx", + "jpg.jpg": "image/jpeg", + "jpm.jpm": "image/jpm", + "jxl.jxl": "image/jxl", + "xpm.xpm": "image/x-xpixmap", + "js.js": "application/javascript", + "json.json": "application/json", + "json.lowascii.json": "application/json", // json.{int,float,string}.txt contain a single JSON value. They are valid JSON // documents, but they should not be detected as application/json. This mimics // the behaviour of the file utility and seems the correct thing to do. From 7abfb09f2cd7c875d9b75a3e8b559bfe6652f0ce Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Thu, 7 Apr 2022 00:37:35 +0200 Subject: [PATCH 4/7] strip BOM in magic.go ensure BOM has precedence in html encoding detection add tests for BOM detection override --- internal/charset/charset.go | 3 + internal/charset/charset_test.go | 30 ++++++ internal/magic/magic.go | 2 +- mimetype_test.go | 159 ++++++++++++++++--------------- testdata/html.utf8bomdetect.html | 24 +++++ 5 files changed, 138 insertions(+), 80 deletions(-) create mode 100644 testdata/html.utf8bomdetect.html diff --git a/internal/charset/charset.go b/internal/charset/charset.go index b0e1388b..83cfd6f4 100644 --- a/internal/charset/charset.go +++ b/internal/charset/charset.go @@ -160,6 +160,9 @@ func fromXML(content []byte) string { // FromHTML returns the charset of an HTML document. It relies on the meta tag // and falls back on the plain text content. func FromHTML(content []byte) string { + if cset := FromBOM(content); cset != "" { + return cset + } if cset := fromHTML(content); cset != "" { return cset } diff --git a/internal/charset/charset_test.go b/internal/charset/charset_test.go index d9b8603e..387c1ec2 100644 --- a/internal/charset/charset_test.go +++ b/internal/charset/charset_test.go @@ -24,6 +24,29 @@ const htmlDoc = ` ` +const htmlDocWithIncorrectCharset = ` + + + + + + + test + + + + + + + + + +
+ +` func TestFromXML(t *testing.T) { charset := FromXML([]byte(xmlDoc)) @@ -39,6 +62,13 @@ func TestFromHTML(t *testing.T) { } } +func TestFromHTMLWithBOM(t *testing.T) { + charset := FromHTML(append([]byte{0xEF, 0xBB, 0xBF}, []byte(htmlDocWithIncorrectCharset)...)) + if charset != "utf-8" { + t.Errorf("expected: utf-8; got: %s", charset) + } +} + func TestFromPlain(t *testing.T) { tcases := []struct { raw []byte diff --git a/internal/magic/magic.go b/internal/magic/magic.go index 2788803b..f20d1aef 100644 --- a/internal/magic/magic.go +++ b/internal/magic/magic.go @@ -107,7 +107,7 @@ func markup(sigs ...[]byte) Detector { if bytes.HasPrefix(raw, []byte{0xEF, 0xBB, 0xBF}) { // we restore BOM after stripping WS so its presence can be used // in subsequent functions - raw = append([]byte{0xEF, 0xBB, 0xBF}, trimLWS(raw[3:])...) + raw = trimLWS(raw[3:]) } else { raw = trimLWS(raw) } diff --git a/mimetype_test.go b/mimetype_test.go index 2b6c4565..06325362 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -18,85 +18,86 @@ const testDataDir = "testdata" // test files sorted by the file name in alphabetical order. var files = map[string]string{ - "3g2.3g2": "video/3gpp2", - "3gp.3gp": "video/3gpp", - "3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", - "7z.7z": "application/x-7z-compressed", - "a.a": "application/x-archive", - "aac.aac": "audio/aac", - "aaf.aaf": "application/octet-stream", - "accdb.accdb": "application/x-msaccess", - "aiff.aiff": "audio/aiff", - "amf.amf": "application/x-amf", - "amr.amr": "audio/amr", - "ape.ape": "audio/ape", - "apng.png": "image/vnd.mozilla.apng", - "asf.asf": "video/x-ms-asf", - "atom.atom": "application/atom+xml", - "au.au": "audio/basic", - "avi.avi": "video/x-msvideo", - "avif.avif": "image/avif", - "avifsequence.avif": "image/avif", - "bmp.bmp": "image/bmp", - "bpg.bpg": "image/bpg", - "bz2.bz2": "application/x-bzip2", - "cab.cab": "application/vnd.ms-cab-compressed", - "class.class": "application/x-java-applet", - "crx.crx": "application/x-chrome-extension", - "csv.csv": "text/csv", - "cpio.cpio": "application/x-cpio", - "dae.dae": "model/vnd.collada+xml", - "dbf.dbf": "application/x-dbf", - "dcm.dcm": "application/dicom", - "deb.deb": "application/vnd.debian.binary-package", - "djvu.djvu": "image/vnd.djvu", - "doc.doc": "application/msword", - "docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "drpm.rpm": "application/x-rpm", - "dwg.1.dwg": "image/vnd.dwg", - "dwg.dwg": "image/vnd.dwg", - "eot.eot": "application/vnd.ms-fontobject", - "epub.epub": "application/epub+zip", - "exe.exe": "application/vnd.microsoft.portable-executable", - "fdf.fdf": "application/vnd.fdf", - "fits.fits": "application/fits", - "flac.flac": "audio/flac", - "flv.flv": "video/x-flv", - "gbr.gbr": "image/x-gimp-gbr", - "geojson.1.geojson": "application/geo+json", - "geojson.geojson": "application/geo+json", - "gif.gif": "image/gif", - "glb.glb": "model/gltf-binary", - "gml.gml": "application/gml+xml", - "gpx.gpx": "application/gpx+xml", - "gz.gz": "application/gzip", - "har.har": "application/json", - "hdr.hdr": "image/vnd.radiance", - "heic.single.heic": "image/heic", - "heif.heif": "image/heif", - "html.html": "text/html; charset=utf-8", - "html.iso88591.html": "text/html; charset=iso-8859-1", - "html.svg.html": "text/html; charset=utf-8", - "html.usascii.html": "text/html; charset=us-ascii", - "html.utf8.html": "text/html; charset=utf-8", - "html.utf8bom.html": "text/html; charset=utf-8", - "html.utf8bomws.html": "text/html; charset=utf-8", - "html.withbr.html": "text/html; charset=utf-8", - "ico.ico": "image/x-icon", - "ics.dos.ics": "text/calendar", - "ics.ics": "text/calendar", - "iso88591.txt": "text/plain; charset=iso-8859-1", - "jar.jar": "application/jar", - "jp2.jp2": "image/jp2", - "jpf.jpf": "image/jpx", - "jpg.jpg": "image/jpeg", - "jpm.jpm": "image/jpm", - "jxl.jxl": "image/jxl", - "xpm.xpm": "image/x-xpixmap", - "js.js": "application/javascript", - "json.json": "application/json", - "json.lowascii.json": "application/json", + "3g2.3g2": "video/3gpp2", + "3gp.3gp": "video/3gpp", + "3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", + "7z.7z": "application/x-7z-compressed", + "a.a": "application/x-archive", + "aac.aac": "audio/aac", + "aaf.aaf": "application/octet-stream", + "accdb.accdb": "application/x-msaccess", + "aiff.aiff": "audio/aiff", + "amf.amf": "application/x-amf", + "amr.amr": "audio/amr", + "ape.ape": "audio/ape", + "apng.png": "image/vnd.mozilla.apng", + "asf.asf": "video/x-ms-asf", + "atom.atom": "application/atom+xml", + "au.au": "audio/basic", + "avi.avi": "video/x-msvideo", + "avif.avif": "image/avif", + "avifsequence.avif": "image/avif", + "bmp.bmp": "image/bmp", + "bpg.bpg": "image/bpg", + "bz2.bz2": "application/x-bzip2", + "cab.cab": "application/vnd.ms-cab-compressed", + "class.class": "application/x-java-applet", + "crx.crx": "application/x-chrome-extension", + "csv.csv": "text/csv", + "cpio.cpio": "application/x-cpio", + "dae.dae": "model/vnd.collada+xml", + "dbf.dbf": "application/x-dbf", + "dcm.dcm": "application/dicom", + "deb.deb": "application/vnd.debian.binary-package", + "djvu.djvu": "image/vnd.djvu", + "doc.doc": "application/msword", + "docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "drpm.rpm": "application/x-rpm", + "dwg.1.dwg": "image/vnd.dwg", + "dwg.dwg": "image/vnd.dwg", + "eot.eot": "application/vnd.ms-fontobject", + "epub.epub": "application/epub+zip", + "exe.exe": "application/vnd.microsoft.portable-executable", + "fdf.fdf": "application/vnd.fdf", + "fits.fits": "application/fits", + "flac.flac": "audio/flac", + "flv.flv": "video/x-flv", + "gbr.gbr": "image/x-gimp-gbr", + "geojson.1.geojson": "application/geo+json", + "geojson.geojson": "application/geo+json", + "gif.gif": "image/gif", + "glb.glb": "model/gltf-binary", + "gml.gml": "application/gml+xml", + "gpx.gpx": "application/gpx+xml", + "gz.gz": "application/gzip", + "har.har": "application/json", + "hdr.hdr": "image/vnd.radiance", + "heic.single.heic": "image/heic", + "heif.heif": "image/heif", + "html.html": "text/html; charset=utf-8", + "html.iso88591.html": "text/html; charset=iso-8859-1", + "html.svg.html": "text/html; charset=utf-8", + "html.usascii.html": "text/html; charset=us-ascii", + "html.utf8.html": "text/html; charset=utf-8", + "html.utf8bom.html": "text/html; charset=utf-8", + "html.utf8bomws.html": "text/html; charset=utf-8", + "html.utf8bomdetect.html": "text/html; charset=utf-8", + "html.withbr.html": "text/html; charset=utf-8", + "ico.ico": "image/x-icon", + "ics.dos.ics": "text/calendar", + "ics.ics": "text/calendar", + "iso88591.txt": "text/plain; charset=iso-8859-1", + "jar.jar": "application/jar", + "jp2.jp2": "image/jp2", + "jpf.jpf": "image/jpx", + "jpg.jpg": "image/jpeg", + "jpm.jpm": "image/jpm", + "jxl.jxl": "image/jxl", + "xpm.xpm": "image/x-xpixmap", + "js.js": "application/javascript", + "json.json": "application/json", + "json.lowascii.json": "application/json", // json.{int,float,string}.txt contain a single JSON value. They are valid JSON // documents, but they should not be detected as application/json. This mimics // the behaviour of the file utility and seems the correct thing to do. diff --git a/testdata/html.utf8bomdetect.html b/testdata/html.utf8bomdetect.html new file mode 100644 index 00000000..c7767727 --- /dev/null +++ b/testdata/html.utf8bomdetect.html @@ -0,0 +1,24 @@ + + + + + + + + + test + + + + + + + + + +
+ + From 29effd9265ddf2f3cc729e4b18542df45b689195 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Thu, 7 Apr 2022 00:42:52 +0200 Subject: [PATCH 5/7] update FromHTML comment with modified behaviour --- internal/charset/charset.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/charset/charset.go b/internal/charset/charset.go index 83cfd6f4..0647f730 100644 --- a/internal/charset/charset.go +++ b/internal/charset/charset.go @@ -157,8 +157,10 @@ func fromXML(content []byte) string { return strings.ToLower(xmlEncoding(string(t.Inst))) } -// FromHTML returns the charset of an HTML document. It relies on the meta tag -// and falls back on the plain text content. +// FromHTML returns the charset of an HTML document. It first looks if a BOM is +// present and if so uses it to determine the charset. If no BOM is present, +// it relies on the meta tag and falls back on the +// plain text content. func FromHTML(content []byte) string { if cset := FromBOM(content); cset != "" { return cset From e4f5c6c96533c3e35b8bd01f5161308cba48576c Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Thu, 7 Apr 2022 09:29:42 +0200 Subject: [PATCH 6/7] remove superfluous BOM spec in markup --- internal/magic/text.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/magic/text.go b/internal/magic/text.go index f4d50465..e2a03caf 100644 --- a/internal/magic/text.go +++ b/internal/magic/text.go @@ -13,7 +13,6 @@ import ( var ( // HTML matches a Hypertext Markup Language file. HTML = markup( - append([]byte{0xEF, 0xBB, 0xBF}, []byte(" Date: Sun, 10 Apr 2022 10:43:33 +0200 Subject: [PATCH 7/7] update comment on UTF-8 BOM stripping --- internal/magic/magic.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/internal/magic/magic.go b/internal/magic/magic.go index f20d1aef..466058fb 100644 --- a/internal/magic/magic.go +++ b/internal/magic/magic.go @@ -105,8 +105,9 @@ func xmlCheck(sig xmlSig, raw []byte) bool { func markup(sigs ...[]byte) Detector { return func(raw []byte, limit uint32) bool { if bytes.HasPrefix(raw, []byte{0xEF, 0xBB, 0xBF}) { - // we restore BOM after stripping WS so its presence can be used - // in subsequent functions + // We skip the UTF-8 BOM if present to ensure we correctly + // process any leading whitespace. The presence of the BOM + // is taken into account during charset detection in charset.go. raw = trimLWS(raw[3:]) } else { raw = trimLWS(raw)