diff --git a/internal/charset/charset.go b/internal/charset/charset.go index b0e1388b..0647f730 100644 --- a/internal/charset/charset.go +++ b/internal/charset/charset.go @@ -157,9 +157,14 @@ func fromXML(content []byte) string { return strings.ToLower(xmlEncoding(string(t.Inst))) } -// FromHTML returns the charset of an HTML document. It relies on the meta tag -// and falls back on the plain text content. +// FromHTML returns the charset of an HTML document. It first looks if a BOM is +// present and if so uses it to determine the charset. If no BOM is present, +// it relies on the meta tag and falls back on the +// plain text content. func FromHTML(content []byte) string { + if cset := FromBOM(content); cset != "" { + return cset + } if cset := fromHTML(content); cset != "" { return cset } diff --git a/internal/charset/charset_test.go b/internal/charset/charset_test.go index d9b8603e..387c1ec2 100644 --- a/internal/charset/charset_test.go +++ b/internal/charset/charset_test.go @@ -24,6 +24,29 @@ const htmlDoc = `