forked from antchfx/antch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
html.go
95 lines (84 loc) · 2.1 KB
/
html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package antch
import (
"bytes"
"fmt"
"io"
"mime"
"net/http"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/htmlindex"
"golang.org/x/text/transform"
)
// MediaType describes the content type of an HTTP request or response,
// split into the media type itself and its charset parameter.
type MediaType struct {
	// Type is the media type without any parameters, such as
	// "text/html" or "image/jpeg".
	Type string
	// Charset is the character encoding label that accompanies Type.
	Charset string
}
// ContentType formats m as a Content-Type header value. The charset
// parameter is appended only when both Type and Charset are set;
// otherwise Type is returned on its own.
func (m MediaType) ContentType() string {
	if m.Type == "" || len(m.Charset) == 0 {
		return m.Type
	}
	return fmt.Sprintf("%s; charset=%s", m.Type, m.Charset)
}
// ParseMediaType parses v, a Content-Type style header value, into a
// MediaType.
//
// The zero MediaType is returned when v is empty or its media type cannot
// be parsed. When only the parameter list is malformed,
// mime.ParseMediaType still reports the media type together with
// mime.ErrInvalidMediaParameter; that type is kept and Charset is left
// empty rather than discarding the whole value.
func ParseMediaType(v string) MediaType {
	if v == "" {
		return MediaType{}
	}
	mimetype, params, err := mime.ParseMediaType(v)
	if err != nil && err != mime.ErrInvalidMediaParameter {
		return MediaType{}
	}
	// params is nil when the parameters were invalid; indexing a nil map
	// yields the empty string.
	return MediaType{
		Type:    mimetype,
		Charset: params["charset"],
	}
}
// readResponseBody returns a reader over resp.Body that decodes the body
// from its character encoding.
//
// When the Content-Type header carries a charset parameter, that label is
// looked up with htmlindex.Get and a lookup failure is returned as the
// error. Otherwise up to 1024 bytes of the body are read ahead and handed
// to charset.DetermineEncoding; the consumed bytes are put back in front
// of the returned reader so no content is lost.
//
// The function does not close resp.Body.
func readResponseBody(resp *http.Response) (io.Reader, error) {
	var (
		ce encoding.Encoding
		r  io.Reader = resp.Body
	)

	mediatype := ParseMediaType(resp.Header.Get("Content-Type"))
	if mediatype.Charset == "" {
		// The header names no charset: read ahead up to 1024 bytes and
		// let the sniffer pick an encoding from them.
		preview := make([]byte, 1024)
		n, err := io.ReadFull(r, preview)
		switch {
		case err == io.EOF || err == io.ErrUnexpectedEOF:
			// io.ReadFull reports io.EOF for an empty body and
			// io.ErrUnexpectedEOF for one shorter than the buffer. In
			// both cases the whole body is already in preview[:n].
			preview = preview[:n]
			r = bytes.NewReader(preview)
		case err != nil:
			return nil, err
		default:
			// The buffer filled up: replay it ahead of the unread rest.
			r = io.MultiReader(bytes.NewReader(preview), r)
		}
		ce, _, _ = charset.DetermineEncoding(preview, "")
	} else {
		e, err := htmlindex.Get(mediatype.Charset)
		if err != nil {
			return nil, err
		}
		ce = e
	}

	// encoding.Nop needs no decoding step, so the reader is returned as is.
	if ce != encoding.Nop {
		r = transform.NewReader(r, ce.NewDecoder())
	}
	return r, nil
}
// ParseHTML reads the body of resp through readResponseBody, which
// decodes it from its character encoding, and parses the decoded stream
// into an HTML node tree.
func ParseHTML(resp *http.Response) (*html.Node, error) {
	decoded, err := readResponseBody(resp)
	if err != nil {
		return nil, err
	}
	doc, err := htmlquery.Parse(decoded)
	return doc, err
}