-
Notifications
You must be signed in to change notification settings - Fork 8
/
readability_test.go
179 lines (154 loc) · 6.22 KB
/
readability_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
package readability
import (
"strings"
"testing"
"github.com/PuerkitoBio/goquery"
"github.com/stretchr/testify/assert"
)
// Remote pages used as fixtures by the extraction tests in this file.
var (
	// urlWithAbsoluteImgPaths is the page fetched by
	// TestExtractWhenImgPathsAreAbsolute and TestAuthor.
	urlWithAbsoluteImgPaths = "http://www.espn.com/nba/insider/story/_/id/22450965/drafting-nba-rising-stars-future-star-potential-ben-simmons-lonzo-ball-joel-embiid-more"

	// urlWithRelativeImgPaths is the page fetched by
	// TestExtractWhenImgPathsAreRelative.
	urlWithRelativeImgPaths = "http://www.boogiejack.com/server_paths.html"
)
// TestExtractWhenImgPathsAreAbsolute runs Extract on urlWithAbsoluteImgPaths
// twice, first with LookupOpenGraphTags disabled and then enabled, and checks
// the title, description and images reported in each mode.
//
// Extract is handed a remote URL, so this presumably needs network access
// (confirm against Extract). The test aborts as soon as Extract reports an
// error rather than going on to read fields of the result.
func TestExtractWhenImgPathsAreAbsolute(t *testing.T) {
	opt := NewOption()
	opt.ImageRequestTimeout = 500

	// not using opengraph (traditional readability rule)
	opt.LookupOpenGraphTags = false
	c, err := Extract(urlWithAbsoluteImgPaths, opt)
	if err != nil {
		t.Fatalf("Extract(%q) without OpenGraph lookup: %v", urlWithAbsoluteImgPaths, err)
	}
	assert.Equal(t, "Drafting NBA rising stars by future star potential - Ben Simmons, Lonzo Ball, Joel Embiid and more", c.Title)
	assert.Equal(t, " ABOUT COOKIES To help make this website better, to improve and personalize your experience and for advertising purposes, are you happy to accept cookies and other technologies? Yes More Info Here Cookie Choices ", c.Description)
	assert.NotContains(t, c.Description, "\n")
	assert.Empty(t, c.Images) // empty since images are lazily-loaded

	// using opengraph
	opt.LookupOpenGraphTags = true
	c, err = Extract(urlWithAbsoluteImgPaths, opt)
	if err != nil {
		t.Fatalf("Extract(%q) with OpenGraph lookup: %v", urlWithAbsoluteImgPaths, err)
	}
	assert.Equal(t, "Drafting Embiid, Ball and NBA Rising Stars by future potential", c.Title)
	assert.Equal(t, "We draft the best NBA rookies and sophomores to build two teams for five years from now. Who you got?", c.Description)
	assert.NotEmpty(t, c.Images)
	assert.Len(t, c.Images, 1)
}
// TestExtractWhenImgPathsAreRelative runs Extract on urlWithRelativeImgPaths
// and checks that a title, a description and at least one image come back,
// and that neither the title nor the description contains a newline.
//
// The test aborts when Extract reports an error instead of going on to read
// fields of the result.
func TestExtractWhenImgPathsAreRelative(t *testing.T) {
	opt := NewOption()
	opt.ImageRequestTimeout = 500

	c, err := Extract(urlWithRelativeImgPaths, opt)
	if err != nil {
		t.Fatalf("Extract(%q): %v", urlWithRelativeImgPaths, err)
	}

	assert.NotEmpty(t, c.Title)
	assert.NotContains(t, c.Title, "\n")
	assert.NotEmpty(t, c.Description)
	assert.NotContains(t, c.Description, "\n")
	assert.NotEmpty(t, c.Images)
}
// TestExtractForImages runs Extract on a photo-gallery page with image
// filtering options set and checks that exactly opt.MaxImageCount images are
// returned.
//
// The error from Extract is checked so that a failed extraction is reported
// as a test failure rather than surfacing while reading c.Images.
func TestExtractForImages(t *testing.T) {
	u := "http://www.orangesmile.com/travelguide/palermo/photo-gallery.htm"

	opt := NewOption()
	opt.IgnoreImageFormat = []string{"data:image/", ".svg", ".webp", ".gif"}
	opt.ImageRequestTimeout = 2000
	opt.CheckImageLoopCount = 20
	opt.MaxImageCount = 3
	opt.MinImageWidth = 300
	opt.MinImageHeight = 300

	c, err := Extract(u, opt)
	if err != nil {
		t.Fatalf("Extract(%q): %v", u, err)
	}
	assert.Equal(t, opt.MaxImageCount, len(c.Images))
}
// TestPattern exercises two of the expressions returned by newPattern: Video
// must find nothing in an "ITUBE" URL but something in a "YOUTUBE" URL, and
// UnlikelyCandidates must find something in "My Comment".
func TestPattern(t *testing.T) {
	patterns := newPattern()

	itubeMatch := patterns.Video.FindString("http://WWW.ITUBE.COM")
	assert.Empty(t, itubeMatch)

	youtubeMatch := patterns.Video.FindString("http://WWW.YOUTUBE.COM")
	assert.NotEmpty(t, youtubeMatch)

	commentMatch := patterns.UnlikelyCandidates.FindString("My Comment")
	assert.NotEmpty(t, commentMatch)
}
// TestClassWeight checks classWeight against two elements of a small inline
// document: the <div> is expected to score 50.0 and the <a> -25.0, each with
// a fresh NewOption().
func TestClassWeight(t *testing.T) {
	markup := `<div id="main-article" class="text blog">for positive class weight</div>
<a id="footer-link" class="btn" href="#">for negative class weight</a>`

	// The parse error is discarded; the markup is a fixed in-memory literal.
	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(markup))

	expectations := []struct {
		selector string
		weight   float64
	}{
		{selector: "div", weight: 50.0},
		{selector: "a", weight: -25.0},
	}
	for _, e := range expectations {
		node := doc.Find(e.selector).First()
		assert.Equal(t, e.weight, classWeight(node, NewOption()))
	}
}
// TestLinkDensity checks that linkDensity returns 0.2 for a <div> whose text
// is 20 characters long, 4 of which ("123" and "4") sit inside <a> elements.
// NOTE(review): presumably the result is link text length over total text
// length — confirm against linkDensity.
func TestLinkDensity(t *testing.T) {
	reader := strings.NewReader(`<div>Speak blah blah!<a>123</a><a>4</a></div>`)

	// The parse error is discarded; the markup is a fixed in-memory literal.
	doc, _ := goquery.NewDocumentFromReader(reader)

	density := linkDensity(doc.Selection)
	assert.Equal(t, 0.2, density)
}
// TestAbsPath checks absPath in three groups, in this order: inputs that must
// resolve without error to a given string, the empty input (empty result and
// a non-nil error), and inputs for which only a non-nil error is required.
func TestAbsPath(t *testing.T) {
	kakaoTalk := "http://www.kakao.com/talk"
	wtoPage := "https://www.wto.org/english/tratop_e/envir_e/envir_req_e.htm"

	resolvable := []struct {
		path   string
		reqURL string
		want   string
	}{
		// for absolute path
		{path: kakaoTalk + "/img/a.jpg", reqURL: kakaoTalk, want: kakaoTalk + "/img/a.jpg"},
		// for relative path starting with "/"
		{path: "/img/b.jpg", reqURL: kakaoTalk, want: "http://www.kakao.com/img/b.jpg"},
		// for relative path not starting with "/" and reqURL does not have subdirectories
		{path: "img/b.jpg", reqURL: "http://www.kakao.com", want: "http://www.kakao.com/img/b.jpg"},
		// for relative path not starting with "/"
		{path: "../../../images/top_logo.gif", reqURL: wtoPage, want: "https://www.wto.org/english/tratop_e/envir_e/../../../images/top_logo.gif"},
	}
	for _, tc := range resolvable {
		got, err := absPath(tc.path, tc.reqURL)
		assert.Nil(t, err)
		assert.Equal(t, tc.want, got)
	}

	// for empty input path
	got, err := absPath("", wtoPage)
	assert.Equal(t, "", got)
	assert.NotNil(t, err)

	rejected := []struct {
		path   string
		reqURL string
	}{
		// failing case - invalid input path
		{path: "fhsjkdfhjsdf#$%^#&^", reqURL: "http://www.kakao.com"},
		// failing case - invalid requestURL string
		{path: "/a.jpg", reqURL: "yirqywi8r4o"},
	}
	for _, tc := range rejected {
		_, err := absPath(tc.path, tc.reqURL)
		assert.NotNil(t, err)
	}
}
// TestAbsPathWithoutScheme checks that absPath, given an input that starts
// with "//" and an https request URL, returns the input prefixed with
// "https:" and no error.
func TestAbsPathWithoutScheme(t *testing.T) {
	pageURL := "https://brunch.co.kr/@julieted17/19"
	schemeless := "//t1.daumcdn.net/brunch/static/icon/favicon/favicon64_150520.ico"
	expected := "https:" + schemeless

	resolved, err := absPath(schemeless, pageURL)

	assert.Nil(t, err)
	assert.Equal(t, expected, resolved)
}
// TestDescriptionTimeout runs Extract with DescriptionExtractionTimeout set
// to 10 and checks that the call still succeeds with a non-nil result whose
// Description and Images are both empty.
//
// The error and nil checks stop the test, so the field assertions never run
// against a missing result.
func TestDescriptionTimeout(t *testing.T) {
	pageURL := "https://tools.ietf.org/rfc/"

	opt := NewOption()
	opt.DescriptionExtractionTimeout = 10

	c, err := Extract(pageURL, opt)
	if err != nil {
		t.Fatalf("Extract(%q): %v", pageURL, err)
	}
	// assert.NotNil reports the failure itself; only continue when it passed.
	if !assert.NotNil(t, c) {
		return
	}

	assert.Empty(t, c.Description)
	assert.Empty(t, c.Images)
}
// TestAuthor checks author against one remote document and three inline HTML
// snippets, each of which carries the author name in a different place.
//
// Errors from building the documents are reported instead of being discarded.
// A failed fetch of the remote page is recorded as a failure but does not
// prevent the inline cases from running.
func TestAuthor(t *testing.T) {
	// <span class='author'>Jonathan Givony and Mike Schmitz</span>
	doc, err := goquery.NewDocument(urlWithAbsoluteImgPaths)
	if err != nil {
		t.Errorf("fetching %q: %v", urlWithAbsoluteImgPaths, err)
	} else {
		assert.Equal(t, "Jonathan Givony and Mike Schmitz", author(doc))
	}

	inline := []struct {
		html string
		want string
	}{
		// <meta name="dc.creator" content="Finch" />
		{html: `<head><meta name="dc.creator" content="Finch" /></head>`, want: "Finch"},
		// <meta name="author" content="philip" />
		{html: `<head><meta name="author" content="philip" /></head>`, want: "philip"},
		// <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
		{html: `<a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>`, want: "Danny Banks (rel)"},
	}
	for _, tc := range inline {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
		if err != nil {
			t.Errorf("parsing %q: %v", tc.html, err)
			continue
		}
		assert.Equal(t, tc.want, author(doc))
	}
}
// TestForOpengraph runs Extract with the options returned by NewOption on a
// Roads & Kingdoms article and checks the exact title and description.
//
// The error and nil checks stop the test, so the field assertions never run
// against a missing result.
func TestForOpengraph(t *testing.T) {
	pageURL := "https://roadsandkingdoms.com/2019/rk-insider-going-dublin/"

	opt := NewOption()
	c, err := Extract(pageURL, opt)
	if err != nil {
		t.Fatalf("Extract(%q): %v", pageURL, err)
	}
	// assert.NotNil reports the failure itself; only continue when it passed.
	if !assert.NotNil(t, c) {
		return
	}

	assert.Equal(t, "R&K Insider: Going to Dublin", c.Title)
	assert.Equal(t, "This week on R&K: What to know before you go to Dublin, a ridiculously calorific breakfast in Norway, and how to hunt for food in Tokyo.", c.Description)
}