-
Notifications
You must be signed in to change notification settings - Fork 2
/
tagify_test.go
237 lines (218 loc) · 6.38 KB
/
tagify_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
package tagify
import (
	"context"
	"fmt"
	"log"
	"net"
	"net/http"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"golang.org/x/net/html"

	"github.com/zoomio/tagify/config"
	"github.com/zoomio/tagify/extension"
	"github.com/zoomio/tagify/model"
	thtml "github.com/zoomio/tagify/processor/html"
)
// ctx is the context shared by the tests in this file; it is passed to Run
// and to http.Server.Shutdown when the test server is stopped.
var ctx = context.TODO()
// runTests is the case table driving Test_Run_HTML. Every case names the
// options handed to Run and the tags, document title and document hash the
// result is expected to carry.
var runTests = []struct {
	name        string
	in          []Option
	expectTags  []string
	expectTitle string
	expectHash  string
}{
	{
		name: "run",
		in: []Option{
			Source(fmt.Sprintf("http://localhost:%d", port)),
			TargetType(HTML),
			Limit(5),
			NoStopWords(true),
			ContentOnly(true),
		},
		expectTags:  []string{"test", "boy", "bang", "befell", "began"},
		expectTitle: "Test",
		expectHash:  "07b4291477fc6404cc89497b5665464c0d07c7e3358f4b84e3df4b9793d4aaa0787b0980b99649285c0aa08e30c9910f092a6b47ed48385555b65c34c6e82af4",
	},
	{
		name: "run with query",
		in: []Option{
			Source(fmt.Sprintf("http://localhost:%d", port)),
			TargetType(HTML),
			Limit(5),
			NoStopWords(true),
			Query("#box3 p"),
			ContentOnly(true),
		},
		expectTags:  []string{"bang", "began", "boy", "day", "eat"},
		expectTitle: "",
		expectHash:  "e5e0aef65e77e87a3e23a3f157357444910f94f5dccd5d0fe185da73cb72a8b7bff6ac80d71cfca1da27e9d1b7a3e810a348ceeee52c2e4b68393c8ba5d92cc4",
	},
	{
		name: "run custom weights",
		in: []Option{
			Source(fmt.Sprintf("http://localhost:%d", port)),
			TargetType(HTML),
			Limit(5),
			NoStopWords(true),
			TagWeightsString("title:3"),
		},
		expectTags:  []string{"test"},
		expectTitle: "Test",
		expectHash:  "20c62640489dbc272c51abfd1fbe7b5aa7280f814fbfdb2baf993fb1e8b4c860fb1f1c6964760144e2ef15849ef073f47cb89284481d17845565395d7574e2e7",
	},
}
// Test_Run_HTML serves indexHTML from a local HTTP server and runs every case
// in runTests against it, checking the reported content type, document title,
// document hash and the set of returned tags.
func Test_Run_HTML(t *testing.T) {
	defer stopServer(startServer(fmt.Sprintf(":%d", port), indexHTML))

	for _, tt := range runTests {
		t.Run(tt.name, func(t *testing.T) {
			res, err := Run(ctx, tt.in...)
			// Stop on failure: res is not expected to be usable when Run
			// reports an error, and dereferencing it could panic and hide
			// the actual error from the test output.
			if !assert.NoError(t, err) {
				return
			}
			assert.Equal(t, HTML, res.Meta.ContentType)
			assert.Equal(t, tt.expectTitle, res.Meta.DocTitle)
			assert.Equal(t, tt.expectHash, res.Meta.DocHash)
			// tag order is not part of the expectation, only membership
			assert.ElementsMatch(t, tt.expectTags, res.TagsStrings())
		})
	}
}
// Test_GetTagsFromString runs tagify over an in-memory plain-text input with a
// limit of 3 and checks the number of returned tags and the document hash.
func Test_GetTagsFromString(t *testing.T) {
	res, err := Run(ctx,
		Content("Test input reader of type text"),
		TargetType(Text),
		Limit(3),
		NoStopWords(true),
	)
	// Stop on failure: res is not expected to be usable when Run reports an
	// error, and dereferencing it could panic and hide the actual error.
	if !assert.NoError(t, err) {
		return
	}
	assert.Len(t, res.Tags, 3)
	assert.Equal(t,
		"7d95ed3e8436c978f3e7f19f1645f89091f9fdb0439c15547f0a6f82bc4a0babebd06ff6285d9dff8db77861edf2cc8e6919ea5613bec0f30dba24bace839dda",
		res.Meta.DocHash)
}
// Test_ToStrings checks that model.ToStrings yields one string per tag for a
// plain-text run limited to 3 tags.
func Test_ToStrings(t *testing.T) {
	res, err := Run(ctx,
		Content("Test input reader of type text"),
		TargetType(Text),
		Limit(3),
		NoStopWords(true),
	)
	// Stop on failure: res is not expected to be usable when Run reports an
	// error, and dereferencing it could panic and hide the actual error.
	if !assert.NoError(t, err) {
		return
	}
	strs := model.ToStrings(res.Tags)
	assert.Len(t, strs, 3)
}
// Test_CustomHTML serves a saved HTML fixture from a local HTTP server and
// runs tagify against it with the customHTML extension registered. It checks
// that exactly one extension result is reported, that the document title is
// picked up, and that the extension captured the expected text.
func Test_CustomHTML(t *testing.T) {
	ytPage, err := os.ReadFile("_resources_test/html/yt_page.html")
	if err != nil {
		// Without the fixture the server would serve an empty page and the
		// assertions below would fail for a misleading reason.
		t.Fatalf("reading HTML fixture: %v", err)
	}

	ext := &customHTML{}
	defer stopServer(startServer(fmt.Sprintf(":%d", port), string(ytPage)))

	res, err := Run(ctx,
		Source(fmt.Sprintf("http://localhost:%d", port)),
		Limit(2),
		TargetType(HTML),
		NoStopWords(true),
		ExtraTagWeightsString("link:0"),
		Extensions([]extension.Extension{ext}),
	)
	// Stop on failure: res is not expected to be usable when Run reports an
	// error, and dereferencing it could panic and hide the actual error.
	if !assert.NoError(t, err) {
		return
	}
	assert.Len(t, res.Extensions, 1)
	assert.Equal(t, "Next Level Reynolds - YouTube", res.Meta.DocTitle)
	assert.Equal(t, "Ryan Reynolds", ext.text)
	/* var found int
	res.ForEach(func(i int, tag *model.Tag) {
	if tag.Value == "ryan" || tag.Value == "reynolds" {
	found++
	}
	})
	assert.Equal(t, 2, found) */
}
// startServer starts an HTTP server on addr that answers every request with
// pageHTML, served as "text/html; charset=utf-8".
//
// The listening socket is bound before startServer returns, so the caller can
// issue requests immediately without racing against the serving goroutine.
// An empty addr means ":http", matching http.Server.ListenAndServe.
//
// The process is terminated through log.Fatalf if addr cannot be bound or if
// the server stops for any reason other than being closed or shut down.
func startServer(addr string, pageHTML string) *http.Server {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(res http.ResponseWriter, _ *http.Request) {
		res.Header().Set("Content-Type", "text/html; charset=utf-8")
		fmt.Fprint(res, pageHTML)
	})

	if addr == "" {
		addr = ":http"
	}
	// Bind synchronously: with ListenAndServe inside the goroutine the port
	// might not be open yet when the first client request is sent.
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		log.Fatalf("Listen(%q): %s", addr, err)
	}

	srv := &http.Server{Addr: addr, Handler: mux}
	go func() {
		// returns ErrServerClosed on graceful close
		if err := srv.Serve(ln); err != http.ErrServerClosed {
			log.Fatalf("Serve(): %s", err)
		}
	}()
	return srv
}
// stopServer gracefully shuts srv down using the package-level ctx and panics
// if the shutdown reports an error.
func stopServer(srv *http.Server) {
	err := srv.Shutdown(ctx)
	if err == nil {
		return
	}
	// failure/timeout shutting down the server gracefully
	panic(err)
}
const (
// port is the local TCP port the test HTTP server listens on and the tests
// connect to.
port = 8655
// indexHTML is the page served by the test server in Test_Run_HTML.
// NOTE(review): presumably the DocHash values expected in runTests depend on
// the exact bytes of this page — verify before editing its contents.
indexHTML = `<!doctype html>
<html>
<head>
<title>Test</title>
</head>
<body>
<div id="box1">
<div id="box2">
<p>There was a Boy whose name was Jim;</p>
<p>His Friends were very good to him.
<p>They gave him Tea, and Cakes, and Jam,</p>
<p>And slices of delicious Ham,</p>
<p>And Chocolate with pink inside,</p>
<p>And little Tricycles to ride,</p>
<p>And read him Stories through and through,</p>
<p>And even took him to the Zoo—</p>
<p>But there it was the dreadful Fate</p>
<p>Befell him, which I now relate.</p>
</div>
</div>
<div id="box3" style="display:none">
<p class="line">Now this was Jim’s especial Foible,</p>
<p class="line">He ran away when he was able,</p>
<p class="line">And on this inauspicious day</p>
<p class="line">He slipped his hand and ran away!</p>
<p class="line">He hadn’t gone a yard when—Bang!</p>
<p class="line">With open Jaws, a Lion sprang,</p>
<p class="line">And hungrily began to eat</p>
<p class="line">The Boy: beginning at his feet.</p>
</div>
<foo-stuff>
<bar-stuff>
<a href="https://www.zoomio.org">Zoom IO is here</a>
</bar-stuff>
</foo-stuff>
<script>
setTimeout(function() {
document.querySelector('#box3').style.display = '';
}, 3000);
</script>
</body>
</html>`
)
// customHTML is a test extension that records a piece of text captured while
// HTML tags are parsed (see ParseTag).
type customHTML struct {
// text holds the captured value; it stays empty until ParseTag finds a match.
text string
}
// Name reports the fixed name of this extension, "custom-html".
func (*customHTML) Name() string {
	const name = "custom-html"
	return name
}
// Version reports the fixed version string of this extension, "v0.0.1".
func (*customHTML) Version() string {
	const version = "v0.0.1"
	return version
}
// Result builds the extension result for this extension: the captured text is
// exposed under the "text" key and no error is attached.
func (ext *customHTML) Result() *extension.ExtResult {
	payload := map[string]interface{}{
		"text": ext.text,
	}
	return extension.NewResult(ext, payload, nil)
}
// ParseTag inspects a single HTML token. For the first "link" tag that has
// itemprop="name" together with a non-empty content attribute, it stores that
// content in ext.text; once ext.text is set, later tokens are ignored.
//
// The captured content is also fed into cnts so it counts as a tag: when
// lineIdx is not below cnts.Len() the content is appended as a new line, and
// the line at lineIdx is then weighed with 6 to boost its tags.
//
// It returns true only when a new line was appended to cnts. The returned
// error is always nil, and cfg is not used.
func (ext *customHTML) ParseTag(cfg *config.Config, token *html.Token, lineIdx int, cnts *thtml.HTMLContents) (bool, error) {
	// only the first match is captured, and only <link> tags are of interest
	if ext.text != "" || token.Data != "link" {
		return false, nil
	}

	var prop, value string
	for _, attr := range token.Attr {
		switch attr.Key {
		case "itemprop":
			prop = attr.Val
		case "content":
			value = attr.Val
		}
	}
	if prop != "name" || value == "" {
		return false, nil
	}

	// collect YouTube channel name
	ext.text = value

	// make it count as a tag too:
	// append the line first if it is not there yet ...
	added := lineIdx >= cnts.Len()
	if added {
		cnts.Append(lineIdx, token.Data, []byte(value))
	}
	// ... then weigh the line higher to boost its tags
	cnts.Weigh(lineIdx, 6)

	return added, nil
}