forked from phishdetect/phishdetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpage.go
69 lines (59 loc) · 1.77 KB
/
page.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// PhishDetect
// Copyright (c) 2018-2019 Claudio Guarnieri.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package phishdetect
import (
"fmt"
"strings"
"github.com/anaskhan96/soup"
"jaytaylor.com/html2text"
)
// Page bundles the different representations of a single HTML document.
// Instances are normally created through NewPage, which fills every field.
type Page struct {
	// HTML is the markup exactly as it was handed to NewPage.
	HTML string
	// Soup is the result of parsing HTML with soup.HTMLParse; the query
	// methods on Page (GetTitle, GetInputs, GetEntities) search this tree.
	Soup soup.Root
	// Text is the plain-text rendering of HTML produced by html2text.
	Text string
}
// NewPage instantiates a new Page struct from a raw HTML document.
//
// The html argument is stored verbatim in Page.HTML, parsed with
// soup.HTMLParse into Page.Soup, and converted to plain text with
// html2text.FromString (PrettyTables disabled) into Page.Text.
//
// A nil Page and a non-nil error are returned when:
//   - html is empty or consists solely of whitespace;
//   - the soup parse result carries an error;
//   - the plain-text conversion fails.
func NewPage(html string) (*Page, error) {
	if strings.TrimSpace(html) == "" {
		return nil, fmt.Errorf("No valid HTML provided")
	}

	// The local is deliberately not called "soup": that name would shadow
	// the imported package for the remainder of the function.
	doc := soup.HTMLParse(html)
	if doc.Error != nil {
		// Fail here instead of handing back a Page whose tree cannot be
		// queried by GetTitle, GetInputs or GetEntities.
		return nil, fmt.Errorf("parsing HTML: %w", doc.Error)
	}

	text, err := html2text.FromString(html, html2text.Options{
		PrettyTables: false,
	})
	if err != nil {
		// Previously discarded; surface it so callers never analyse a Page
		// whose Text is silently empty.
		return nil, fmt.Errorf("converting HTML to text: %w", err)
	}

	return &Page{
		HTML: html,
		Soup: doc,
		Text: text,
	}, nil
}
// GetTitle looks up the <title> element of the page and returns its text.
// An empty string is returned when the lookup reports an error.
func (p *Page) GetTitle() string {
	if node := p.Soup.Find("title"); node.Error == nil {
		return node.Text()
	}
	return ""
}
// GetInputs searches the page for <input> elements, filtering on the "type"
// attribute with the given inputType (for example "password"), and returns
// every match found by soup.
func (p *Page) GetInputs(inputType string) []soup.Root {
	inputs := p.Soup.FindAll("input", "type", inputType)
	return inputs
}
// GetEntities returns every element of the page found by soup when searching
// for the tag name given in entity (for example "form" or "iframe").
func (p *Page) GetEntities(entity string) []soup.Root {
	matches := p.Soup.FindAll(entity)
	return matches
}