-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathexported.go
139 lines (102 loc) · 2.98 KB
/
exported.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package articletext
/*
The package extracts the article text from an HTML page.
It drops all additional elements of the page (navigation, advertising, etc.).
This file contains the exported functions of the package; it is the package's entry point.
Author: Roman Gelembjuk <roman@gelembjuk.com>
*/
import (
"io"
"log"
"os"
"github.com/PuerkitoBio/goquery"
)
// GetArticleTextFromFile extracts the useful article text from the HTML file
// at filepath. It opens the file and delegates to GetArticleText.
//
// It returns the extracted text, or an empty string and the error when the
// file cannot be opened; errors from GetArticleText are passed through.
func GetArticleTextFromFile(filepath string) (string, error) {
	reader, err := os.Open(filepath)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, so the caller never received the error promised by the
		// signature.
		log.Print(err)
		return "", err
	}
	// The handle is only needed while the document is read; release it on return.
	defer reader.Close()

	return GetArticleText(reader)
}
// GetArticleTextFromUrl extracts the useful article text from the HTML page
// at url. The page is loaded with goquery.NewDocument and passed to
// processArticle in mode 1, the mode used by the text-returning functions of
// this file.
//
// It returns an empty string and the error when the page cannot be loaded.
//
// NOTE(review): recent goquery releases document NewDocument as deprecated in
// favour of performing the HTTP request explicitly and calling
// NewDocumentFromReader — confirm against the goquery version in use.
func GetArticleTextFromUrl(url string) (string, error) {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, which made the error return below unreachable.
		log.Print(err)
		return "", err
	}

	return processArticle(doc, 1)
}
// GetArticleText extracts the useful article text from an HTML document read
// from input. The document is parsed with goquery.NewDocumentFromReader and
// passed to processArticle in mode 1, the mode used by the text-returning
// functions of this file.
//
// It returns an empty string and the error when the document cannot be parsed.
func GetArticleText(input io.Reader) (string, error) {
	doc, err := goquery.NewDocumentFromReader(input)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, which made the error return below unreachable.
		log.Print(err)
		return "", err
	}

	return processArticle(doc, 1)
}
// GetArticleSignatureFromFile returns the DOM signature of the article found
// in the HTML file at filepath. It opens the file and delegates to
// GetArticleSignature.
//
// It returns an empty string and the error when the file cannot be opened;
// errors from GetArticleSignature are passed through.
func GetArticleSignatureFromFile(filepath string) (string, error) {
	reader, err := os.Open(filepath)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, so the caller never received the error promised by the
		// signature.
		log.Print(err)
		return "", err
	}
	// The handle is only needed while the document is read; release it on return.
	defer reader.Close()

	return GetArticleSignature(reader)
}
// GetArticleSignatureFromUrl returns the DOM signature of the article found in
// the HTML page at url. The page is loaded with goquery.NewDocument and passed
// to processArticle in mode 2, the mode used by the signature-returning
// functions of this file.
//
// It returns an empty string and the error when the page cannot be loaded.
//
// NOTE(review): recent goquery releases document NewDocument as deprecated in
// favour of performing the HTTP request explicitly and calling
// NewDocumentFromReader — confirm against the goquery version in use.
func GetArticleSignatureFromUrl(url string) (string, error) {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, which made the error return below unreachable.
		log.Print(err)
		return "", err
	}

	return processArticle(doc, 2)
}
// GetArticleSignature returns the DOM signature of the article found in an
// HTML document read from input. The document is parsed with
// goquery.NewDocumentFromReader and passed to processArticle in mode 2, the
// mode used by the signature-returning functions of this file.
//
// It returns an empty string and the error when the document cannot be parsed.
func GetArticleSignature(input io.Reader) (string, error) {
	doc, err := goquery.NewDocumentFromReader(input)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, which made the error return below unreachable.
		log.Print(err)
		return "", err
	}

	return processArticle(doc, 2)
}
// GetArticleTextFromFileByPath extracts the text located by path from the HTML
// file at filepath. It opens the file and delegates to GetArticleTextByPath.
//
// path is presumably a selector/signature such as the ones returned by the
// GetArticleSignature functions — confirm against getTextByPathFromDocument.
//
// It returns an empty string and the error when the file cannot be opened;
// errors from GetArticleTextByPath are passed through.
func GetArticleTextFromFileByPath(filepath string, path string) (string, error) {
	reader, err := os.Open(filepath)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, so the caller never received the error promised by the
		// signature.
		log.Print(err)
		return "", err
	}
	// The handle is only needed while the document is read; release it on return.
	defer reader.Close()

	return GetArticleTextByPath(reader, path)
}
// GetArticleTextFromUrlByPath extracts the text located by path from the HTML
// page at url. The page is loaded with goquery.NewDocument and passed, together
// with path, to getTextByPathFromDocument.
//
// path is presumably a selector/signature such as the ones returned by the
// GetArticleSignature functions — confirm against getTextByPathFromDocument.
//
// It returns an empty string and the error when the page cannot be loaded.
//
// NOTE(review): recent goquery releases document NewDocument as deprecated in
// favour of performing the HTTP request explicitly and calling
// NewDocumentFromReader — confirm against the goquery version in use.
func GetArticleTextFromUrlByPath(url string, path string) (string, error) {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, which made the error return below unreachable.
		log.Print(err)
		return "", err
	}

	return getTextByPathFromDocument(doc, path)
}
// GetArticleTextByPath extracts the text located by path from an HTML document
// read from input. The document is parsed with goquery.NewDocumentFromReader
// and passed, together with path, to getTextByPathFromDocument.
//
// path is presumably a selector/signature such as the ones returned by the
// GetArticleSignature functions — confirm against getTextByPathFromDocument.
//
// It returns an empty string and the error when the document cannot be parsed.
func GetArticleTextByPath(input io.Reader, path string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(input)
	if err != nil {
		// Log and return instead of log.Fatal: Fatal terminates the whole
		// process, which made the error return below unreachable.
		log.Print(err)
		return "", err
	}

	return getTextByPathFromDocument(doc, path)
}
// GetOptimalArticleSignatureByUrls finds a path (selector, signature) for each
// URL in urls and returns the one that was found most often. It is the exported
// entry point for the unexported getOptimalArticleSignatureByUrls, whose result
// and error are returned unchanged.
func GetOptimalArticleSignatureByUrls(urls []string) (string, error) {
	signature, err := getOptimalArticleSignatureByUrls(urls)
	return signature, err
}