-
Notifications
You must be signed in to change notification settings - Fork 90
/
readability.go
77 lines (67 loc) · 2.48 KB
/
readability.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Package readability is a Go package that finds the main readable
// content of an HTML page. It works by removing clutter like buttons,
// ads, background images, scripts, etc.
//
// This package is based on Readability.js by Mozilla, and was written line
// by line to make sure it looks and works as similarly as possible. This
// way, hopefully all web pages that can be parsed by Readability.js
// are parseable by go-readability as well.
package readability
import (
"fmt"
"io"
"net/http"
nurl "net/url"
"strings"
"time"
"golang.org/x/net/html"
)
// FromReader reads a page from input and returns its readable content.
// It is a convenience wrapper for `Parser.Parse()` that runs on a parser
// freshly created by `NewParser()`; pageURL is passed through to the parser
// unchanged.
func FromReader(input io.Reader, pageURL *nurl.URL) (Article, error) {
	p := NewParser()
	article, err := p.Parse(input, pageURL)
	return article, err
}
// FromDocument takes an already-parsed document and returns its readable
// content. It is a convenience wrapper for `Parser.ParseDocument()` that runs
// on a parser freshly created by `NewParser()`; pageURL is passed through to
// the parser unchanged.
func FromDocument(doc *html.Node, pageURL *nurl.URL) (Article, error) {
	p := NewParser()
	article, err := p.ParseDocument(doc, pageURL)
	return article, err
}
// FromURL fetches the web page at the specified URL, then parses the response
// to find the readable content.
//
// pageURL must be accepted by `url.ParseRequestURI`. timeout is set as the
// `Timeout` of the `http.Client` used for the request.
//
// An error is returned when the URL cannot be parsed, when the request fails,
// or when the response's Content-Type header does not contain "text/html"
// (compared case-insensitively). The parse and fetch errors wrap their
// underlying cause, so callers can inspect them with `errors.Is` and
// `errors.As`. On error the returned Article is the zero value.
func FromURL(pageURL string, timeout time.Duration) (Article, error) {
	// Make sure URL is valid
	parsedURL, err := nurl.ParseRequestURI(pageURL)
	if err != nil {
		return Article{}, fmt.Errorf("failed to parse URL: %w", err)
	}

	// Fetch page from URL
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(pageURL)
	if err != nil {
		return Article{}, fmt.Errorf("failed to fetch the page: %w", err)
	}
	defer resp.Body.Close()

	// Make sure content type is HTML. Media type names are case-insensitive,
	// so normalize the header value before matching.
	contentType := strings.ToLower(resp.Header.Get("Content-Type"))
	if !strings.Contains(contentType, "text/html") {
		return Article{}, fmt.Errorf("URL is not a HTML document")
	}

	// The client follows redirects, so the body may have been served from a
	// different URL than the one requested. Hand the parser the URL of the
	// request that actually produced this response.
	if resp.Request != nil && resp.Request.URL != nil {
		parsedURL = resp.Request.URL
	}

	// Parse content
	parser := NewParser()
	return parser.Parse(resp.Body, parsedURL)
}
// Check reports whether the content read from input is readable, without
// parsing the whole thing. It is a convenience wrapper for `Parser.Check()`
// that runs on a parser freshly created by `NewParser()`.
func Check(input io.Reader) bool {
	p := NewParser()
	readable := p.Check(input)
	return readable
}
// CheckDocument reports whether the given document is readable, without
// parsing the whole thing. It is a convenience wrapper for
// `Parser.CheckDocument()` that runs on a parser freshly created by
// `NewParser()`.
func CheckDocument(doc *html.Node) bool {
	p := NewParser()
	readable := p.CheckDocument(doc)
	return readable
}