Skip to content

Commit

Permalink
feat(scraper): improve web scraping
Browse files Browse the repository at this point in the history
- support forward proxy
- remove external scraper support
- add body size limit
  • Loading branch information
ncarlier committed Sep 27, 2024
1 parent d444a4f commit 9b3c611
Show file tree
Hide file tree
Showing 11 changed files with 88 additions and 142 deletions.
6 changes: 4 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ require (
github.com/brianvoe/gofakeit v3.18.0+incompatible
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/getsentry/raven-go v0.2.0
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c
github.com/go-shiori/go-readability v0.0.0-20240701094332-1070de7e32ef
github.com/golang-jwt/jwt/v4 v4.5.0
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da
github.com/google/uuid v1.3.0
Expand All @@ -37,6 +37,8 @@ require (
golang.org/x/sync v0.7.0
)

require github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect

require (
github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
Expand Down
17 changes: 9 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAc
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/SherClockHolmes/webpush-go v1.2.0 h1:sGv0/ZWCvb1HUH+izLqrb2i68HuqD/0Y+AmGQfyqKJA=
github.com/SherClockHolmes/webpush-go v1.2.0/go.mod h1:w6X47YApe/B9wUz2Wh8xukxlyupaxSSEbu6yKJcHN2w=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef h1:2JGTg6JapxP9/R33ZaagQtAM4EkkSYnIAlOG5EI8gkM=
github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef/go.mod h1:JS7hed4L1fj0hXcyEejnW57/7LCetXggd+vwrRnYeII=
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
Expand Down Expand Up @@ -48,12 +49,11 @@ github.com/galdor/go-thumbhash v1.0.0 h1:Q7xSnaDvSC91SuNmQI94JuUVHva29FDdA4/PkV0
github.com/galdor/go-thumbhash v1.0.0/go.mod h1:gEK2wZqIxS2W4mXNf48lPl6HWjX0vWsH1LpK/cU74Ho=
github.com/getsentry/raven-go v0.2.0 h1:no+xWJRb5ZI7eE8TWgIq1jLulQiIoLG0IfYxv5JYMGs=
github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ=
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 h1:zx4B0AiwqKDQq+AgqxWeHwbbLJQeidq20hgfP+aMNWI=
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65/go.mod h1:NPO1+buE6TYOWhUI98/hXLHHJhunIpXRuvDN4xjkCoE=
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad h1:3VP5Q8Mh165h2DHmXWFT4LJlwwvgTRlEuoe2vnsVnJ4=
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
github.com/go-shiori/go-readability v0.0.0-20240701094332-1070de7e32ef h1:6y2GmHDeuF2xwC5L7fLMTlgnOjm5Jy8RYDI1YYpcOKU=
github.com/go-shiori/go-readability v0.0.0-20240701094332-1070de7e32ef/go.mod h1:jH+l/xV/8x8utphLx72GLIuw9wGhGzrZS5i7arOk8zc=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
Expand Down Expand Up @@ -105,6 +105,7 @@ github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZb
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo=
github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/microcosm-cc/bluemonday v1.0.23 h1:SMZe2IGa0NuHvnVNAZ+6B38gsTbi5e4sViiWJyDDqFY=
Expand Down Expand Up @@ -133,6 +134,7 @@ github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI
github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc=
github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI=
github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/rs/cors v1.9.0 h1:l9HGsTsHJcvW14Nk7J9KFz8bzeAWXn3CG6bgt7LsrAE=
Expand All @@ -141,6 +143,7 @@ github.com/rs/xid v1.4.0 h1:qd7wPTDkN6KQx2VmMBLrpHkiyQwgFXRnkOLacUiaSNY=
github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
Expand Down Expand Up @@ -196,10 +199,8 @@ golang.org/x/image v0.18.0 h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ=
golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210505214959-0714010a04ed/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
Expand Down
13 changes: 10 additions & 3 deletions internal/config/defaults.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,22 @@ user_agent = "${READFLOW_SCRAPING_USER_AGENT}"
## Timeout of the HTTP client used by the Web Scraper
# Default: 5s
timeout = "${READFLOW_SCRAPING_TIMEOUT}"
## External Web Scraper URL, using internal if empty
# Example: "https://example.org/scrap"
service_provider = "${READFLOW_SCRAPING_SERVICE_PROVIDER}"
## Block-list local or remote location, disabled if empty
# Examples:
# - file:///var/opt/block-list-txt
# - https://raw.githubusercontent.com/anudeepND/blacklist/master/adservers.txt
# - https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt
block_list = "${READFLOW_SRAPING_BLOCK_LIST}"
## Forward Proxy used to fetch special pages
[scraping.forward_proxy]
## Forward Proxy endpoint, disabled if empty
# {url} is a placeholder for the target page URL.
# Examples: https://my-html-rewriting.site?url={url}
endpoint = "${READFLOW_SCRAPING_FORWARD_PROXY_ENDPOINT}"
## Hosts to be proxied
# The use of a wildcard character ("*") means that ALL URLs will be proxied.
# Example: [ "medium.com", "dev.to" ]
hosts = [ "medium.com", "dev.to" ]

[avatar]
## Avatar local or remote service provider
Expand Down
14 changes: 10 additions & 4 deletions internal/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,18 @@ type DownloaderConfig struct {
MaxConcurentDownloads uint `toml:"max_concurent_downloads"`
}

// ScrapingForwardProxyConfig holds the forward-proxy section of the
// scraping configuration (TOML table [scraping.forward_proxy]).
type ScrapingForwardProxyConfig struct {
	// Endpoint is the forward-proxy endpoint; the proxy is disabled if empty.
	Endpoint string
	// Hosts lists the hostnames whose pages must be fetched through the proxy.
	Hosts []string
}

// ScrapingConfig for scraping configuration section
type ScrapingConfig struct {
UserAgent string `toml:"user_agent"`
Timeout types.Duration `toml:"timeout"`
ServiceProvider string `toml:"service_provider"`
BlockList string `toml:"block_list"`
UserAgent string `toml:"user_agent"`
Timeout types.Duration `toml:"timeout"`
BlockList string `toml:"block_list"`
ForwardProxy ScrapingForwardProxyConfig `toml:"forward_proxy"`
}

// AvatarConfig for avatar configuration section
Expand Down
16 changes: 8 additions & 8 deletions internal/service/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ type Registry struct {
logger zerolog.Logger
downloadCache cache.Cache
properties *model.Properties
webScraper scraper.WebScraper
webScraper *scraper.WebScraper
dl downloader.Downloader
hashid *hashid.HashIDHandler
notificationRateLimiter ratelimiter.RateLimiter
Expand All @@ -53,14 +53,14 @@ func Configure(conf config.Config, database db.DB) error {
return err
}
// configure web scraper
webScraper, err := scraper.NewWebScraper(&scraper.WebScraperConfiguration{
HttpClient: &http.Client{Timeout: conf.Scraping.Timeout.Duration},
UserAgent: conf.Scraping.UserAgent,
ExternalServiceEndpoint: conf.Scraping.ServiceProvider,
webScraper := scraper.NewWebScraper(&scraper.WebScraperConfiguration{
HttpClient: &http.Client{Timeout: conf.Scraping.Timeout.Duration},
UserAgent: conf.Scraping.UserAgent,
ForwardProxy: &scraper.ForwardProxyConfiguration{
Endpoint: conf.Scraping.ForwardProxy.Endpoint,
Hosts: conf.Scraping.ForwardProxy.Hosts,
},
})
if err != nil {
return err
}
hid, err := hashid.NewHashIDHandler(conf.Hash.SecretSalt.Value)
if err != nil {
return err
Expand Down
3 changes: 2 additions & 1 deletion pkg/sanitizer/sanitizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ func NewSanitizer(blockList *BlockList) *Sanitizer {
policy := bluemonday.UGCPolicy()
policy.AddTargetBlankToFullyQualifiedLinks(true)
policy.AllowAttrs("width", "height", "src", "allowfullscreen", "sandbox").OnElements("iframe")
policy.AllowAttrs("srcset", "sizes", "data-src").OnElements("img")
policy.AllowAttrs("srcset", "sizes", "data-src").OnElements("img", "source")
policy.AllowElements("picture", "source")

if blockList != nil {
logger.Info().
Expand Down
81 changes: 0 additions & 81 deletions pkg/scraper/external.go

This file was deleted.

7 changes: 4 additions & 3 deletions pkg/scraper/readability.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@ import (
"github.com/ncarlier/readflow/pkg/html"
)

const MAX_RESPONSE_SIZE = 2 << 20 // 2Mb

func ReadWebPage(body io.Reader, pageUrl *url.URL) (*WebPage, error) {
// Set body limit
body = io.LimitReader(body, MAX_RESPONSE_SIZE)
// Parse DOM
doc, err := dom.Parse(body)
if err != nil {
return nil, err
}
// Extract meta
meta := html.ExtractMetaFromDOM(doc)
if err != nil {
return nil, err
}

// Create article with Open Graph attributes
result := &WebPage{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ import (
"github.com/ncarlier/readflow/pkg/scraper"
)

func TestInternalScraper(t *testing.T) {
func TestWebScraper(t *testing.T) {
ctx := context.TODO()
page, err := scraper.NewInternalWebScraper(&scraper.WebScraperConfiguration{}).Scrap(ctx, "https://about.readflow.app/")
page, err := scraper.NewWebScraper(&scraper.WebScraperConfiguration{}).Scrap(ctx, "https://about.readflow.app/")
assert.Nil(t, err)
assert.NotNil(t, page)
assert.Equal(t, "https://about.readflow.app/", page.URL)
Expand All @@ -28,7 +28,7 @@ func TestInternalScraper(t *testing.T) {

func TestInternalScraperTimeout(t *testing.T) {
ctx := context.TODO()
_, err := scraper.NewInternalWebScraper(&scraper.WebScraperConfiguration{
_, err := scraper.NewWebScraper(&scraper.WebScraperConfiguration{
HttpClient: &http.Client{Timeout: time.Second},
}).Scrap(ctx, "https://httpstat.us/200?sleep=2000")
require.NotNil(t, err)
Expand Down
33 changes: 18 additions & 15 deletions pkg/scraper/scraper.go → pkg/scraper/types.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package scraper

import (
"context"
"net/http"
"strings"
)

// WebPage is the result of a web scraping
Expand All @@ -18,22 +18,25 @@ type WebPage struct {
Favicon string `json:"favicon,omitempty"`
}

// WebScraper is an interface with Web Scrapping provider
type WebScraper interface {
Scrap(ctx context.Context, rawurl string) (*WebPage, error)
// ForwardProxyConfiguration configures the optional forward proxy used to
// fetch pages from specific hosts (TOML table [scraping.forward_proxy]).
type ForwardProxyConfiguration struct {
	// Endpoint is the proxy endpoint; the forward proxy is disabled if empty.
	// {url} in the endpoint is a placeholder for the target page URL.
	Endpoint string
	// Hosts lists the hostnames (matched by suffix) to fetch through the
	// proxy. A single "*" entry means that ALL hosts are proxied.
	Hosts []string
}

// Match reports whether hostname must be fetched through the forward proxy,
// i.e. whether it matches one of the configured hosts.
// A "*" entry matches every hostname, per the documented configuration
// semantics ("ALL URLs will be proxied").
func (fpc *ForwardProxyConfiguration) Match(hostname string) bool {
	for _, value := range fpc.Hosts {
		// Wildcard entry: proxy everything. strings.HasSuffix alone can
		// never satisfy this documented case, so handle it explicitly.
		if value == "*" {
			return true
		}
		// NOTE(review): plain suffix matching also accepts e.g.
		// "evilmedium.com" for the entry "medium.com" — confirm whether a
		// dot-boundary check is intended here.
		if strings.HasSuffix(hostname, value) {
			return true
		}
	}
	return false
}

// NewWebScraper create new Web Scraping service
func NewWebScraper(conf *WebScraperConfiguration) (WebScraper, error) {
if conf.ExternalServiceEndpoint == "" {
return NewInternalWebScraper(conf), nil
}
return NewExternalWebScraper(conf)
// WebScraperConfiguration to configure a Web scraper.
// Zero-value fields are acceptable: NewWebScraper is called with an empty
// configuration in the package tests.
type WebScraperConfiguration struct {
// HttpClient used to fetch pages; callers set its Timeout (see registry.go).
// presumably a default client is used when nil — verify in NewWebScraper.
HttpClient *http.Client
// UserAgent sent with scraping requests (READFLOW_SCRAPING_USER_AGENT).
UserAgent string
// ForwardProxy routes matching hosts through a rewriting proxy endpoint;
// disabled when nil or when its Endpoint is empty.
ForwardProxy *ForwardProxyConfiguration
}
Loading

0 comments on commit 9b3c611

Please sign in to comment.