From 9b3c61175afcacce9b33d337e22719a5712107d3 Mon Sep 17 00:00:00 2001 From: Nicolas Carlier Date: Fri, 27 Sep 2024 13:54:45 +0000 Subject: [PATCH] feat(scraper): improve web scraping - support forward proxy - remove external scraper support - add body size limit --- go.mod | 6 +- go.sum | 17 ++-- internal/config/defaults.toml | 13 ++- internal/config/types.go | 14 +++- internal/service/registry.go | 16 ++-- pkg/sanitizer/sanitizer.go | 3 +- pkg/scraper/external.go | 81 ------------------- pkg/scraper/readability.go | 7 +- .../{internal_test.go => web-scraper_test.go} | 6 +- pkg/scraper/{scraper.go => types.go} | 33 ++++---- pkg/scraper/{internal.go => web-scraper.go} | 34 ++++---- 11 files changed, 88 insertions(+), 142 deletions(-) delete mode 100644 pkg/scraper/external.go rename pkg/scraper/test/{internal_test.go => web-scraper_test.go} (80%) rename pkg/scraper/{scraper.go => types.go} (54%) rename pkg/scraper/{internal.go => web-scraper.go} (56%) diff --git a/go.mod b/go.mod index 8896e0fe..2d5cfa27 100644 --- a/go.mod +++ b/go.mod @@ -10,8 +10,8 @@ require ( github.com/brianvoe/gofakeit v3.18.0+incompatible github.com/dgrijalva/jwt-go v3.2.0+incompatible github.com/getsentry/raven-go v0.2.0 - github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 - github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad + github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c + github.com/go-shiori/go-readability v0.0.0-20240701094332-1070de7e32ef github.com/golang-jwt/jwt/v4 v4.5.0 github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da github.com/google/uuid v1.3.0 @@ -37,6 +37,8 @@ require ( golang.org/x/sync v0.7.0 ) +require github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect + require ( github.com/PuerkitoBio/goquery v1.8.1 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect diff --git a/go.sum b/go.sum index 1bcde809..e76f1fc8 100644 --- a/go.sum +++ b/go.sum @@ -8,10 +8,11 @@ github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAc github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/SherClockHolmes/webpush-go v1.2.0 h1:sGv0/ZWCvb1HUH+izLqrb2i68HuqD/0Y+AmGQfyqKJA= github.com/SherClockHolmes/webpush-go v1.2.0/go.mod h1:w6X47YApe/B9wUz2Wh8xukxlyupaxSSEbu6yKJcHN2w= -github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= +github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw= github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef h1:2JGTg6JapxP9/R33ZaagQtAM4EkkSYnIAlOG5EI8gkM= github.com/asaskevich/EventBus v0.0.0-20200907212545-49d423059eef/go.mod h1:JS7hed4L1fj0hXcyEejnW57/7LCetXggd+vwrRnYeII= github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= @@ -48,12 +49,11 @@ github.com/galdor/go-thumbhash v1.0.0 h1:Q7xSnaDvSC91SuNmQI94JuUVHva29FDdA4/PkV0 github.com/galdor/go-thumbhash v1.0.0/go.mod h1:gEK2wZqIxS2W4mXNf48lPl6HWjX0vWsH1LpK/cU74Ho= github.com/getsentry/raven-go v0.2.0 h1:no+xWJRb5ZI7eE8TWgIq1jLulQiIoLG0IfYxv5JYMGs= 
github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ= -github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 h1:zx4B0AiwqKDQq+AgqxWeHwbbLJQeidq20hgfP+aMNWI= -github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65/go.mod h1:NPO1+buE6TYOWhUI98/hXLHHJhunIpXRuvDN4xjkCoE= -github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad h1:3VP5Q8Mh165h2DHmXWFT4LJlwwvgTRlEuoe2vnsVnJ4= -github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= +github.com/go-shiori/go-readability v0.0.0-20240701094332-1070de7e32ef h1:6y2GmHDeuF2xwC5L7fLMTlgnOjm5Jy8RYDI1YYpcOKU= +github.com/go-shiori/go-readability v0.0.0-20240701094332-1070de7e32ef/go.mod h1:jH+l/xV/8x8utphLx72GLIuw9wGhGzrZS5i7arOk8zc= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY= @@ -105,6 +105,7 @@ github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZb github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= +github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/microcosm-cc/bluemonday v1.0.23 h1:SMZe2IGa0NuHvnVNAZ+6B38gsTbi5e4sViiWJyDDqFY= @@ -133,6 +134,7 @@ github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/cors v1.9.0 h1:l9HGsTsHJcvW14Nk7J9KFz8bzeAWXn3CG6bgt7LsrAE= @@ -141,6 +143,7 @@ github.com/rs/xid v1.4.0 h1:qd7wPTDkN6KQx2VmMBLrpHkiyQwgFXRnkOLacUiaSNY= github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc= github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU= +github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg= github.com/sebdah/goldie/v2 v2.5.3 
h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
 github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
 github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
@@ -196,10 +199,8 @@ golang.org/x/image v0.18.0 h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ=
 golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
 golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
 golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
-golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/net v0.0.0-20210505214959-0714010a04ed/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
 golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
diff --git a/internal/config/defaults.toml b/internal/config/defaults.toml
index ea1f7c2d..1c2d7723 100644
--- a/internal/config/defaults.toml
+++ b/internal/config/defaults.toml
@@ -115,15 +115,22 @@ user_agent = "${READFLOW_SCRAPING_USER_AGENT}"
 ## Timeout of the HTTP client used by the Web Scraper
 # Default: 5s
 timeout = "${READFLOW_SCRAPING_TIMEOUT}"
-## External Web Scraper URL, using internal if empty
-# Example: "https://example.org/scrap"
-service_provider = "${READFLOW_SCRAPING_SERVICE_PROVIDER}"
 ## Block-list local or remote location, disabled if empty
 # Examples:
 # - file:///var/opt/block-list-txt
 # - https://raw.githubusercontent.com/anudeepND/blacklist/master/adservers.txt
 # - https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt
 block_list = "${READFLOW_SRAPING_BLOCK_LIST}"
+## Forward Proxy used to fetch pages from specific hosts
+[scraping.forward_proxy]
+## Forward Proxy endpoint, disabled if empty
+# {url} is a placeholder for the target page URL.
+# Example: https://my-html-rewriting.site?url={url}
+endpoint = "${READFLOW_SCRAPING_FORWARD_PROXY_ENDPOINT}"
+## Hosts to be proxied
+# The use of a wildcard character ("*") means that ALL URLs will be proxied.
+# Example: [ "medium.com", "dev.to" ] +hosts = [ "medium.com", "dev.to" ] [avatar] ## Avatar local or remote service provider diff --git a/internal/config/types.go b/internal/config/types.go index 141d98cd..73e72df6 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -102,12 +102,18 @@ type DownloaderConfig struct { MaxConcurentDownloads uint `toml:"max_concurent_downloads"` } +// ScrapingForwardProxyConfig for forward proxy configuration +type ScrapingForwardProxyConfig struct { + Endpoint string + Hosts []string +} + // ScrapingConfig for scraping configuration section type ScrapingConfig struct { - UserAgent string `toml:"user_agent"` - Timeout types.Duration `toml:"timeout"` - ServiceProvider string `toml:"service_provider"` - BlockList string `toml:"block_list"` + UserAgent string `toml:"user_agent"` + Timeout types.Duration `toml:"timeout"` + BlockList string `toml:"block_list"` + ForwardProxy ScrapingForwardProxyConfig `toml:"forward_proxy"` } // AvatarConfig for avatar configuration section diff --git a/internal/service/registry.go b/internal/service/registry.go index 5215115c..b2141ab4 100644 --- a/internal/service/registry.go +++ b/internal/service/registry.go @@ -33,7 +33,7 @@ type Registry struct { logger zerolog.Logger downloadCache cache.Cache properties *model.Properties - webScraper scraper.WebScraper + webScraper *scraper.WebScraper dl downloader.Downloader hashid *hashid.HashIDHandler notificationRateLimiter ratelimiter.RateLimiter @@ -53,14 +53,14 @@ func Configure(conf config.Config, database db.DB) error { return err } // configure web scraper - webScraper, err := scraper.NewWebScraper(&scraper.WebScraperConfiguration{ - HttpClient: &http.Client{Timeout: conf.Scraping.Timeout.Duration}, - UserAgent: conf.Scraping.UserAgent, - ExternalServiceEndpoint: conf.Scraping.ServiceProvider, + webScraper := scraper.NewWebScraper(&scraper.WebScraperConfiguration{ + HttpClient: &http.Client{Timeout: conf.Scraping.Timeout.Duration}, + UserAgent: conf.Scraping.UserAgent, + ForwardProxy: &scraper.ForwardProxyConfiguration{ + Endpoint: conf.Scraping.ForwardProxy.Endpoint, + Hosts: conf.Scraping.ForwardProxy.Hosts, + }, }) - if err != nil { - return err - } hid, err := hashid.NewHashIDHandler(conf.Hash.SecretSalt.Value) if err != nil { return err diff --git a/pkg/sanitizer/sanitizer.go b/pkg/sanitizer/sanitizer.go index 62d76d0a..49b36866 100644 --- a/pkg/sanitizer/sanitizer.go +++ b/pkg/sanitizer/sanitizer.go @@ -27,7 +27,8 @@ func NewSanitizer(blockList *BlockList) *Sanitizer { policy := bluemonday.UGCPolicy() policy.AddTargetBlankToFullyQualifiedLinks(true) policy.AllowAttrs("width", "height", "src", "allowfullscreen", "sandbox").OnElements("iframe") - policy.AllowAttrs("srcset", "sizes", "data-src").OnElements("img") + policy.AllowAttrs("srcset", "sizes", "data-src").OnElements("img", "source") + policy.AllowElements("picture", "source") if blockList != nil { logger.Info(). 
diff --git a/pkg/scraper/external.go b/pkg/scraper/external.go deleted file mode 100644 index c5a6f1dc..00000000 --- a/pkg/scraper/external.go +++ /dev/null @@ -1,81 +0,0 @@ -package scraper - -import ( - "context" - "encoding/json" - "fmt" - "net/http" - "net/url" - "strings" - - "github.com/ncarlier/readflow/pkg/defaults" - "github.com/ncarlier/readflow/pkg/logger" - "github.com/ncarlier/readflow/pkg/utils" - "github.com/rs/zerolog" -) - -type extrenalWebScraper struct { - uri string - httpClient *http.Client - userAgent string - logger zerolog.Logger -} - -// NewExternalWebScraper create an external web scrapping service -func NewExternalWebScraper(conf *WebScraperConfiguration) (WebScraper, error) { - if _, err := url.ParseRequestURI(conf.ExternalServiceEndpoint); err != nil { - return nil, fmt.Errorf("invalid Web Scraping service URI: %s", conf.ExternalServiceEndpoint) - } - log := logger.With().Str("component", "webscraper").Str("uri", conf.ExternalServiceEndpoint).Logger() - log.Debug().Msg("using external service") - - return &extrenalWebScraper{ - uri: conf.ExternalServiceEndpoint, - httpClient: utils.If(conf.HttpClient == nil, defaults.HTTPClient, conf.HttpClient), - userAgent: utils.If(conf.UserAgent == "", defaults.UserAgent, conf.UserAgent), - logger: log, - }, nil -} - -func (ws extrenalWebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage, error) { - webPage, err := ws.scrap(ctx, rawurl) - if err != nil { - ws.logger.Error().Err(err).Msg("unable to scrap web page with external service, fallback on internal service") - return NewInternalWebScraper(&WebScraperConfiguration{ - HttpClient: ws.httpClient, - UserAgent: ws.userAgent, - }).Scrap(ctx, rawurl) - } - return webPage, nil -} - -func (ws extrenalWebScraper) scrap(ctx context.Context, rawurl string) (*WebPage, error) { - req, err := http.NewRequestWithContext(ctx, "GET", ws.uri, http.NoBody) - if err != nil { - return nil, err - } - q := req.URL.Query() - q.Add("u", rawurl) - req.URL.RawQuery = q.Encode() - - ws.logger.Debug().Str("url", rawurl).Msg("scraping webpage") - res, err := ws.httpClient.Do(req) - if err != nil { - return nil, err - } - defer res.Body.Close() - - if res.StatusCode >= 400 { - return nil, fmt.Errorf("invalid web scraping response: %d", res.StatusCode) - } - - if ct := res.Header.Get("Content-Type"); ct != "" { - if !strings.HasPrefix(ct, "application/json") { - return nil, fmt.Errorf("invalid web scraping Content-Type response: %s", ct) - } - } - - webPage := WebPage{} - err = json.NewDecoder(res.Body).Decode(&webPage) - return &webPage, err -} diff --git a/pkg/scraper/readability.go b/pkg/scraper/readability.go index d8212998..847cee40 100644 --- a/pkg/scraper/readability.go +++ b/pkg/scraper/readability.go @@ -9,7 +9,11 @@ import ( "github.com/ncarlier/readflow/pkg/html" ) +const MAX_RESPONSE_SIZE = 2 << 20 // 2Mb + func ReadWebPage(body io.Reader, pageUrl *url.URL) (*WebPage, error) { + // Set body limit + body = io.LimitReader(body, MAX_RESPONSE_SIZE) // Parse DOM doc, err := dom.Parse(body) if err != nil { @@ -17,9 +21,6 @@ func ReadWebPage(body io.Reader, pageUrl *url.URL) (*WebPage, error) { } // Extract meta meta := html.ExtractMetaFromDOM(doc) - if err != nil { - return nil, err - } // Create article with Open Graph attributes result := &WebPage{ diff --git a/pkg/scraper/test/internal_test.go b/pkg/scraper/test/web-scraper_test.go similarity index 80% rename from pkg/scraper/test/internal_test.go rename to pkg/scraper/test/web-scraper_test.go index 94fc227b..ce03ef6e 100644 --- 
a/pkg/scraper/test/internal_test.go
+++ b/pkg/scraper/test/web-scraper_test.go
@@ -13,9 +13,9 @@ import (
 	"github.com/ncarlier/readflow/pkg/scraper"
 )
 
-func TestInternalScraper(t *testing.T) {
+func TestWebScraper(t *testing.T) {
 	ctx := context.TODO()
-	page, err := scraper.NewInternalWebScraper(&scraper.WebScraperConfiguration{}).Scrap(ctx, "https://about.readflow.app/")
+	page, err := scraper.NewWebScraper(&scraper.WebScraperConfiguration{}).Scrap(ctx, "https://about.readflow.app/")
 	assert.Nil(t, err)
 	assert.NotNil(t, page)
 	assert.Equal(t, "https://about.readflow.app/", page.URL)
@@ -28,7 +28,7 @@ func TestInternalScraperTimeout(t *testing.T) {
 	ctx := context.TODO()
-	_, err := scraper.NewInternalWebScraper(&scraper.WebScraperConfiguration{
+	_, err := scraper.NewWebScraper(&scraper.WebScraperConfiguration{
 		HttpClient: &http.Client{Timeout: time.Second},
 	}).Scrap(ctx, "https://httpstat.us/200?sleep=2000")
 	require.NotNil(t, err)
diff --git a/pkg/scraper/scraper.go b/pkg/scraper/types.go
similarity index 54%
rename from pkg/scraper/scraper.go
rename to pkg/scraper/types.go
index e3b33770..f2fd383d 100644
--- a/pkg/scraper/scraper.go
+++ b/pkg/scraper/types.go
@@ -1,8 +1,8 @@
 package scraper
 
 import (
-	"context"
 	"net/http"
+	"strings"
 )
 
 // WebPage is the result of a web scraping
@@ -18,22 +18,25 @@ type WebPage struct {
 	Favicon     string `json:"favicon,omitempty"`
 }
 
-// WebScraper is an interface with Web Scrapping provider
-type WebScraper interface {
-	Scrap(ctx context.Context, rawurl string) (*WebPage, error)
+// ForwardProxyConfiguration to configure the forward proxy
+type ForwardProxyConfiguration struct {
+	Endpoint string
+	Hosts    []string
 }
 
-// WebScraperConfiguration to configure a Web scraper
-type WebScraperConfiguration struct {
-	HttpClient              *http.Client
-	UserAgent               string
-	ExternalServiceEndpoint string
+// Match tests whether the hostname matches one of the configured hosts ("*" matches any host)
+func (fpc *ForwardProxyConfiguration) Match(hostname string) bool {
+	for _, value := range fpc.Hosts {
+		if value == "*" || strings.HasSuffix(hostname, value) {
+			return true
+		}
+	}
+	return false
 }
 
-// NewWebScraper create new Web Scraping service
-func NewWebScraper(conf *WebScraperConfiguration) (WebScraper, error) {
-	if conf.ExternalServiceEndpoint == "" {
-		return NewInternalWebScraper(conf), nil
-	}
-	return NewExternalWebScraper(conf)
+// WebScraperConfiguration to configure a Web scraper
+type WebScraperConfiguration struct {
+	HttpClient   *http.Client
+	UserAgent    string
+	ForwardProxy *ForwardProxyConfiguration
 }
diff --git a/pkg/scraper/internal.go b/pkg/scraper/web-scraper.go
similarity index 56%
rename from pkg/scraper/internal.go
rename to pkg/scraper/web-scraper.go
index 0d18eda2..25305397 100644
--- a/pkg/scraper/internal.go
+++ b/pkg/scraper/web-scraper.go
@@ -12,22 +12,24 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
-type internalWebScraper struct {
-	httpClient *http.Client
-	userAgent  string
+type WebScraper struct {
+	httpClient   *http.Client
+	userAgent    string
+	forwardProxy *ForwardProxyConfiguration
 }
 
-// NewInternalWebScraper create an internal web scrapping service
-func NewInternalWebScraper(conf *WebScraperConfiguration) WebScraper {
-	return &internalWebScraper{
-		httpClient: utils.If(conf.HttpClient == nil, defaults.HTTPClient, conf.HttpClient),
-		userAgent:  utils.If(conf.UserAgent == "", defaults.UserAgent, conf.UserAgent),
+// NewWebScraper creates a web scraping service
+func NewWebScraper(conf *WebScraperConfiguration) *WebScraper {
+	return &WebScraper{
+		
httpClient: utils.If(conf.HttpClient == nil, defaults.HTTPClient, conf.HttpClient), + userAgent: utils.If(conf.UserAgent == "", defaults.UserAgent, conf.UserAgent), + forwardProxy: conf.ForwardProxy, } } -func (ws internalWebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage, error) { +func (ws WebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage, error) { // Validate URL - _, err := url.ParseRequestURI(rawurl) + pageURL, err := url.ParseRequestURI(rawurl) if err != nil { return nil, fmt.Errorf("invalid URL: %v", err) } @@ -49,7 +51,7 @@ func (ws internalWebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage } // Get URL content - res, err := ws.get(ctx, rawurl) + res, err := ws.get(ctx, pageURL) if err != nil { return nil, err } @@ -59,10 +61,10 @@ func (ws internalWebScraper) Scrap(ctx context.Context, rawurl string) (*WebPage return nil, err } - return ReadWebPage(body, res.Request.URL) + return ReadWebPage(body, pageURL) } -func (ws internalWebScraper) getContentType(ctx context.Context, rawurl string) (string, error) { +func (ws WebScraper) getContentType(ctx context.Context, rawurl string) (string, error) { req, err := http.NewRequest("HEAD", rawurl, http.NoBody) if err != nil { return "", err @@ -76,7 +78,11 @@ func (ws internalWebScraper) getContentType(ctx context.Context, rawurl string) return res.Header.Get("Content-type"), nil } -func (ws internalWebScraper) get(ctx context.Context, rawurl string) (*http.Response, error) { +func (ws WebScraper) get(ctx context.Context, pageURL *url.URL) (*http.Response, error) { + rawurl := pageURL.String() + if ws.forwardProxy != nil && ws.forwardProxy.Endpoint != "" && ws.forwardProxy.Match(pageURL.Hostname()) { + rawurl = strings.ReplaceAll(ws.forwardProxy.Endpoint, "{url}", rawurl) + } req, err := http.NewRequest("GET", rawurl, http.NoBody) if err != nil { return nil, err
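// Illustrative usage sketch (not part of the patch): how a caller might wire up
// the reworked scraper API introduced above. The endpoint, user agent and target
// URL below are placeholders, not values shipped by this change.
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"github.com/ncarlier/readflow/pkg/scraper"
)

func main() {
	// Pages whose hostname matches an entry in ForwardProxy.Hosts are fetched
	// through the forward proxy endpoint, with {url} replaced by the page URL.
	ws := scraper.NewWebScraper(&scraper.WebScraperConfiguration{
		HttpClient: &http.Client{Timeout: 5 * time.Second},
		UserAgent:  "readflow-example/1.0", // placeholder
		ForwardProxy: &scraper.ForwardProxyConfiguration{
			Endpoint: "https://my-html-rewriting.site?url={url}", // placeholder
			Hosts:    []string{"medium.com", "dev.to"},
		},
	})

	page, err := ws.Scrap(context.Background(), "https://dev.to/some-article")
	if err != nil {
		panic(err)
	}
	fmt.Println(page.URL)
}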