diff --git a/.gitignore b/.gitignore index 9046723..e3b34f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -cmd/tests -makefile -logs -tmp \ No newline at end of file +bin/ +tests/ +.idea.md +*.*prof +.vscode/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7c8dbea --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +clean: +start: +init: +build: +usage: +.PHONY: clean start init build usage \ No newline at end of file diff --git a/README.md b/README.md index 314f725..5212bb0 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,67 @@ -## WBot +# WBot A configurable, thread-safe web crawler, provides a minimal interface for crawling and downloading web pages. -### Features: +## Features + - Clean minimal API. - Configurable: MaxDepth, MaxBodySize, Rate Limit, Parrallelism, User Agent & Proxy rotation. - Memory-efficient, thread-safe. - Provides built-in interface: Fetcher, Store, Queue & a Logger. +## API + +WBot provides a minimal API for crawling web pages. + +```go +Run(links ...string) error +OnReponse(fn func(*wbot.Response)) +Metrics() map[string]int64 +Shutdown() +``` + +## Usage + +```go +package main + +import ( + "fmt" + "log" -### [Examples & API](https://github.com/twiny/wbot/wiki) + "github.com/rs/zerolog" + "github.com/twiny/wbot" + "github.com/twiny/wbot/crawler" +) -### TODO -- [ ] Add support for robots.txt. -- [ ] Add test cases. -- [ ] Implement `Fetch` using Chromedp. -- [ ] Add more examples. -- [ ] Add documentation. +func main() { + bot := crawler.New( + crawler.WithParallel(50), + crawler.WithMaxDepth(5), + crawler.WithRateLimit(&wbot.RateLimit{ + Hostname: "*", + Rate: "10/1s", + }), + crawler.WithLogLevel(zerolog.DebugLevel), + ) + defer bot.Shutdown() + + // read responses + bot.OnReponse(func(resp *wbot.Response) { + fmt.Printf("crawled: %s\n", resp.URL.String()) + }) + + if err := bot.Run( + "https://crawler-test.com/", + ); err != nil { + log.Fatal(err) + } + + log.Printf("finished crawling\n") +} + +``` ### Bugs + Bugs or suggestions? Please visit the [issue tracker](https://github.com/twiny/wbot/issues). diff --git a/config.go b/config.go deleted file mode 100644 index ac056ad..0000000 --- a/config.go +++ /dev/null @@ -1,10 +0,0 @@ -package wbot - -// Config -type config struct { - maxDepth int32 - parallel int - maxBodySize int64 - userAgents *rotator - proxies *rotator -} diff --git a/crawler/config.go b/crawler/config.go new file mode 100644 index 0000000..1a7de4b --- /dev/null +++ b/crawler/config.go @@ -0,0 +1,66 @@ +package crawler + +import ( + "runtime" + "time" + + "github.com/twiny/poxa" +) + +const ( + defaultReferrer = "https://www.google.com/search" + defaultUserAgent = "WBot/v0.2.0 (+https://github.com/twiny/wbot)" + defaultTimeout = 10 * time.Second + defaultMaxBodySize = int64(1024 * 1024 * 5) // 5MB +) + +type ( + config struct { + parallel int + maxDepth int32 + maxBodySize int64 + timeout time.Duration + userAgents poxa.Spinner[string] + referrers poxa.Spinner[string] + proxies poxa.Spinner[string] + } +) + +func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config { + if maxDepth <= 0 { + maxDepth = 10 + } + + var conf = &config{ + parallel: runtime.NumCPU(), + maxDepth: maxDepth, + maxBodySize: defaultMaxBodySize, + timeout: defaultTimeout, + userAgents: poxa.NewSpinner(defaultUserAgent), + referrers: poxa.NewSpinner(defaultReferrer), + proxies: nil, + } + + if len(userAgents) > 0 { + uaList := poxa.NewSpinner(userAgents...) + if uaList != nil { + conf.userAgents = uaList + } + } + + if len(referrers) > 0 { + refList := poxa.NewSpinner(referrers...) + if refList != nil { + conf.referrers = refList + } + } + + if len(proxies) > 0 { + proxyList := poxa.NewSpinner(proxies...) + if proxyList != nil { + conf.proxies = proxyList + } + } + + return conf +} diff --git a/crawler/crawler.go b/crawler/crawler.go new file mode 100644 index 0000000..d8388bd --- /dev/null +++ b/crawler/crawler.go @@ -0,0 +1,290 @@ +package crawler + +import ( + "context" + "fmt" + "os" + "os/signal" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/rs/zerolog" + "github.com/twiny/flare" + "github.com/twiny/wbot" + "github.com/twiny/wbot/plugin/fetcher" + "github.com/twiny/wbot/plugin/metrics" + "github.com/twiny/wbot/plugin/queue" + "github.com/twiny/wbot/plugin/store" +) + +const ( + crawlRunning = 0 + crawlStopped = 1 +) + +type ( + Crawler struct { + wg *sync.WaitGroup + cfg *config + + fetcher wbot.Fetcher + store wbot.Store + queue wbot.Queue + metrics wbot.MetricsMonitor + + filter *filter + limiter *rateLimiter + robot *robotManager + + stream chan *wbot.Response + + status int32 + flare flare.Notifier + + logger zerolog.Logger + + ctx context.Context + stop context.CancelFunc + } +) + +func New(opts ...Option) *Crawler { + cw := zerolog.ConsoleWriter{ + Out: os.Stdout, + TimeFormat: time.RFC3339, + NoColor: false, + } + zerolog.SetGlobalLevel(zerolog.TraceLevel) + + logger := zerolog.New(cw).With().Timestamp().Logger() + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) + c := &Crawler{ + wg: new(sync.WaitGroup), + cfg: newConfig(-1, nil, nil, nil), + + fetcher: fetcher.NewHTTPClient(), + store: store.NewInMemoryStore(), + queue: queue.NewInMemoryQueue(2048), + metrics: metrics.NewMetricsMonitor(), + + filter: newFilter(), + limiter: newRateLimiter(), + robot: newRobotManager(false), + + stream: make(chan *wbot.Response, 1024), + + status: crawlStopped, + flare: flare.New(), + logger: logger, + + ctx: ctx, + stop: stop, + } + + for _, opt := range opts { + opt(c) + } + + // this routine waits for quit signal + go func() { + <-c.ctx.Done() + c.logger.Info().Msgf("Crawler is shutting down") + + c.flare.Cancel() + c.queue.Close() + c.store.Close() + c.fetcher.Close() + + c.wg.Wait() + close(c.stream) + }() + + return c +} + +func (c *Crawler) Run(links ...string) error { + var ( + targets []*wbot.ParsedURL + errs []error + ) + + for _, link := range links { + target, err := wbot.NewURL(link) + if err != nil { + errs = append(errs, err) + continue + } + targets = append(targets, target) + } + + if len(errs) > 0 { + return fmt.Errorf("invalid links: %v", errs) + } + + if len(targets) == 0 { + return fmt.Errorf("no valid links") + } + + for _, target := range targets { + c.add(target) + } + + c.status = crawlRunning + + c.logger.Info().Msgf("Starting crawler with %d links", len(targets)) + + c.wg.Add(c.cfg.parallel) + for i := 0; i < c.cfg.parallel; i++ { + go c.crawl(i) + } + + c.wg.Wait() + return nil +} +func (c *Crawler) OnReponse(fn func(*wbot.Response)) { + c.wg.Add(1) + go func() { + defer c.wg.Done() + for { + select { + case <-c.ctx.Done(): + return + case <-c.flare.Done(): + return + case resp, ok := <-c.stream: + if ok { + fn(resp) + } + } + } + }() +} +func (c *Crawler) Metrics() map[string]int64 { + return c.metrics.Metrics() +} +func (c *Crawler) Shutdown() { + c.stop() +} + +func (c *Crawler) add(target *wbot.ParsedURL) { + param := &wbot.Param{ + MaxBodySize: c.cfg.maxBodySize, + UserAgent: c.cfg.userAgents.Next(), + Timeout: c.cfg.timeout, + } + + if c.cfg.proxies != nil { + param.Proxy = c.cfg.proxies.Next() + } + + req := &wbot.Request{ + Target: target, + Param: param, + Depth: 0, + } + + if err := c.queue.Push(c.ctx, req); err != nil { + c.logger.Err(err).Msgf("pop") + return + } +} +func (c *Crawler) crawl(id int) { + defer c.wg.Done() + + for { + select { + case <-c.ctx.Done(): + return + default: + if atomic.LoadInt32(&c.status) == crawlStopped && c.queue.Len() == 0 { + c.flare.Cancel() + return + } + + // not elegant, but just to give the queue some time to fill up + if atomic.LoadInt32(&c.status) == crawlRunning && c.queue.Len() == 0 { + <-time.After(1 * time.Second) + continue + } + + req, err := c.queue.Pop(c.ctx) + if err != nil { + c.logger.Err(err).Msgf("pop") + continue + } + c.metrics.IncTotalRequests() + + // if the next response will exceed the max depth, + // we signal the crawler to stop + if atomic.LoadInt32(&req.Depth) > c.cfg.maxDepth-1 { + atomic.StoreInt32(&c.status, crawlStopped) + } + + c.limiter.wait(req.Target) + + resp, err := c.fetcher.Fetch(c.ctx, req) + if err != nil { + c.metrics.IncFailedRequests() + c.logger.Err(err).Any("target", req.Target.String()).Msgf("fetch") + continue + } + + c.stream <- resp + c.metrics.IncSuccessfulRequests() + + c.logger.Debug().Any("target", req.Target.String()).Msgf("fetched") + + // increment the depth for the next requests + nextDepth := atomic.AddInt32(&req.Depth, 1) + + if nextDepth > c.cfg.maxDepth { + continue + } + + // logging here will just flood the logs + for _, target := range resp.NextURLs { + c.metrics.IncTotalLink() + + if !strings.Contains(target.URL.Host, req.Target.Root) { + c.metrics.IncSkippedLink() + continue + } + + if !c.robot.Allowed(req.Param.UserAgent, req.Target.URL.String()) { + c.metrics.IncSkippedLink() + // todo: log + continue + } + + if !c.filter.allow(target) { + c.metrics.IncSkippedLink() + continue + } + + if visited, err := c.store.HasVisited(c.ctx, target); visited { + if err != nil { + c.logger.Err(err).Msgf("store") + } + c.metrics.IncDuplicatedLink() + continue + } + + nextReq := &wbot.Request{ + Target: target, + Depth: req.Depth, + Param: req.Param, + } + + if err := c.queue.Push(c.ctx, nextReq); err != nil { + c.logger.Err(err).Any("target", target.String()).Msgf("push") + continue + } + + c.metrics.IncCrawledLink() + } + } + } +} diff --git a/crawler/filter.go b/crawler/filter.go new file mode 100644 index 0000000..7d004c6 --- /dev/null +++ b/crawler/filter.go @@ -0,0 +1,57 @@ +package crawler + +import ( + "regexp" + + "github.com/twiny/wbot" +) + +var ( + badExtensions = regexp.MustCompile(`\.(png|jpg|jpeg|gif|ico|eps|pdf|iso|mp3|mp4|zip|aif|mpa|wav|wma|7z|deb|pkg|rar|rpm|bin|dmg|dat|tar|exe|ps|psd|svg|tif|tiff|pps|ppt|pptx|xls|xlsx|wmv|doc|docx|txt|mov|mpl|css|js)$`) +) + +type ( + filter struct { + rules map[string]*wbot.FilterRule + } +) + +func newFilter(rules ...*wbot.FilterRule) *filter { + f := &filter{ + rules: make(map[string]*wbot.FilterRule), + } + + for _, rule := range rules { + f.rules[rule.Hostname] = rule + } + + return f +} +func (f *filter) allow(u *wbot.ParsedURL) bool { + if badExtensions.MatchString(u.URL.Path) { + return false + } + + rule, found := f.rules[u.Root] + if !found { + // check if there is a wildcard rule + rule, found = f.rules["*"] + if !found { + return true + } + } + + for _, pattern := range rule.Disallow { + if pattern.MatchString(u.URL.String()) { + return false + } + } + + for _, pattern := range rule.Allow { + if pattern.MatchString(u.URL.String()) { + return true + } + } + + return false // default deny +} diff --git a/crawler/limiter.go b/crawler/limiter.go new file mode 100644 index 0000000..eb7cf0d --- /dev/null +++ b/crawler/limiter.go @@ -0,0 +1,90 @@ +package crawler + +import ( + "strconv" + "strings" + "time" + + "github.com/twiny/ratelimit" + "github.com/twiny/wbot" +) + +var ( + defaultRateLimit = "10/1s" +) + +type ( + rateLimiter struct { + table map[string]*ratelimit.Limiter + } +) + +func newRateLimiter(limits ...*wbot.RateLimit) *rateLimiter { + rl := &rateLimiter{ + table: make(map[string]*ratelimit.Limiter), + } + + // Handle the default wildcard limit. + hasWildcard := false + if len(limits) > 0 { + for _, limit := range limits { + if limit.Hostname == "*" { + hasWildcard = true + break + } + } + } + + if !hasWildcard { + limits = append(limits, &wbot.RateLimit{ + Hostname: "*", + Rate: defaultRateLimit, + }) + } + + for _, rate := range limits { + r, l := parseRateLimit(rate.Rate) + rl.table[rate.Hostname] = ratelimit.NewLimiter(r, l) + } + + return rl +} +func (l *rateLimiter) wait(u *wbot.ParsedURL) { + limit, found := l.table[u.Root] + if !found { + limit = l.table["*"] + } + + limit.Take() +} + +func parseRateLimit(s string) (rate int, interval time.Duration) { + parts := strings.Split(s, "/") + if len(parts) != 2 { + return parseRateLimit(defaultRateLimit) + } + + rate, err := strconv.Atoi(parts[0]) + if err != nil { + return parseRateLimit(defaultRateLimit) + } + + intervalValueStr := parts[1][:len(parts[1])-1] + intervalValue, err := strconv.Atoi(intervalValueStr) + if err != nil { + return parseRateLimit(defaultRateLimit) + } + + switch parts[1][len(parts[1])-1] { + case 's', 'S': + interval = time.Duration(intervalValue) * time.Second + case 'm', 'M': + interval = time.Duration(intervalValue) * time.Minute + case 'h', 'H': + interval = time.Duration(intervalValue) * time.Hour + default: + return parseRateLimit(defaultRateLimit) + } + + return rate, interval +} diff --git a/crawler/option.go b/crawler/option.go new file mode 100644 index 0000000..979625d --- /dev/null +++ b/crawler/option.go @@ -0,0 +1,62 @@ +package crawler + +import ( + "github.com/rs/zerolog" + "github.com/twiny/poxa" + "github.com/twiny/wbot" +) + +type ( + Option func(c *Crawler) +) + +func WithParallel(parallel int) Option { + return func(c *Crawler) { + c.cfg.parallel = parallel + } +} +func WithMaxDepth(maxDepth int32) Option { + return func(c *Crawler) { + c.cfg.maxDepth = maxDepth + } +} +func WithUserAgents(userAgents []string) Option { + return func(c *Crawler) { + c.cfg.userAgents = poxa.NewSpinner(userAgents...) + } +} +func WithProxies(proxies []string) Option { + return func(c *Crawler) { + c.cfg.proxies = poxa.NewSpinner(proxies...) + } +} +func WithRateLimit(rates ...*wbot.RateLimit) Option { + return func(c *Crawler) { + c.limiter = newRateLimiter(rates...) + } +} +func WithFilter(rules ...*wbot.FilterRule) Option { + return func(c *Crawler) { + c.filter = newFilter(rules...) + } +} +func WithFetcher(fetcher wbot.Fetcher) Option { + return func(c *Crawler) { + c.fetcher = fetcher + } +} +func WithStore(store wbot.Store) Option { + return func(c *Crawler) { + c.store = store + } +} +func WithQueue(queue wbot.Queue) Option { + return func(c *Crawler) { + c.queue = queue + } +} +func WithLogLevel(level zerolog.Level) Option { + return func(c *Crawler) { + c.logger = c.logger.Level(level) + } +} diff --git a/crawler/robot.go b/crawler/robot.go new file mode 100644 index 0000000..63cefa6 --- /dev/null +++ b/crawler/robot.go @@ -0,0 +1,41 @@ +package crawler + +import ( + "github.com/temoto/robotstxt" +) + +const ( + robotstxtPath = "/robots.txt" +) + +type robotManager struct { + followRobots bool + robots map[string]*robotstxt.RobotsData +} + +func newRobotManager(follow bool) *robotManager { + return &robotManager{ + followRobots: follow, + robots: make(map[string]*robotstxt.RobotsData), + } +} + +func (rm *robotManager) AddRobotsTxt(hostname string, body []byte) error { + data, err := robotstxt.FromBytes(body) + if err != nil { + return err // Return the error if parsing fails. + } + + rm.robots[hostname] = data + return nil +} +func (rm *robotManager) Allowed(userAgent, url string) bool { + hostname := url // Simplification; use proper URL parsing in production. + + robotsData, exists := rm.robots[hostname] + if !exists { + return true + } + + return robotsData.TestAgent(url, userAgent) +} diff --git a/fetcher.go b/fetcher.go deleted file mode 100644 index 0875087..0000000 --- a/fetcher.go +++ /dev/null @@ -1,150 +0,0 @@ -package wbot - -import ( - "bytes" - "io" - "net" - "net/http" - "net/url" - "time" - - "github.com/PuerkitoBio/goquery" -) - -// Fetcher -type Fetcher interface { - Fetch(req Request) (Response, error) - Close() error -} - -// Default Fetcher - -// -var ( - defaultUserAgent = `wbot/0.1` -) - -// Fetcher -type fetcher struct { - cli *http.Client -} - -// defaultFetcher -func defaultFetcher() *fetcher { - return &fetcher{ - cli: newHTTPClient(), - } -} - -// Fetch -func (f *fetcher) Fetch(req Request) (Response, error) { - var ( - userAgent = defaultUserAgent - maxBodySize = int64(1024 * 1024 * 10) - ) - - if req.Param.UserAgent != "" { - userAgent = req.Param.UserAgent - } - - if req.Param.MaxBodySize > 0 { - maxBodySize = req.Param.MaxBodySize - } - - // add headers - var header = make(http.Header) - header.Set("User-Agent", userAgent) - header.Set("Referer", req.Param.Referer) - - f.cli.Transport = newHTTPTransport(req.Param.Proxy) - - resp, err := f.cli.Do(&http.Request{ - Method: http.MethodGet, - URL: req.URL, - Header: header, - Proto: "HTTP/1.1", - ProtoMajor: 1, - ProtoMinor: 1, - }) - if err != nil { - return Response{}, err - } - - // Limit response body reading - bodyReader := io.LimitReader(resp.Body, maxBodySize) - - body, err := io.ReadAll(bodyReader) - if err != nil { - return Response{}, err - } - - nextURLs := findLinks(body) - - resp.Body.Close() - - return Response{ - URL: req.URL, - Status: resp.StatusCode, - Body: body, - NextURLs: nextURLs, - Depth: req.Depth, - }, nil -} - -// Close -func (f *fetcher) Close() error { - f.cli.CloseIdleConnections() - return nil -} - -// newHTTPClient -func newHTTPClient() *http.Client { - return &http.Client{ - Jar: http.DefaultClient.Jar, - Timeout: 5 * time.Second, - } -} - -// newHTTPTransport -func newHTTPTransport(purl string) *http.Transport { - var proxy = http.ProxyFromEnvironment - - if purl != "" { - proxy = func(req *http.Request) (*url.URL, error) { - return url.Parse(purl) - } - } - return &http.Transport{ - Proxy: proxy, - DialContext: (&net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, - DualStack: true, - }).DialContext, - ForceAttemptHTTP2: true, - MaxIdleConns: 100, // Default: 100 - MaxIdleConnsPerHost: 2, // Default: 2 - IdleConnTimeout: 10 * time.Second, - TLSHandshakeTimeout: 5 * time.Second, - ExpectContinueTimeout: 1 * time.Second, - // DisableKeepAlives: true, // twiny - } -} - -// linkFinder finds links in a response -func findLinks(body []byte) []string { - var hrefs []string - - doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) - if err != nil { - return hrefs - } - - doc.Find("a[href]").Each(func(index int, item *goquery.Selection) { - if href, found := item.Attr("href"); found { - hrefs = append(hrefs, href) - } - }) - - return hrefs -} diff --git a/filter.go b/filter.go deleted file mode 100644 index 6339601..0000000 --- a/filter.go +++ /dev/null @@ -1,60 +0,0 @@ -package wbot - -import ( - "net/url" - "regexp" -) - -var ( - badExtensions = regexp.MustCompile(`^.*\.(png|jpg|jpeg|gif|ico|eps|pdf|iso|mp3|mp4|zip|aif|mpa|wav|wma|7z|deb|pkg|rar|rpm|bin|dmg|dat|tar|exe|ps|psd|svg|tif|tiff|pps|ppt|pptx|xls|xlsx|wmv|doc|docx|txt|mov|mpl)$`) -) - -// -// Filter -type filter struct { - allowed []*regexp.Regexp - disallowed []*regexp.Regexp -} - -// newFilter -func newFilter(allowed, disallowed []string) *filter { - var f = &filter{ - allowed: make([]*regexp.Regexp, 0), - disallowed: make([]*regexp.Regexp, 0), - } - - for _, p := range allowed { - f.allowed = append(f.allowed, regexp.MustCompile(p)) - } - - for _, p := range disallowed { - f.disallowed = append(f.disallowed, regexp.MustCompile(p)) - } - - return f -} - -// Allow -func (f *filter) Allow(l *url.URL) bool { - raw := l.String() - - if badExtensions.MatchString(l.Path) { - return false - } - - // disallowed - for _, d := range f.disallowed { - if d.MatchString(raw) { - return false - } - } - - // allowed - for _, a := range f.allowed { - if !a.MatchString(raw) { - return false - } - } - - return true -} diff --git a/go.mod b/go.mod index f24e2b1..cc34cf0 100644 --- a/go.mod +++ b/go.mod @@ -1,16 +1,24 @@ module github.com/twiny/wbot -go 1.18 +go 1.22.0 require ( - github.com/PuerkitoBio/goquery v1.8.0 + github.com/PuerkitoBio/goquery v1.8.1 + github.com/rs/zerolog v1.32.0 + github.com/temoto/robotstxt v1.1.2 + github.com/twiny/flare v0.1.0 + github.com/twiny/poxa v0.1.0 github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac - go.etcd.io/bbolt v1.3.5 - golang.org/x/net v0.0.0-20220513224357-95641704303c + github.com/weppos/publicsuffix-go v0.30.1 ) require ( github.com/andybalholm/cascadia v1.3.1 // indirect github.com/benbjohnson/clock v1.3.0 // indirect - golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.19 // indirect + github.com/stretchr/testify v1.8.0 // indirect + golang.org/x/net v0.17.0 // indirect + golang.org/x/sys v0.13.0 // indirect + golang.org/x/text v0.13.0 // indirect ) diff --git a/go.sum b/go.sum index 8ca3830..5ca2f34 100644 --- a/go.sum +++ b/go.sum @@ -1,21 +1,121 @@ -github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= -github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= +github.com/ProtonMail/go-crypto v0.0.0-20230217124315-7d5c6f04bbb8/go.mod h1:I0gYDMZ6Z5GRU7l58bNFSkPTFN6Yl12dsUlAZ8xy98g= +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A= github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= +github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= +github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-github/v50 v50.2.0/go.mod h1:VBY8FB6yPIjrtKhozXv4FQupxKLS6H4m6xFZlT43q8Q= +github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= +github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= +github.com/rs/zerolog v1.32.0 h1:keLypqrlIjaFsbmJOBdB/qvyF8KEtCWHwobLp5l/mQ0= +github.com/rs/zerolog v1.32.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/twiny/flare v0.1.0 h1:bq50IXYNUpiJULoIXXwerL1gwr+KBz49ayYgQo/CqnY= +github.com/twiny/flare v0.1.0/go.mod h1:Rlzkek5PDlOGFue015tC7fe/ROyeSx3hqy+jfAZGezQ= +github.com/twiny/poxa v0.1.0 h1:NMM1ZeRfGFVOz60NjHR4r78pQYkq09VyOjKjdkhkWsE= +github.com/twiny/poxa v0.1.0/go.mod h1:zTPmnK5Ta+Ro+HL1R/LREGg3LNqs/bpNcEWlUipKl7A= github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac h1:nT+8DFvrU5Nu3Be2bK7LooU8AslFJeypQoAF+wm1CM0= github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac/go.mod h1:C589KqlnfcMeRAJ+evrNJwSf9ddkXO926hRDtgjjoYM= -go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0= -go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= +github.com/weppos/publicsuffix-go v0.30.1 h1:8q+QwBS1MY56Zjfk/50ycu33NN8aa1iCCEQwo/71Oos= +github.com/weppos/publicsuffix-go v0.30.1/go.mod h1:s41lQh6dIsDWIC1OWh7ChWJXLH0zkJ9KHZVqA7vHyuQ= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= +golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20220513224357-95641704303c h1:nF9mHSvoKBLkQNQhJZNsc66z2UzAMUbLGjC95CF3pU0= -golang.org/x/net v0.0.0-20220513224357-95641704303c/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM= -golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/limiter.go b/limiter.go deleted file mode 100644 index 651c5c0..0000000 --- a/limiter.go +++ /dev/null @@ -1,37 +0,0 @@ -package wbot - -import ( - "net/url" - "time" - - "github.com/twiny/ratelimit" -) - -// Limiter -type limiter struct { - rate int - duration time.Duration - list map[string]*ratelimit.Limiter -} - -// newLimiter -func newLimiter(r int, d time.Duration) *limiter { - return &limiter{ - rate: r, - duration: d, - list: make(map[string]*ratelimit.Limiter), - } -} - -// Take -func (l *limiter) take(u *url.URL) { - hostname := u.Hostname() - - limit, found := l.list[hostname] - if !found { - limit = ratelimit.NewLimiter(l.rate, l.duration) - l.list[hostname] = limit - } - - limit.Take() -} diff --git a/logger.go b/logger.go deleted file mode 100644 index 7ce739b..0000000 --- a/logger.go +++ /dev/null @@ -1,30 +0,0 @@ -package wbot - -// Logger -type Logger interface { - Send(rep Report) - Close() error -} - -// Report -type Report struct { - RequestURL string - Status int - Depth int32 - Err error -} - -// newReport -func newReport(resp Response, err error) Report { - requestURL := "" - if resp.URL != nil { - requestURL = resp.URL.String() - } - // - return Report{ - RequestURL: requestURL, - Status: resp.Status, - Depth: resp.Depth, - Err: err, - } -} diff --git a/option.go b/option.go deleted file mode 100644 index b722a53..0000000 --- a/option.go +++ /dev/null @@ -1,85 +0,0 @@ -package wbot - -import ( - "time" -) - -// Option -type Option func(*WBot) - -// SetFetcher -func SetFetcher(f Fetcher) Option { - return func(w *WBot) { - w.fetcher = f - } -} - -// SetStore -func SetStore(s Store) Option { - return func(w *WBot) { - w.store = s - } -} - -// SetQueue -func SetQueue(q Queue) Option { - return func(w *WBot) { - w.queue = q - } -} - -// SetLogger -func SetLogger(l Logger) Option { - return func(w *WBot) { - w.log = l - } -} - -// SetLimiter -func SetRateLimit(rate int, interval time.Duration) Option { - return func(w *WBot) { - w.limit = newLimiter(rate, interval) - } -} - -// SetFilter -func SetFilter(allowed, disallowed []string) Option { - return func(w *WBot) { - w.filter = newFilter(allowed, disallowed) - } -} - -// SetMaxDepth -func SetMaxDepth(depth int32) Option { - return func(w *WBot) { - w.conf.maxDepth = depth - } -} - -// SetParallel -func SetParallel(parallel int) Option { - return func(w *WBot) { - w.conf.parallel = parallel - } -} - -// SetMaxBodySize -func SetMaxBodySize(size int64) Option { - return func(w *WBot) { - w.conf.maxBodySize = size - } -} - -// SetUserAgents -func SetUserAgents(agents []string) Option { - return func(w *WBot) { - w.conf.userAgents = newRotator(agents) - } -} - -// SetProxies -func SetProxies(proxies []string) Option { - return func(w *WBot) { - w.conf.proxies = newRotator(proxies) - } -} diff --git a/plugin/fetcher/chromedp.go b/plugin/fetcher/headless.go similarity index 100% rename from plugin/fetcher/chromedp.go rename to plugin/fetcher/headless.go diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go new file mode 100644 index 0000000..9c0b2f0 --- /dev/null +++ b/plugin/fetcher/http_client.go @@ -0,0 +1,169 @@ +package fetcher + +import ( + "bytes" + "context" + "io" + "net" + "net/http" + "net/url" + "sync" + "time" + + "github.com/twiny/wbot" +) + +type ( + defaultHTTPClient struct { + client *http.Client + bufferPool *sync.Pool + } +) + +func NewHTTPClient() wbot.Fetcher { + var ( + fn = func() any { + return new(bytes.Buffer) + } + ) + + return &defaultHTTPClient{ + client: &http.Client{ + Jar: http.DefaultClient.Jar, + Timeout: 10 * time.Second, + Transport: &http.Transport{ + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 10 * time.Second, + DualStack: true, + }).DialContext, + ForceAttemptHTTP2: true, + MaxIdleConns: 100, // Default: 100 + MaxIdleConnsPerHost: 2, // Default: 2 + IdleConnTimeout: 10 * time.Second, + TLSHandshakeTimeout: 5 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + // DisableKeepAlives: true, + }, + }, + bufferPool: &sync.Pool{ + New: fn, + }, + } +} + +func (f *defaultHTTPClient) Fetch(ctx context.Context, req *wbot.Request) (*wbot.Response, error) { + var ( + respCh = make(chan *wbot.Response, 1) + fetchErr error + ) + + fctx, done := context.WithTimeout(ctx, req.Param.Timeout) + defer done() + + go func() { + resp, err := f.fetch(req) + if err != nil { + fetchErr = err + return + } + respCh <- resp + }() + + for { + select { + case <-fctx.Done(): + return nil, fctx.Err() + case resp := <-respCh: + if fetchErr != nil { + return nil, fetchErr + } + return resp, nil + } + } +} +func (f *defaultHTTPClient) Close() error { + f.client.CloseIdleConnections() + return nil +} + +func (f *defaultHTTPClient) fetch(req *wbot.Request) (*wbot.Response, error) { + var header = make(http.Header) + header.Set("User-Agent", req.Param.UserAgent) + header.Set("Referer", req.Param.Referer) + + if req.Param.Proxy != "" { + f.client.Transport = newHTTPTransport(req.Param.Proxy) + } + + resp, err := f.client.Do(&http.Request{ + Method: http.MethodGet, + URL: req.Target.URL, + Header: header, + Proto: "HTTP/1.1", + ProtoMajor: 1, + ProtoMinor: 1, + }) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + buf := f.bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer f.bufferPool.Put(buf) + + // Limit response body reading + if _, err := io.CopyN(buf, resp.Body, req.Param.MaxBodySize); err != nil && err != io.EOF { + return nil, err + } + + bytes := buf.Bytes() + + links := wbot.FindLinks(bytes) + + var nextURLs []*wbot.ParsedURL + for _, link := range links { + absURL, err := req.ResolveURL(link) + if err != nil { + continue + } + parsedURL, err := wbot.NewURL(absURL.String()) + if err != nil { + continue + } + nextURLs = append(nextURLs, parsedURL) + } + + return &wbot.Response{ + URL: req.Target, + Status: resp.StatusCode, + Body: bytes, + NextURLs: nextURLs, + Depth: req.Depth, + }, nil +} +func newHTTPTransport(purl string) *http.Transport { + var proxy = http.ProxyFromEnvironment + + if purl != "" { + proxy = func(req *http.Request) (*url.URL, error) { + return url.Parse(purl) + } + } + return &http.Transport{ + Proxy: proxy, + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 10 * time.Second, + DualStack: true, + }).DialContext, + ForceAttemptHTTP2: false, + MaxIdleConns: 10, // Default: 100 + MaxIdleConnsPerHost: 5, // Default: 2 + IdleConnTimeout: 10 * time.Second, + TLSHandshakeTimeout: 2 * time.Second, + ExpectContinueTimeout: 2 * time.Second, + DisableKeepAlives: true, + } +} diff --git a/plugin/metrics/metrics.go b/plugin/metrics/metrics.go new file mode 100644 index 0000000..b612b80 --- /dev/null +++ b/plugin/metrics/metrics.go @@ -0,0 +1,55 @@ +package metrics + +import ( + "sync/atomic" +) + +type ( + metricsMonitor struct { + totalRequests int64 + successfulRequests int64 + failedRequests int64 + + totalLink int64 + crawledLink int64 + skippedLink int64 + duplicatedLink int64 + } +) + +func NewMetricsMonitor() *metricsMonitor { + return &metricsMonitor{} +} + +func (m *metricsMonitor) IncTotalRequests() { + atomic.AddInt64(&m.totalRequests, 1) +} +func (m *metricsMonitor) IncSuccessfulRequests() { + atomic.AddInt64(&m.successfulRequests, 1) +} +func (m *metricsMonitor) IncFailedRequests() { + atomic.AddInt64(&m.failedRequests, 1) +} +func (m *metricsMonitor) IncTotalLink() { + atomic.AddInt64(&m.totalLink, 1) +} +func (m *metricsMonitor) IncCrawledLink() { + atomic.AddInt64(&m.crawledLink, 1) +} +func (m *metricsMonitor) IncSkippedLink() { + atomic.AddInt64(&m.skippedLink, 1) +} +func (m *metricsMonitor) IncDuplicatedLink() { + atomic.AddInt64(&m.duplicatedLink, 1) +} +func (m *metricsMonitor) Metrics() map[string]int64 { + return map[string]int64{ + "total_requests": atomic.LoadInt64(&m.totalRequests), + "successful_requests": atomic.LoadInt64(&m.successfulRequests), + "failed_requests": atomic.LoadInt64(&m.failedRequests), + "total_link": atomic.LoadInt64(&m.totalLink), + "crawled_link": atomic.LoadInt64(&m.crawledLink), + "skipped_link": atomic.LoadInt64(&m.skippedLink), + "duplicated_link": atomic.LoadInt64(&m.duplicatedLink), + } +} diff --git a/plugin/queue/bolt.go b/plugin/queue/bolt.go deleted file mode 100644 index e6085a5..0000000 --- a/plugin/queue/bolt.go +++ /dev/null @@ -1,108 +0,0 @@ -package queue - -import ( - "bytes" - "encoding/binary" - "encoding/gob" - "errors" - - "github.com/twiny/wbot" - "go.etcd.io/bbolt" -) - -var ( - ErrEmptyQueue = errors.New("queue is empty") -) - -// prefix -var prefix = "queue" - -// BQueue -type BQueue struct { - prefix string - db *bbolt.DB // Bolt stores its keys in byte-sorted order within a bucket. -} - -// NewBQueue -func NewBQueue(db *bbolt.DB) (wbot.Queue, error) { - if err := db.Update(func(tx *bbolt.Tx) error { - _, err := tx.CreateBucketIfNotExists([]byte(prefix)) - return err - }); err != nil { - return nil, err - } - - return &BQueue{ - prefix: prefix, - db: db, - }, nil -} - -// Enqueue -func (bq *BQueue) Enqueue(req wbot.Request) error { - var buf bytes.Buffer - if err := gob.NewEncoder(&buf).Encode(req); err != nil { - return err - } - - return bq.db.Update(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(prefix)) - - var key = make([]byte, 8) - seq, err := bu.NextSequence() - if err != nil { - return err - } - - binary.BigEndian.PutUint64(key, seq) - - return bu.Put(key, buf.Bytes()) - }) -} - -// Dequeue -func (bq *BQueue) Dequeue() (wbot.Request, error) { - // get from db - var req wbot.Request - if err := bq.db.Update(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(prefix)) - - c := bu.Cursor() - - k, v := c.First() - if k == nil { - return ErrEmptyQueue - } - - if err := gob.NewDecoder(bytes.NewReader(v)).Decode(&req); err != nil { - return err - } - - return c.Delete() - }); err != nil { - return wbot.Request{}, err - } - - return req, nil -} - -// Next -func (bq *BQueue) Next() bool { - return bq.db.View(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(bq.prefix)) - - c := bu.Cursor() - - k, _ := c.First() - if k == nil { - return ErrEmptyQueue - } - - return nil - }) == nil -} - -// Close -func (bq *BQueue) Close() error { - return bq.db.Close() -} diff --git a/plugin/queue/queue.go b/plugin/queue/queue.go new file mode 100644 index 0000000..fcb4fbe --- /dev/null +++ b/plugin/queue/queue.go @@ -0,0 +1,59 @@ +package queue + +import ( + "context" + "fmt" + "sync" + + "github.com/twiny/wbot" +) + +/* +read first page add requests to queue +if request depth is exceeded return +*/ +type defaultInMemoryQueue struct { + mu *sync.RWMutex + list []*wbot.Request +} + +func NewInMemoryQueue(size int) wbot.Queue { + q := &defaultInMemoryQueue{ + mu: new(sync.RWMutex), + list: make([]*wbot.Request, 0, size), + } + + return q +} + +func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) error { + q.mu.Lock() + defer q.mu.Unlock() + + q.list = append(q.list, req) + + return nil +} +func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) { + q.mu.Lock() + defer q.mu.Unlock() + + if len(q.list) == 0 { + return nil, fmt.Errorf("queue is empty") + } + + req := q.list[0] + q.list = q.list[1:] + + return req, nil +} +func (q *defaultInMemoryQueue) Len() int32 { + q.mu.RLock() + defer q.mu.RUnlock() + + return int32(len(q.list)) +} +func (q *defaultInMemoryQueue) Close() error { + clear(q.list) + return nil +} diff --git a/plugin/store/bbolt.go b/plugin/store/bbolt.go deleted file mode 100644 index b5fef0d..0000000 --- a/plugin/store/bbolt.go +++ /dev/null @@ -1,68 +0,0 @@ -package store - -import ( - "crypto/sha256" - "encoding/hex" - "fmt" - "strings" - - "github.com/twiny/wbot" - "go.etcd.io/bbolt" -) - -var ( - prefix = "store" -) - -// BStore -type BStore struct { - prefix string - db *bbolt.DB -} - -// NewBStore -func NewBStore(db *bbolt.DB) (wbot.Store, error) { - // create bucket for store - if err := db.Update(func(tx *bbolt.Tx) error { - _, err := tx.CreateBucketIfNotExists([]byte(prefix)) - return err - }); err != nil { - return nil, err - } - - return &BStore{ - prefix: prefix, - db: db, - }, nil -} - -// Visited -func (bs *BStore) Visited(link string) bool { - sum := sha256.Sum224([]byte(link)) - - // - key := strings.Join([]string{ - bs.prefix, - hex.EncodeToString(sum[:]), - }, "_") - - return bs.db.Update(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(prefix)) - - d := bu.Get([]byte(key)) - // if d == nil means not found - if d == nil { - if err := bu.Put([]byte(key), []byte(link)); err != nil { - return err - } - return nil - } - - return fmt.Errorf("visited") - }) != nil -} - -// Close -func (bs *BStore) Close() error { - return bs.db.Close() -} diff --git a/plugin/store/in_memory.go b/plugin/store/in_memory.go new file mode 100644 index 0000000..5f0a4c2 --- /dev/null +++ b/plugin/store/in_memory.go @@ -0,0 +1,40 @@ +package store + +import ( + "context" + "sync" + + "github.com/twiny/wbot" +) + +type ( + defaultInMemoryStore struct { + mu sync.Mutex + table map[string]bool + } +) + +func NewInMemoryStore() wbot.Store { + return &defaultInMemoryStore{ + table: make(map[string]bool), + } +} +func (s *defaultInMemoryStore) HasVisited(ctx context.Context, link *wbot.ParsedURL) (bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + + _, found := s.table[link.Hash] + if !found { + s.table[link.Hash] = true + return false, nil + } + + return found, nil +} +func (s *defaultInMemoryStore) Close() error { + s.mu.Lock() + defer s.mu.Unlock() + + clear(s.table) + return nil +} diff --git a/plugin/store/in_memory_test.go b/plugin/store/in_memory_test.go new file mode 100644 index 0000000..72440ea --- /dev/null +++ b/plugin/store/in_memory_test.go @@ -0,0 +1 @@ +package store diff --git a/queue.go b/queue.go deleted file mode 100644 index 2179d73..0000000 --- a/queue.go +++ /dev/null @@ -1,63 +0,0 @@ -package wbot - -import ( - "fmt" - "sync" -) - -// Queue -type Queue interface { - Enqueue(req Request) error - Dequeue() (Request, error) - Next() bool - Close() error -} - -// Default Queue - -// Queue -type queue[T any] struct { - mu *sync.Mutex - q []T -} - -// NewQueue -func defaultQueue[T any]() *queue[T] { - return &queue[T]{ - mu: &sync.Mutex{}, - q: make([]T, 0), - } -} - -// Enqueue -func (q *queue[T]) Enqueue(item T) error { - q.mu.Lock() - defer q.mu.Unlock() - q.q = append(q.q, item) - - return nil -} - -// Dequeue -func (q *queue[T]) Dequeue() (T, error) { - q.mu.Lock() - defer q.mu.Unlock() - if len(q.q) == 0 { - var t T - return t, fmt.Errorf("queue is empty") - } - r := q.q[0] - q.q = q.q[1:] - return r, nil -} - -// Next -func (q *queue[T]) Next() bool { - return len(q.q) != 0 -} - -// Close -func (q *queue[T]) Close() error { - q.q = nil - return nil -} diff --git a/request.go b/request.go deleted file mode 100644 index beb09a9..0000000 --- a/request.go +++ /dev/null @@ -1,61 +0,0 @@ -package wbot - -import ( - "fmt" - "net/url" - "strings" - - "golang.org/x/net/publicsuffix" -) - -// Request -type Request struct { - BaseDomain string - URL *url.URL - Depth int32 - Param Param -} - -// param -type Param struct { - Referer string - MaxBodySize int64 - UserAgent string - Proxy string -} - -// newRequest -func newRequest(raw string, depth int32, p Param) (Request, error) { - u, err := url.Parse(raw) - if err != nil { - return Request{}, err - } - - baseDomain, err := publicsuffix.EffectiveTLDPlusOne(u.Hostname()) - if err != nil { - return Request{}, err - } - - return Request{ - BaseDomain: baseDomain, - URL: u, - Depth: depth, - Param: p, - }, nil -} - -// AbsURL -func (r *Request) AbsURL(u string) (*url.URL, error) { - if strings.HasPrefix(u, "#") { - return nil, fmt.Errorf("url is a fragment") - } - - absURL, err := r.URL.Parse(u) - if err != nil { - return nil, err - } - - absURL.Fragment = "" - - return absURL, nil -} diff --git a/response.go b/response.go deleted file mode 100644 index 5c83a40..0000000 --- a/response.go +++ /dev/null @@ -1,12 +0,0 @@ -package wbot - -import "net/url" - -// Response -type Response struct { - URL *url.URL - Status int - Body []byte - NextURLs []string - Depth int32 -} diff --git a/rotator.go b/rotator.go deleted file mode 100644 index ea8c5ef..0000000 --- a/rotator.go +++ /dev/null @@ -1,39 +0,0 @@ -package wbot - -import ( - "container/ring" -) - -// rotator -type rotator struct { - r *ring.Ring -} - -// newRotator -func newRotator(s []string) *rotator { - r := ring.New(len(s)) - for _, item := range s { - r.Value = item - r = r.Next() - } - return &rotator{ - r: r, - } -} - -// Next -func (r *rotator) next() string { - if r.r == nil { - return "" - } - - val, ok := r.r.Value.(string) - if !ok { - return "" - } - - // move - r.r = r.r.Next() - - return val -} diff --git a/store.go b/store.go deleted file mode 100644 index 5b444b7..0000000 --- a/store.go +++ /dev/null @@ -1,47 +0,0 @@ -package wbot - -import "sync" - -// Store -type Store interface { - Visited(link string) bool - Close() error -} - -// Default Store - -// - -// Store -type store[T comparable] struct { - mu *sync.Mutex - visited map[T]bool -} - -// NewStore -func defaultStore[T comparable]() *store[T] { - return &store[T]{ - mu: &sync.Mutex{}, - visited: make(map[T]bool), - } -} - -// Visited -func (s *store[T]) Visited(k T) bool { - s.mu.Lock() - defer s.mu.Unlock() - _, ok := s.visited[k] - - // add if not visited - if !ok { - s.visited[k] = true - } - - return ok -} - -// Close -func (s *store[T]) Close() error { - s.visited = nil - return nil -} diff --git a/utilities.go b/utilities.go new file mode 100644 index 0000000..4918895 --- /dev/null +++ b/utilities.go @@ -0,0 +1,52 @@ +package wbot + +import ( + "fmt" + "net/url" + "strings" + + "github.com/weppos/publicsuffix-go/publicsuffix" +) + +var tlds = map[string]bool{ + "ac": true, "ae": true, "aero": true, "af": true, "ag": true, "am": true, + "as": true, "asia": true, "at": true, "au": true, "ax": true, "be": true, + "bg": true, "bi": true, "biz": true, "bj": true, "br": true, "by": true, + "ca": true, "cat": true, "cc": true, "cl": true, "cn": true, "co": true, + "com": true, "coop": true, "cx": true, "de": true, "dev": true, "dk": true, + "dm": true, "dz": true, "edu": true, "ee": true, "eu": true, "fi": true, "fo": true, + "fr": true, "ge": true, "gl": true, "gov": true, "gs": true, "hk": true, + "hr": true, "hu": true, "id": true, "ie": true, "in": true, "info": true, + "int": true, "io": true, "ir": true, "is": true, "je": true, "jobs": true, + "kg": true, "kr": true, "la": true, "lu": true, "lv": true, "ly": true, + "ma": true, "md": true, "me": true, "mk": true, "mobi": true, "ms": true, + "mu": true, "mx": true, "name": true, "net": true, "nf": true, "ng": true, + "no": true, "nu": true, "nz": true, "org": true, "pl": true, "pr": true, + "pro": true, "pw": true, "ro": true, "ru": true, "sc": true, "se": true, + "sg": true, "sh": true, "si": true, "sk": true, "sm": true, "st": true, + "so": true, "su": true, "tc": true, "tel": true, "tf": true, "th": true, + "tk": true, "tl": true, "tm": true, "tn": true, "travel": true, "tw": true, + "tv": true, "tz": true, "ua": true, "uk": true, "us": true, "uz": true, + "vc": true, "ve": true, "vg": true, "ws": true, "xxx": true, "rs": true, +} + +func Hostname(link string) (string, error) { + u, err := url.Parse(link) + if err != nil { + return "", fmt.Errorf("failed to parse URL: %w", err) + } + + // Extract domain and TLD using publicsuffix-go + domain, err := publicsuffix.Domain(u.Hostname()) + if err != nil { + return "", fmt.Errorf("failed to extract domain: %w", err) + } + + // Ensure that the extracted TLD is in our allowed list + tld := domain[strings.LastIndex(domain, ".")+1:] + if !tlds[tld] { + return "", fmt.Errorf("invalid TLD: %s", tld) + } + + return domain, nil +} diff --git a/utilities_test.go b/utilities_test.go new file mode 100644 index 0000000..c787682 --- /dev/null +++ b/utilities_test.go @@ -0,0 +1,39 @@ +package wbot + +import "testing" + +func TestHostname(t *testing.T) { + validURLs := []struct { + input string + expected string + }{ + {"http://www.google.com", "google.com"}, + {"https://sub.domain.google.com", "google.com"}, + {"http://beta.moon.facebook.com", "facebook.com"}, + // ... Add more valid test cases here + } + + invalidURLs := []string{ + "http://www.google.invalidTLD", + "https://example.com.xxy", + "ftp://example.site", // assuming "site" is not in your TLDs map + // ... Add more invalid test cases here + } + + for _, tt := range validURLs { + got, err := Hostname(tt.input) + if err != nil { + t.Errorf("Hostname(%q) returned unexpected error: %v", tt.input, err) + } + if got != tt.expected { + t.Errorf("Hostname(%q) = %q; want %q", tt.input, got, tt.expected) + } + } + + for _, url := range invalidURLs { + _, err := Hostname(url) + if err == nil { + t.Errorf("Hostname(%q) expected to return an error, but got none", url) + } + } +} diff --git a/wbot.go b/wbot.go index fc926b7..e189f2c 100644 --- a/wbot.go +++ b/wbot.go @@ -1,252 +1,200 @@ package wbot import ( + "bytes" + "context" + "crypto/sha256" + "encoding/hex" "fmt" - "runtime" + "net/url" + "regexp" "strings" - "sync" - "sync/atomic" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/weppos/publicsuffix-go/publicsuffix" ) -// default cpu core -var cores = func() int { - c := runtime.NumCPU() - if c == 1 { - return c - } - return c - 1 -}() - -// WBot -type WBot struct { - wg *sync.WaitGroup - conf *config - limit *limiter - filter *filter - fetcher Fetcher - queue Queue - store Store - log Logger - stream chan Response -} +type ( + Fetcher interface { + Fetch(ctx context.Context, req *Request) (*Response, error) + Close() error + } -// NewWBot -func NewWBot(opts ...Option) *WBot { - conf := &config{ - maxDepth: 10, - parallel: cores, - maxBodySize: 1024 * 1024 * 10, - userAgents: newRotator([]string{}), - proxies: newRotator([]string{}), + Store interface { + HasVisited(ctx context.Context, u *ParsedURL) (bool, error) + Close() error } - wbot := &WBot{ - wg: &sync.WaitGroup{}, - conf: conf, - fetcher: defaultFetcher(), - limit: newLimiter(1, 1), - filter: newFilter([]string{}, []string{}), - store: defaultStore[string](), - queue: defaultQueue[Request](), - log: nil, - stream: make(chan Response, cores), + Queue interface { + Push(ctx context.Context, req *Request) error + Pop(ctx context.Context) (*Request, error) + Len() int32 + Close() error } - // options - wbot.SetOptions(opts...) + MetricsMonitor interface { + IncTotalRequests() + IncSuccessfulRequests() + IncFailedRequests() - return wbot -} + IncTotalLink() + IncCrawledLink() + IncSkippedLink() + IncDuplicatedLink() -// Crawl -func (wb *WBot) Crawl(link string) error { - // first request - p := Param{ - Referer: link, - MaxBodySize: wb.conf.maxBodySize, - UserAgent: wb.conf.userAgents.next(), - Proxy: wb.conf.proxies.next(), + Metrics() map[string]int64 } - req, err := newRequest(link, 0, p) - if err != nil { - return err + Request struct { + Target *ParsedURL + Param *Param + Depth int32 } - // rate limit - wb.limit.take(req.URL) - - resp, err := wb.fetcher.Fetch(req) - if err != nil { - return err + Response struct { + URL *ParsedURL + Status int + Body []byte + NextURLs []*ParsedURL + Depth int32 + ElapsedTime time.Duration + Err error } - if wb.log != nil { - rep := newReport(resp, nil) - wb.log.Send(rep) + ParsedURL struct { + Hash string + Root string + URL *url.URL } - // stream 1st response - wb.stream <- resp - - // add to queue - for _, link := range resp.NextURLs { - u, err := req.AbsURL(link) - if err != nil { - continue - } + Param struct { + Proxy string + UserAgent string + Referer string + MaxBodySize int64 + Timeout time.Duration + } - // is allowed domain - if !strings.Contains(u.Hostname(), req.BaseDomain) { - continue - } + FilterRule struct { + Hostname string + Allow []*regexp.Regexp + Disallow []*regexp.Regexp + } - // add only referer & maxBodySize - // rest of params will be added - // right before fetch request - // to avoid rotating user agent and proxy. - p := Param{ - Referer: req.URL.String(), - MaxBodySize: wb.conf.maxBodySize, - } - nreq, err := newRequest(u.String(), 1, p) - if err != nil { - continue - } + RateLimit struct { + Hostname string + Rate string + } +) - if err := wb.queue.Enqueue(nreq); err != nil { - continue - } +func (r *Request) ResolveURL(u string) (*url.URL, error) { + if strings.HasPrefix(u, "#") { + return nil, fmt.Errorf("url is a fragment") } - // start crawl - wb.wg.Add(wb.conf.parallel) - for i := 0; i < wb.conf.parallel; i++ { - go wb.crawl() + absURL, err := r.Target.URL.Parse(u) + if err != nil { + return nil, err } - // wait for all workers to finish - wb.wg.Wait() - close(wb.stream) + absURL.Fragment = "" - return nil + return absURL, nil +} +func (u *ParsedURL) String() string { + var link = u.URL.String() + if len(link) > 64 { + return link[:64] + } + return link } -// crawl -func (wb *WBot) crawl() { - defer wb.wg.Done() - // - for wb.queue.Next() { - req, err := wb.queue.Dequeue() - if err != nil { - if wb.log != nil { - rep := newReport(Response{}, err) - wb.log.Send(rep) - } - continue - } +func FindLinks(body []byte) (hrefs []string) { + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) + if err != nil { + return hrefs + } - // check if max depth reached - if req.Depth > wb.conf.maxDepth { - if wb.log != nil { - rep := newReport(Response{}, fmt.Errorf("max depth reached")) - wb.log.Send(rep) - } - return + doc.Find("a[href]").Each(func(index int, item *goquery.Selection) { + if href, found := item.Attr("href"); found { + hrefs = append(hrefs, href) } - - // if already visited - if wb.store.Visited(req.URL.String()) { - if wb.log != nil { - rep := newReport(Response{}, fmt.Errorf("url recently checked")) - wb.log.Send(rep) - } - continue + }) + doc.Find("link[href]").Each(func(index int, item *goquery.Selection) { + if href, found := item.Attr("href"); found { + hrefs = append(hrefs, href) } - - // check filter - if !wb.filter.Allow(req.URL) { - if wb.log != nil { - rep := newReport(Response{}, fmt.Errorf("filtered url")) - wb.log.Send(rep) - } - continue + }) + doc.Find("img[src]").Each(func(index int, item *goquery.Selection) { + if src, found := item.Attr("src"); found { + hrefs = append(hrefs, src) } + }) + doc.Find("script[src]").Each(func(index int, item *goquery.Selection) { + if src, found := item.Attr("src"); found { + hrefs = append(hrefs, src) + } + }) + doc.Find("iframe[src]").Each(func(index int, item *goquery.Selection) { + if src, found := item.Attr("src"); found { + hrefs = append(hrefs, src) + } + }) + return hrefs +} - // rate limit - wb.limit.take(req.URL) - - req.Param.UserAgent = wb.conf.userAgents.next() - req.Param.Proxy = wb.conf.proxies.next() +func NewURL(raw string) (*ParsedURL, error) { + u, err := url.Parse(raw) + if err != nil { + return nil, err + } - // visit next url - resp, err := wb.fetcher.Fetch(req) - if err != nil { - if wb.log != nil { - rep := newReport(resp, err) - wb.log.Send(rep) - } - continue - } + if u.Scheme != "http" && u.Scheme != "https" { + return nil, fmt.Errorf("invalid scheme: %s", u.Scheme) + } - if wb.log != nil { - rep := newReport(resp, nil) - wb.log.Send(rep) - } + // Extract domain and TLD using publicsuffix-go + domain, err := publicsuffix.Domain(u.Hostname()) + if err != nil { + return nil, fmt.Errorf("failed to extract domain: %w", err) + } - // stream - wb.stream <- resp - - // current depth - depth := req.Depth - // increment depth - atomic.AddInt32(&depth, 1) - - // visit next urls - for _, link := range resp.NextURLs { - u, err := req.AbsURL(link) - if err != nil { - continue - } - - // is allowed domain - if !strings.Contains(u.Hostname(), req.BaseDomain) { - continue - } - - p := Param{ - Referer: req.URL.String(), - MaxBodySize: wb.conf.maxBodySize, - } - nreq, err := newRequest(u.String(), depth, p) - if err != nil { - continue - } - - if err := wb.queue.Enqueue(nreq); err != nil { - continue - } - } + // Ensure that the extracted TLD is in our allowed list + tld := domain[strings.LastIndex(domain, ".")+1:] + if !tlds[tld] { + return nil, fmt.Errorf("invalid TLD: %s", tld) } -} -// SetOptions -func (wb *WBot) SetOptions(opts ...Option) { - for _, opt := range opts { - opt(wb) + hash, err := hashLink(*u) + if err != nil { + return nil, fmt.Errorf("invalid hash: %s", hash) } -} -// Stream -func (wb *WBot) Stream() <-chan Response { - return wb.stream + return &ParsedURL{ + Hash: hash, + Root: domain, + URL: u, + }, nil } -// Close -func (wb *WBot) Close() { - wb.queue.Close() - wb.store.Close() - if wb.log != nil { - wb.log.Close() +func hashLink(parsedLink url.URL) (string, error) { + parsedLink.Scheme = "" + + parsedLink.Host = strings.TrimPrefix(parsedLink.Host, "www.") + + decodedPath, err := url.PathUnescape(parsedLink.Path) + if err != nil { + return "", err } + parsedLink.Path = decodedPath + + cleanedURL := strings.TrimRight(parsedLink.String(), "/") + + cleanedURL = strings.TrimPrefix(cleanedURL, "//") + + hasher := sha256.New() + hasher.Write([]byte(cleanedURL)) + + return hex.EncodeToString(hasher.Sum(nil)), nil }