-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(crawler): fixing crawl functionality (#5)
## Motive This MR fixes an issue with the crawling functionality of the bot. The issue was with the default fetcher, which used an `http.Client` that would block if a request took too long to respond. The fix is to use an `http.Client` with a timeout and a custom transport that limits the number of concurrent requests. ## Changes - removed the `OnError` method. - added a zerolog logger. - refactored metrics monitoring and added new metrics. - reverted to a simple queue. - added a timeout to the HTTP client. - updated .gitignore.
- Loading branch information
Showing
33 changed files
with
1,360 additions
and
1,001 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
cmd/tests | ||
makefile | ||
logs | ||
tmp | ||
bin/ | ||
tests/ | ||
.idea.md | ||
*.*prof | ||
.vscode/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
clean: | ||
start: | ||
init: | ||
build: | ||
usage: | ||
.PHONY: clean start init build usage |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,67 @@ | ||
## WBot | ||
# WBot | ||
|
||
A configurable, thread-safe web crawler that provides a minimal interface for crawling and downloading web pages. | ||
|
||
### Features: | ||
## Features | ||
|
||
- Clean minimal API. | ||
- Configurable: MaxDepth, MaxBodySize, Rate Limit, Parallelism, User Agent & Proxy rotation. | ||
- Memory-efficient, thread-safe. | ||
- Provides built-in interface: Fetcher, Store, Queue & a Logger. | ||
|
||
## API | ||
|
||
WBot provides a minimal API for crawling web pages. | ||
|
||
```go | ||
Run(links ...string) error | ||
OnReponse(fn func(*wbot.Response)) | ||
Metrics() map[string]int64 | ||
Shutdown() | ||
``` | ||
|
||
## Usage | ||
|
||
```go | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"log" | ||
|
||
### [Examples & API](https://github.com/twiny/wbot/wiki) | ||
"github.com/rs/zerolog" | ||
"github.com/twiny/wbot" | ||
"github.com/twiny/wbot/crawler" | ||
) | ||
|
||
### TODO | ||
- [ ] Add support for robots.txt. | ||
- [ ] Add test cases. | ||
- [ ] Implement `Fetch` using Chromedp. | ||
- [ ] Add more examples. | ||
- [ ] Add documentation. | ||
func main() { | ||
bot := crawler.New( | ||
crawler.WithParallel(50), | ||
crawler.WithMaxDepth(5), | ||
crawler.WithRateLimit(&wbot.RateLimit{ | ||
Hostname: "*", | ||
Rate: "10/1s", | ||
}), | ||
crawler.WithLogLevel(zerolog.DebugLevel), | ||
) | ||
defer bot.Shutdown() | ||
|
||
// read responses | ||
bot.OnReponse(func(resp *wbot.Response) { | ||
fmt.Printf("crawled: %s\n", resp.URL.String()) | ||
}) | ||
|
||
if err := bot.Run( | ||
"https://crawler-test.com/", | ||
); err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
log.Printf("finished crawling\n") | ||
} | ||
|
||
``` | ||
|
||
### Bugs | ||
|
||
Bugs or suggestions? Please visit the [issue tracker](https://github.com/twiny/wbot/issues). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package crawler | ||
|
||
import ( | ||
"runtime" | ||
"time" | ||
|
||
"github.com/twiny/poxa" | ||
) | ||
|
||
const ( | ||
defaultReferrer = "https://www.google.com/search" | ||
defaultUserAgent = "WBot/v0.2.0 (+https://github.com/twiny/wbot)" | ||
defaultTimeout = 10 * time.Second | ||
defaultMaxBodySize = int64(1024 * 1024 * 5) // 5MB | ||
) | ||
|
||
type ( | ||
config struct { | ||
parallel int | ||
maxDepth int32 | ||
maxBodySize int64 | ||
timeout time.Duration | ||
userAgents poxa.Spinner[string] | ||
referrers poxa.Spinner[string] | ||
proxies poxa.Spinner[string] | ||
} | ||
) | ||
|
||
func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config { | ||
if maxDepth <= 0 { | ||
maxDepth = 10 | ||
} | ||
|
||
var conf = &config{ | ||
parallel: runtime.NumCPU(), | ||
maxDepth: maxDepth, | ||
maxBodySize: defaultMaxBodySize, | ||
timeout: defaultTimeout, | ||
userAgents: poxa.NewSpinner(defaultUserAgent), | ||
referrers: poxa.NewSpinner(defaultReferrer), | ||
proxies: nil, | ||
} | ||
|
||
if len(userAgents) > 0 { | ||
uaList := poxa.NewSpinner(userAgents...) | ||
if uaList != nil { | ||
conf.userAgents = uaList | ||
} | ||
} | ||
|
||
if len(referrers) > 0 { | ||
refList := poxa.NewSpinner(referrers...) | ||
if refList != nil { | ||
conf.referrers = refList | ||
} | ||
} | ||
|
||
if len(proxies) > 0 { | ||
proxyList := poxa.NewSpinner(proxies...) | ||
if proxyList != nil { | ||
conf.proxies = proxyList | ||
} | ||
} | ||
|
||
return conf | ||
} |
Oops, something went wrong.