diff --git a/cmd/crowlet/crowlet.go b/cmd/crowlet/crowlet.go
index 4ac89c4..a3d53c2 100644
--- a/cmd/crowlet/crowlet.go
+++ b/cmd/crowlet/crowlet.go
@@ -170,7 +170,7 @@ func main() {
 
 func addInterruptHandlers() chan struct{} {
 	stop := make(chan struct{})
-	osSignal := make(chan os.Signal)
+	osSignal := make(chan os.Signal, 1)
 	signal.Notify(osSignal, os.Interrupt, syscall.SIGTERM)
 	signal.Notify(osSignal, os.Interrupt, syscall.SIGINT)
 
diff --git a/pkg/crawler/crawl.go b/pkg/crawler/crawl.go
index a0e28e0..e63509b 100644
--- a/pkg/crawler/crawl.go
+++ b/pkg/crawler/crawl.go
@@ -122,6 +122,9 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats
 		log.Warn("Invalid throttle value, defaulting to 1.")
 		config.Throttle = 1
 	}
+	if config.Host != "" {
+		urls = RewriteURLHost(urls, config.Host)
+	}
 
 	config.HTTP.ParseLinks = config.Links.CrawlExternalLinks ||
 		config.Links.CrawlHyperlinks || config.Links.CrawlImages
diff --git a/pkg/crawler/links.go b/pkg/crawler/links.go
index a0e5438..f67e7f6 100644
--- a/pkg/crawler/links.go
+++ b/pkg/crawler/links.go
@@ -27,6 +27,21 @@ type Link struct {
 	IsExternal bool
 }
 
+// RewriteURLHost modifies a list of raw URL strings to point to a new host.
+func RewriteURLHost(urls []string, newHost string) []string {
+	rewrittenURLs := make([]string, 0, len(urls))
+	for _, rawURL := range urls {
+		url, err := url.Parse(rawURL)
+		if err != nil {
+			log.Error("error parsing URL:", err)
+			continue
+		}
+		url.Host = newHost
+		rewrittenURLs = append(rewrittenURLs, url.String())
+	}
+	return rewrittenURLs
+}
+
 // ExtractLinks returns links found in the html page provided and currentURL.
 // The URL is used to differentiate between internal and external links
 func ExtractLinks(htmlBody io.ReadCloser, currentURL url.URL) ([]Link, error) {
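
For reference, a minimal standalone sketch of what the new host-rewrite step does to the crawl URL list when CrawlConfig.Host is set. It duplicates the RewriteURLHost logic using only net/url so it runs on its own; the helper in pkg/crawler/links.go additionally logs parse failures, and the lower-case names here are illustrative, not the package's exports.

package main

import (
	"fmt"
	"net/url"
)

// rewriteURLHost mirrors the new RewriteURLHost helper: parse each raw URL,
// swap in the target host, and drop entries that fail to parse.
func rewriteURLHost(urls []string, newHost string) []string {
	rewritten := make([]string, 0, len(urls))
	for _, rawURL := range urls {
		u, err := url.Parse(rawURL)
		if err != nil {
			continue // the package helper logs this error instead
		}
		u.Host = newHost
		rewritten = append(rewritten, u.String())
	}
	return rewritten
}

func main() {
	pages := []string{
		"https://example.com/",
		"https://example.com/blog/post?id=42",
	}
	// The crawl is pointed at the new host while each URL's scheme,
	// path, and query string are kept intact.
	fmt.Println(rewriteURLHost(pages, "staging.example.com:8080"))
	// [https://staging.example.com:8080/ https://staging.example.com:8080/blog/post?id=42]
}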