Skip to content

Commit

Permalink
crawler: restore support for '--override-host'
Browse files Browse the repository at this point in the history
When '--override-host <host>' is provided, the host in links of the
sitemap will be replaced by the one passed. This only affects links
at the first level of the sitemap.

Fixes #18
  • Loading branch information
Pixep committed Nov 11, 2024
1 parent 2870381 commit 48d7d45
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cmd/crowlet/crowlet.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func main() {

func addInterruptHandlers() chan struct{} {
stop := make(chan struct{})
osSignal := make(chan os.Signal)
osSignal := make(chan os.Signal, 1)
signal.Notify(osSignal, os.Interrupt, syscall.SIGTERM)
signal.Notify(osSignal, os.Interrupt, syscall.SIGINT)

Expand Down
3 changes: 3 additions & 0 deletions pkg/crawler/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats
log.Warn("Invalid throttle value, defaulting to 1.")
config.Throttle = 1
}
if config.Host != "" {
urls = RewriteURLHost(urls, config.Host)
}

config.HTTP.ParseLinks = config.Links.CrawlExternalLinks || config.Links.CrawlHyperlinks ||
config.Links.CrawlImages
Expand Down
15 changes: 15 additions & 0 deletions pkg/crawler/links.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ type Link struct {
IsExternal bool
}

// RewriteURLHost returns a new slice holding each entry of urls with its
// host component replaced by newHost. Every other part of the URL is kept
// as produced by re-serializing the parsed URL.
//
// Entries that cannot be parsed are logged and omitted, so the returned
// slice may be shorter than urls. The input slice is not modified.
func RewriteURLHost(urls []string, newHost string) []string {
	result := make([]string, 0, len(urls))
	for i := range urls {
		// Named "parsed" rather than "url" so the net/url package stays
		// reachable inside the loop body.
		parsed, parseErr := url.Parse(urls[i])
		if parseErr != nil {
			log.Error("error parsing URL:", parseErr)
			continue
		}
		parsed.Host = newHost
		result = append(result, parsed.String())
	}
	return result
}

// ExtractLinks returns links found in the html page provided and currentURL.
// The URL is used to differentiate between internal and external links
func ExtractLinks(htmlBody io.ReadCloser, currentURL url.URL) ([]Link, error) {
Expand Down

0 comments on commit 48d7d45

Please sign in to comment.