diff --git a/config/config.go b/config/config.go
new file mode 100644
index 0000000..bdfd6ce
--- /dev/null
+++ b/config/config.go
@@ -0,0 +1,5 @@
+package config
+
+var (
+	Debug bool
+)
diff --git a/fetch/fetcher.go b/fetch/fetcher.go
index 0127bd9..b6290e0 100644
--- a/fetch/fetcher.go
+++ b/fetch/fetcher.go
@@ -6,16 +6,20 @@ import (
 	"io"
 	"log/slog"
 	"net/http"
+	"net/url"
+	"os"
 	"time"
 
 	"github.com/chromedp/cdproto/cdp"
 	"github.com/chromedp/cdproto/dom"
 	"github.com/chromedp/chromedp"
+	"github.com/jakopako/goskyr/config"
 	"github.com/jakopako/goskyr/types"
+	"github.com/jakopako/goskyr/utils"
 )
 
 type FetchOpts struct {
-	Interaction types.Interaction
+	Interaction []*types.Interaction
 }
 
 // A Fetcher allows to fetch the content of a web page
@@ -90,8 +94,8 @@ func (d *DynamicFetcher) Cancel() {
 	d.cancelAlloc()
 }
 
-func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
-	logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", url))
+func (d *DynamicFetcher) Fetch(urlStr string, opts FetchOpts) (string, error) {
+	logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", urlStr))
 	logger.Debug("fetching page", slog.String("user-agent", d.UserAgent))
 	// start := time.Now()
 	ctx, cancel := chromedp.NewContext(d.allocContext)
@@ -104,36 +108,37 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
 	var body string
 	sleepTime := time.Duration(d.WaitMilliseconds) * time.Millisecond
 	actions := []chromedp.Action{
-		chromedp.Navigate(url),
+		chromedp.Navigate(urlStr),
 		chromedp.Sleep(sleepTime),
 	}
 	logger.Debug(fmt.Sprintf("appended chrome actions: Navigate, Sleep(%v)", sleepTime))
-	delay := 500 * time.Millisecond // default is .5 seconds
-	if opts.Interaction.Delay > 0 {
-		delay = time.Duration(opts.Interaction.Delay) * time.Millisecond
-	}
-	if opts.Interaction.Type == types.InteractionTypeClick {
-		count := 1 // default is 1
-		if opts.Interaction.Count > 0 {
-			count = opts.Interaction.Count
+	for j, ia := range opts.Interaction {
+		logger.Debug(fmt.Sprintf("processing interaction nr %d, type %s", j, ia.Type))
+		delay := 500 * time.Millisecond // default is .5 seconds
+		if ia.Delay > 0 {
+			delay = time.Duration(ia.Delay) * time.Millisecond
 		}
-		for i := 0; i < count; i++ {
-			// we only click the button if it exists. Do we really need this check here?
-			// TODO: should we click as many times as possible if count == 0? How would we implement this?
-			// actions = append(actions, chromedp.Click(d.Interaction.Selector, chromedp.ByQuery))
-			actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
-				var nodes []*cdp.Node
-				if err := chromedp.Nodes(opts.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
-					return err
-				}
-				if len(nodes) == 0 {
-					return nil
-				} // nothing to do
-				logger.Debug(fmt.Sprintf("clicking on node with selector: %s", opts.Interaction.Selector))
-				return chromedp.MouseClickNode(nodes[0]).Do(ctx)
-			}))
-			actions = append(actions, chromedp.Sleep(delay))
-			logger.Debug(fmt.Sprintf("appended chrome actions: ActionFunc, Sleep(%v)", delay))
+		if ia.Type == types.InteractionTypeClick {
+			count := 1 // default is 1
+			if ia.Count > 0 {
+				count = ia.Count
+			}
+			for i := 0; i < count; i++ {
+				// we only click the button if it exists. Do we really need this check here?
+				actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
+					var nodes []*cdp.Node
+					if err := chromedp.Nodes(ia.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
+						return err
+					}
+					if len(nodes) == 0 {
+						return nil
+					} // nothing to do
+					logger.Debug(fmt.Sprintf("clicking on node with selector: %s", ia.Selector))
+					return chromedp.MouseClickNode(nodes[0]).Do(ctx)
+				}))
+				actions = append(actions, chromedp.Sleep(delay))
+				logger.Debug(fmt.Sprintf("appended chrome actions: ActionFunc (mouse click), Sleep(%v)", delay))
+			}
 		}
 	}
 	actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
@@ -145,6 +150,23 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
 		return err
 	}))
 
+	if config.Debug {
+		u, _ := url.Parse(urlStr)
+		var buf []byte
+		r, err := utils.RandomString(u.Host)
+		if err != nil {
+			return "", err
+		}
+		filename := fmt.Sprintf("%s.png", r)
+		actions = append(actions, chromedp.CaptureScreenshot(&buf))
+		actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
+			// log.Printf("Write %v", fileName)
+			logger.Debug(fmt.Sprintf("writing screenshot to file %s", filename))
+			return os.WriteFile(filename, buf, 0644)
+		}))
+		logger.Debug("appended chrome actions: CaptureScreenshot, ActionFunc (save screenshot)")
+	}
+
 	// run task list
 	err := chromedp.Run(ctx,
 		actions...,
diff --git a/main.go b/main.go
index b75236b..037e344 100644
--- a/main.go
+++ b/main.go
@@ -10,6 +10,7 @@ import (
 	"sync"
 
 	"github.com/jakopako/goskyr/autoconfig"
+	"github.com/jakopako/goskyr/config"
 	"github.com/jakopako/goskyr/ml"
 	"github.com/jakopako/goskyr/output"
 	"github.com/jakopako/goskyr/scraper"
@@ -65,6 +66,7 @@ func main() {
 		return
 	}
 
+	config.Debug = *debugFlag
 	var logLevel slog.Level
 	if *debugFlag {
 		logLevel = slog.LevelDebug
@@ -170,7 +172,7 @@ func main() {
 	go func() {
 		for _, s := range config.Scrapers {
 			if *singleScraper == "" || *singleScraper == s.Name {
-				s.Debug = *debugFlag
+				// s.Debug = *debugFlag
 				sc <- s
 			}
 		}
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 6cd8b9c..2092f7b 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -2,7 +2,6 @@ package scraper
 
 import (
 	"bytes"
-	"crypto/rand"
 	"errors"
 	"fmt"
 	"io/fs"
@@ -19,6 +18,7 @@ import (
 	"github.com/antchfx/jsonquery"
 	"github.com/goodsign/monday"
 	"github.com/ilyakaznacheev/cleanenv"
+	"github.com/jakopako/goskyr/config"
 	"github.com/jakopako/goskyr/date"
 	"github.com/jakopako/goskyr/fetch"
 	"github.com/jakopako/goskyr/output"
@@ -236,17 +236,16 @@ type Paginator struct {
 // A Scraper contains all the necessary config parameters and structs needed
 // to extract the desired information from a website
 type Scraper struct {
-	Name         string            `yaml:"name"`
-	URL          string            `yaml:"url"`
-	Item         string            `yaml:"item"`
-	Fields       []Field           `yaml:"fields,omitempty"`
-	Filters      []*Filter         `yaml:"filters,omitempty"`
-	Paginator    Paginator         `yaml:"paginator,omitempty"`
-	RenderJs     bool              `yaml:"render_js,omitempty"`
-	PageLoadWait int               `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
-	Interaction  types.Interaction `yaml:"interaction,omitempty"`
+	Name         string               `yaml:"name"`
+	URL          string               `yaml:"url"`
+	Item         string               `yaml:"item"`
+	Fields       []Field              `yaml:"fields,omitempty"`
+	Filters      []*Filter            `yaml:"filters,omitempty"`
+	Paginator    Paginator            `yaml:"paginator,omitempty"`
+	RenderJs     bool                 `yaml:"render_js,omitempty"`
+	PageLoadWait int                  `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
+	Interaction  []*types.Interaction `yaml:"interaction,omitempty"`
 	fetcher      fetch.Fetcher
-	Debug        bool              `yaml:"debug,omitempty"`
 }
 
 // GetItems fetches and returns all items from a website according to the
@@ -280,7 +279,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
 	currentPage := 0
 	var doc *goquery.Document
 
-	hasNextPage, pageURL, doc, err := c.fetchPage(nil, currentPage, c.URL, globalConfig.UserAgent, &c.Interaction)
+	hasNextPage, pageURL, doc, err := c.fetchPage(nil, currentPage, c.URL, globalConfig.UserAgent, c.Interaction)
 	if err != nil {
 		return items, err
 	}
@@ -477,10 +476,10 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int
 	return item
 }
 
-func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i *types.Interaction) (bool, string, *goquery.Document, error) {
+func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i []*types.Interaction) (bool, string, *goquery.Document, error) {
 	if nextPageI == 0 {
-		newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: *i})
+		newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: i})
 		if err != nil {
 			return false, "", nil, err
 		}
@@ -492,10 +491,12 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
 	pagSelector := doc.Find(c.Paginator.Location.Selector)
 	if len(pagSelector.Nodes) > 0 {
 		if nextPageI < c.Paginator.MaxPages || c.Paginator.MaxPages == 0 {
-			ia := types.Interaction{
-				Selector: c.Paginator.Location.Selector,
-				Type:     types.InteractionTypeClick,
-				Count:    nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
+			ia := []*types.Interaction{
+				{
+					Selector: c.Paginator.Location.Selector,
+					Type:     types.InteractionTypeClick,
+					Count:    nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
+				},
 			}
 			nextPageDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: ia})
 			if err != nil {
@@ -525,8 +526,8 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
 	}
 }
 
-func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Document, error) {
-	res, err := c.fetcher.Fetch(url, opts)
+func (c *Scraper) fetchToDoc(urlStr string, opts fetch.FetchOpts) (*goquery.Document, error) {
+	res, err := c.fetcher.Fetch(urlStr, opts)
 	if err != nil {
 		return nil, err
 	}
@@ -537,14 +538,14 @@ func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Documen
 	}
 
 	// in debug mode we want to write all the html's to files
-	if c.Debug {
-		bs := make([]byte, 8)
-		_, err := rand.Read(bs)
+	if config.Debug {
+		u, _ := url.Parse(urlStr)
+		r, err := utils.RandomString(u.Host)
 		if err != nil {
-			return nil, fmt.Errorf("failed to generate random bytes for html file name")
+			return nil, err
 		}
-		filename := fmt.Sprintf("%s-%x.html", c.Name, bs[:8])
-		slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", url))
+		filename := fmt.Sprintf("%s.html", r)
+		slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", urlStr))
 		htmlStr, err := goquery.OuterHtml(doc.Children())
 		if err != nil {
 			return nil, fmt.Errorf("failed to write html file: %v", err)
diff --git a/utils/utils.go b/utils/utils.go
index c93e8b0..62a9726 100644
--- a/utils/utils.go
+++ b/utils/utils.go
@@ -1,6 +1,7 @@
 package utils
 
 import (
+	"crypto/rand"
 	"fmt"
 	"math"
 	"sort"
@@ -142,3 +143,12 @@ func ReverseSlice[T any](s []T) {
 		s[i], s[j] = s[j], s[i]
 	}
 }
+
+func RandomString(base string) (string, error) {
+	bs := make([]byte, 8)
+	_, err := rand.Read(bs)
+	if err != nil {
+		return "", fmt.Errorf("failed to generate random bytes: %v", err)
+	}
+	return fmt.Sprintf("%s-%x", base, bs[:8]), nil
+}
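Note on the FetchOpts change: Interaction is now a slice, so several interactions can run in order before the page body is extracted (e.g. dismiss a cookie banner, then click a "load more" button repeatedly); in a YAML scraper config this corresponds to a list under "interaction:" instead of a single mapping. A minimal Go sketch of the new shape follows. The selectors and values are illustrative; only the field names (Type, Selector, Count, Delay) and types.InteractionTypeClick are taken from the patch above.

package main

import (
	"fmt"

	"github.com/jakopako/goskyr/fetch"
	"github.com/jakopako/goskyr/types"
)

func main() {
	// Two sequential interactions. The first sets no Count/Delay, so the
	// dynamic fetcher falls back to its defaults (count = 1, 500ms delay),
	// as handled in DynamicFetcher.Fetch above.
	opts := fetch.FetchOpts{
		Interaction: []*types.Interaction{
			{Type: types.InteractionTypeClick, Selector: "#cookie-accept"},
			{Type: types.InteractionTypeClick, Selector: ".load-more", Count: 3, Delay: 1000},
		},
	}
	for j, ia := range opts.Interaction {
		fmt.Printf("interaction nr %d: type %s, selector %s\n", j, ia.Type, ia.Selector)
	}
}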
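Note on the debug artifacts: with config.Debug set, the HTML dump in scraper.go and the screenshot in fetch/fetcher.go both derive their filenames from utils.RandomString, keyed by the host of the fetched URL, replacing the old scraper-name-plus-random-bytes scheme. A small sketch of the naming; the URL is illustrative, and in the real flow each artifact gets its own random suffix.

package main

import (
	"fmt"
	"net/url"

	"github.com/jakopako/goskyr/utils"
)

func main() {
	u, _ := url.Parse("https://example.com/events") // illustrative URL
	r, err := utils.RandomString(u.Host)
	if err != nil {
		panic(err)
	}
	// r looks like "example.com-1a2b3c4d5e6f7081": the host plus 8 random
	// bytes in hex. fetchToDoc appends ".html", the dynamic fetcher ".png".
	fmt.Println(r + ".html")
	fmt.Println(r + ".png")
}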