From b484a7555e57652e92939057c4b6c3ddfbc85378 Mon Sep 17 00:00:00 2001 From: jakopako Date: Mon, 1 Aug 2022 15:00:56 +0200 Subject: [PATCH] Improve auto config extraction Fixes #115 --- config-gen.yml | 85 ++-------------------- {automate => generate}/config.go | 117 +++++++++++++++++++++++-------- main.go | 2 +- 3 files changed, 93 insertions(+), 111 deletions(-) rename {automate => generate}/config.go (66%) diff --git a/config-gen.yml b/config-gen.yml index 807b798..c8477ea 100644 --- a/config-gen.yml +++ b/config-gen.yml @@ -1,7 +1,7 @@ scrapers: - name: "" - url: https://www.30cc.be/nl/programma?seizoen=2022-2023&discipline=3 - item: body.page-programma > div.page-wrap > div.page-main > div.section.section--content.js-filter > div.container > div.section__content > div.grid > div.grid__12.grid--bp-med__8 > div.block.js-load-more > div.block > div.grid.grid--flex.js-load-more-results > div.grid__12.is-view-list > div.excerpt.excerpt--event.excerpt--link.excerpt--clickable + url: https://www.goodreads.com/quotes/tag/life + item: body > div.content > div.mainContentContainer > div.mainContent > div.mainContentFloat > div.leftContainer > div.quote.mediumText > div.quoteDetails exclude_with_selector: [] fields: static: [] @@ -9,85 +9,8 @@ scrapers: - name: field-0 type: text location: - selector: div.excerpt__main > div.excerpt__header > h3.excerpt__title > a - node_index: 0 - child_index: 0 - regex_extract: - exp: "" - index: 0 - attr: "" - max_length: 0 - entire_subtree: false - on_subpage: "" - can_be_empty: false - components: [] - date_location: "" - date_language: "" - hide: false + selector: div.quoteText > span.authorOrTitle - name: field-1 type: text location: - selector: div.excerpt__main > div.excerpt__header > div.excerpt__intro - node_index: 0 - child_index: 0 - regex_extract: - exp: "" - index: 0 - attr: "" - max_length: 0 - entire_subtree: false - on_subpage: "" - can_be_empty: false - components: [] - date_location: "" - date_language: "" - hide: false - - name: field-2 - type: text - location: - selector: div.excerpt__main > div.excerpt__ct.js-show-more > div.excerpt--eventperformance.js-show-more-child > span.excerpt__meta > span.excerpt__time - node_index: 0 - child_index: 0 - regex_extract: - exp: "" - index: 0 - attr: "" - max_length: 0 - entire_subtree: false - on_subpage: "" - can_be_empty: false - components: [] - date_location: "" - date_language: "" - hide: false - - name: field-3 - type: text - location: - selector: div.excerpt__main > div.excerpt__ct.js-show-more > div.excerpt--eventperformance.js-show-more-child > span.excerpt__meta - node_index: 0 - child_index: 0 - regex_extract: - exp: "" - index: 0 - attr: "" - max_length: 0 - entire_subtree: false - on_subpage: "" - can_be_empty: false - components: [] - date_location: "" - date_language: "" - hide: false - filters: [] - paginator: - location: - selector: "" - node_index: 0 - child_index: 0 - regex_extract: - exp: "" - index: 0 - attr: "" - max_length: 0 - entire_subtree: false - max_pages: 0 + selector: div.quoteText diff --git a/automate/config.go b/generate/config.go similarity index 66% rename from automate/config.go rename to generate/config.go index d6355f1..1825d1d 100644 --- a/automate/config.go +++ b/generate/config.go @@ -15,32 +15,97 @@ import ( ) type locationProps struct { + loc scraper.ElementLocation count int examples []string } -type locationManager map[scraper.ElementLocation]*locationProps +type locationManager []*locationProps -func (l *locationManager) update(e scraper.ElementLocation, s string) { +func update(l locationManager, e scraper.ElementLocation, s string) locationManager { // updates count and examples or adds new element to the locationManager // old implementation - if p, found := (*l)[e]; found { - p.count += 1 - if p.count <= 4 { - p.examples = append(p.examples, s) + // if p, found := (*l)[e]; found { + // p.count += 1 + // if p.count <= 4 { + // p.examples = append(p.examples, s) + // } + // } else { + // (*l)[e] = &locationProps{count: 1, examples: []string{s}} + // } + + // new implementation + for _, lp := range l { + if checkAndUpdatePath(&lp.loc, &e) { + lp.count++ + if lp.count <= 4 { + lp.examples = append(lp.examples, s) + } + return l } - } else { - (*l)[e] = &locationProps{count: 1, examples: []string{s}} } - // TODO: new implementation + return append(l, &locationProps{loc: e, count: 1, examples: []string{s}}) } -func (l *locationManager) filter(minCount int) { - for e, p := range *l { - if p.count < minCount { - delete(*l, e) +func checkAndUpdatePath(a, b *scraper.ElementLocation) bool { + // returns true if the paths overlap and the rest of the + // element location is identical. If true is returned + // the Selector of a will be updated if necessary. + if a.NodeIndex == b.NodeIndex && a.ChildIndex == b.ChildIndex { + if a.Selector == b.Selector { + return true + } else { + ap := selectorToPath(a.Selector) + bp := selectorToPath(b.Selector) + np := []string{} + if len(ap) != len(bp) { + return false + } + for i, an := range ap { + ae, be := strings.Split(an, "."), strings.Split(bp[i], ".") + at, bt := ae[0], be[0] + if at == bt { + if len(ae) == 1 && len(be) == 1 { + np = append(np, an) + continue + } + ac, bc := ae[1:], be[1:] + cc := []string{} + for j := 0; j < len(ac); j++ { + for k := 0; k < len(bc); k++ { + if ac[j] == bc[k] { + cc = append(cc, ac[j]) + } + } + } + if len(cc) > 0 { + nnl := append([]string{at}, cc...) + nn := strings.Join(nnl, ".") + np = append(np, nn) + continue + } + + } + return false + + } + // if we get until here there is an overlapping path + a.Selector = pathToSelector(np) + return true } } + return false +} + +func filter(l locationManager, minCount int) locationManager { + i := 0 + for _, p := range l { + if p.count >= minCount { + l[i] = p + i++ + } + } + return l[:i] } func pathToSelector(pathSlice []string) string { @@ -95,7 +160,7 @@ func GetDynamicFieldsConfig(s *scraper.Scraper, minOcc int) error { return fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status) } z := html.NewTokenizer(res.Body) - locOcc := locationManager{} + locMan := locationManager{} nrChildren := map[string]int{} nodePath := []string{} depth := 0 @@ -121,7 +186,7 @@ parse: // we have seen the exact location we need to check whether there is a location // where for each node in the path that there is at least on overlapping class // (if at least one of the two nodes has a class) - locOcc.update(l, strings.TrimSpace(text)) + locMan = update(locMan, l, strings.TrimSpace(text)) } nrChildren[p] += 1 } @@ -179,24 +244,18 @@ parse: } } - locOcc.filter(minOcc) + locMan = filter(locMan, minOcc) - if len(locOcc) > 0 { - f := make([]scraper.ElementLocation, len(locOcc)) - i := 0 - for k := range locOcc { - f[i] = k - i++ - } - sort.Slice(f, func(p, q int) bool { - return f[p].Selector > f[q].Selector + if len(locMan) > 0 { + sort.Slice(locMan, func(p, q int) bool { + return locMan[p].loc.Selector > locMan[q].loc.Selector }) colorReset := "\033[0m" colorGreen := "\033[32m" colorBlue := "\033[34m" - for i, e := range f { - fmt.Printf("%sfield [%d]%s\n %slocation:%s %+v\n %sexamples:%s\n\t%s\n\n", colorGreen, i, colorReset, colorBlue, colorReset, e, colorBlue, colorReset, strings.Join(locOcc[e].examples, "\n\t")) + for i, e := range locMan { + fmt.Printf("%sfield [%d]%s\n %slocation:%s %+v\n %sexamples:%s\n\t%s\n\n", colorGreen, i, colorReset, colorBlue, colorReset, e.loc, colorBlue, colorReset, strings.Join(e.examples, "\n\t")) } reader := bufio.NewReader(os.Stdin) @@ -212,10 +271,10 @@ parse: } var fs []scraper.ElementLocation for _, n := range ns { - if n >= len(f) { + if n >= len(locMan) { return fmt.Errorf("please enter valid numbers") } - fs = append(fs, f[n]) + fs = append(fs, locMan[n].loc) } elementsToConfig(s, fs...) diff --git a/main.go b/main.go index c6951aa..2cee009 100644 --- a/main.go +++ b/main.go @@ -6,7 +6,7 @@ import ( "log" "sync" - "github.com/jakopako/goskyr/automate" + automate "github.com/jakopako/goskyr/generate" "github.com/jakopako/goskyr/output" "github.com/jakopako/goskyr/scraper" "gopkg.in/yaml.v3"