Skip to content

Commit

Permalink
improve url extraction
Browse files Browse the repository at this point in the history
Fixes #98
  • Loading branch information
jakopako committed May 13, 2022
1 parent c23ce1d commit d856a8b
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 54 deletions.
106 changes: 99 additions & 7 deletions concerts-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@ scrapers:
regex: ".*Postponed.*"
match: false
paginator:
selector: ".pagination .qt-btn-primary"
node_index: -1
location:
selector: ".pagination .qt-btn-primary"
node_index: -1
max_pages: 4

##########
Expand Down Expand Up @@ -162,8 +163,8 @@ scrapers:
date_location: "Europe/Berlin"
date_language: "en_US"
paginator:
selector: ".pager__item a"
relative: true
location:
selector: ".pager__item a"
filters:
- field: "title"
regex: ".*POSTPONED.*"
Expand Down Expand Up @@ -200,7 +201,6 @@ scrapers:
type: "url"
location:
selector: ".calendar-item__inner a"
relative: true
- name: "date"
type: "date"
on_subpage: "url"
Expand All @@ -216,8 +216,8 @@ scrapers:
date_location: "Europe/Berlin"
date_language: "nl_BE"
paginator:
selector: ".pager__item--next a"
relative: true
location:
selector: ".pager__item--next a"

##########
# Munich
Expand Down Expand Up @@ -341,3 +341,95 @@ scrapers:
- field: "location"
regex: "TonHalle" # duplicate
match: false

#########
# Zurich
#########
# Scraper entry for the venue "Sender" (Zurich). Events are taken from the
# nodes matching `item` on the page at `url`.
- name: Sender
url: "https://gds.fm/SENDER"
item: ".event-list__item"
fields:
# Static fields carry the same fixed value for every scraped event.
static:
- name: "location"
value: "Sender"
- name: "city"
value: "Zurich"
- name: "type"
value: "concert"
- name: "sourceUrl"
value: "https://gds.fm/SENDER"
# Dynamic fields are extracted per event via CSS selectors.
dynamic:
- name: "title"
location:
selector: ".event-preview__title"
# The comment text is looked up on a subpage; presumably the page that the
# "url" field below points to (on_subpage: "url") - confirm against the scraper.
- name: "comment"
location:
selector: ".event-detail__content div p"
max_length: 200
on_subpage: "url"
can_be_empty: true
- name: "url"
type: "url"
location:
selector: "" # Empty selector: the href is read from the event node itself
# A single component supplies day, month, year and time at once.
- name: "date"
type: "date"
components:
- covers:
day: true
month: true
year: true
time: true
location:
selector: ".event-preview__date-long"
# Go time layout (reference time: Mon Jan 2 15:04:05 2006), 12-hour clock with AM/PM.
layout: ["January 02, 2006, 03:04 PM"]
# NOTE(review): the neighbouring entries use "Europe/Berlin"; confirm the site
# really lists its times in GMT.
date_location: "GMT"
date_language: "en_US"

# Scraper entry for the venue "ElLokal" (Zurich). Events are taken from the
# nodes matching `item` on the page at `url`.
- name: ElLokal
url: "http://www.ellokal.ch/?lang=de&details=9"
# NOTE(review): in a CSS selector list the comma separates complete selectors,
# so `.commingupEventsList_1` is not restricted to children of `#maincontent`
# - confirm this is intended.
item: "#maincontent > .commingupEventsList_0,.commingupEventsList_1"
fields:
# Static fields carry the same fixed value for every scraped event.
static:
- name: "location"
value: "ElLokal"
- name: "city"
value: "Zurich"
- name: "type"
value: "concert"
- name: "sourceUrl"
value: "http://www.ellokal.ch/?lang=de&details=9"
# Dynamic fields are extracted per event via CSS selectors.
dynamic:
- name: "title"
location:
selector: ".commingupEventsList_block5 a"
# The comment text is looked up on a subpage; presumably the page that the
# "url" field below points to (on_subpage: "url") - confirm against the scraper.
- name: "comment"
location:
selector: ".concertStyleNew .concertDetails"
on_subpage: "url"
can_be_empty: true
- name: "url"
type: "url"
location:
selector: ".commingupEventsList_block5 a"
# NOTE(review): getURLString resolves relative hrefs on its own; confirm that
# `relative` is still a recognized key here and not a leftover.
relative: true
# The date is assembled from three separate nodes: day, month and time.
# NOTE(review): no component covers the year - presumably the scraper fills it
# in itself; confirm.
- name: "date"
type: "date"
components:
- covers:
day: true
location:
selector: ".commingupEventsList_block2"
layout: ["02. "]
- covers:
month: true
location:
selector: ".commingupEventsList_block3"
layout: ["January"]
- covers:
time: true
location:
selector: ".commingupEventsList_block4"
# Go time layout for a 24-hour time written like "20Uhr30".
layout: ["15Uhr04"]
date_location: "Europe/Berlin"
date_language: "de_DE"
74 changes: 27 additions & 47 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,8 @@ type Scraper struct {
} `yaml:"fields"`
Filters []Filter `yaml:"filters"`
Paginator struct {
// TODO: use getUrl method and remove relative bool
Selector string `yaml:"selector"`
Relative bool `yaml:"relative"`
MaxPages int `yaml:"max_pages"`
NodeIndex int `yaml:"node_index"`
Location ElementLocation `yaml:"location"`
MaxPages int `yaml:"max_pages"`
}
}

Expand Down Expand Up @@ -207,31 +204,11 @@ func (c Scraper) GetItems() ([]map[string]interface{}, error) {
})

hasNextPage = false
if c.Paginator.Selector != "" {
pageURL = getURLString(&c.Paginator.Location, doc.Selection, res)
if pageURL != "" {
currentPage++
if currentPage < c.Paginator.MaxPages || c.Paginator.MaxPages == 0 {
attr := "href"
if len(doc.Find(c.Paginator.Selector).Nodes) > c.Paginator.NodeIndex {
pagNode := doc.Find(c.Paginator.Selector).Get(c.Paginator.NodeIndex)
for _, a := range pagNode.Attr {
if a.Key == attr {
nextURL := a.Val
if c.Paginator.Relative {
baseURL := fmt.Sprintf("%s://%s", res.Request.URL.Scheme, res.Request.URL.Host)
if strings.HasPrefix(nextURL, "?") {
pageURL = baseURL + res.Request.URL.Path + nextURL
} else if !strings.HasPrefix(nextURL, "/") {
pageURL = baseURL + "/" + nextURL
} else {
pageURL = baseURL + nextURL
}
} else {
pageURL = nextURL
}
hasNextPage = true
}
}
}
hasNextPage = true
}
}
res.Body.Close()
Expand Down Expand Up @@ -294,7 +271,7 @@ func extractField(field *DynamicField, event map[string]interface{}, s *goquery.
}
event[field.Name] = ts
case "url":
url := getURLString(field, s, res)
url := getURLString(&field.ElementLocation, s, res)
if url == "" {
url = baseURL
}
Expand Down Expand Up @@ -425,23 +402,30 @@ func hasAllDateParts(cdp CoveredDateParts) bool {
return cdp.Day && cdp.Month && cdp.Year && cdp.Time
}

func getURLString(f *DynamicField, s *goquery.Selection, res *http.Response) string {
func getURLString(e *ElementLocation, s *goquery.Selection, res *http.Response) string {
var urlVal, url string
var exists bool
// attr := "href"
if f.ElementLocation.Attr == "" {
if e.Attr == "" {
// set attr to the default if not set
f.ElementLocation.Attr = "href"
e.Attr = "href"
}
if f.ElementLocation.Selector == "" {
urlVal, exists = s.Attr(f.ElementLocation.Attr)
if e.Selector == "" {
urlVal = s.AttrOr(e.Attr, "")
} else {
urlVal, exists = s.Find(f.ElementLocation.Selector).Attr(f.ElementLocation.Attr)
fieldSelection := s.Find(e.Selector)
if len(fieldSelection.Nodes) > e.NodeIndex {
fieldNode := fieldSelection.Get(e.NodeIndex)
for _, a := range fieldNode.Attr {
if a.Key == e.Attr {
urlVal = a.Val
break
}
}
}
}
if !exists {

if urlVal == "" {
return ""
}
if strings.HasPrefix(urlVal, "http") {
} else if strings.HasPrefix(urlVal, "http") {
url = urlVal
} else if strings.HasPrefix(urlVal, "?") {
url = fmt.Sprintf("%s://%s%s%s", res.Request.URL.Scheme, res.Request.URL.Host, res.Request.URL.Path, urlVal)
Expand All @@ -453,13 +437,6 @@ func getURLString(f *DynamicField, s *goquery.Selection, res *http.Response) str
url = fmt.Sprintf("%s%s", baseURL, urlVal)
}

// if f.Relative {
// baseURL := fmt.Sprintf("%s://%s", res.Request.URL.Scheme, res.Request.URL.Host)
// if !strings.HasPrefix(url, "/") {
// baseURL = baseURL + "/"
// }
// url = baseURL + url
// }
url = strings.TrimSpace(url)
return url
}
Expand Down Expand Up @@ -498,6 +475,9 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
currentChildIndex++
}
} else {
// FIXME: this is wrong when several nodes match the selector. AttrOr reads
// the attribute from the first matched node only, which is not necessarily
// the node we want.
fieldString = fieldSelection.AttrOr(t.Attr, "")
fieldString, err = extractStringRegex(&t.RegexExtract, fieldString)
if err != nil {
Expand Down

0 comments on commit d856a8b

Please sign in to comment.