Skip to content

Commit

Permalink
Merge branch 'main' into jakopako/issue107
Browse files Browse the repository at this point in the history
  • Loading branch information
jakopako committed Jul 1, 2022
2 parents caf8049 + 63e9851 commit 2aa4190
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 45 deletions.
46 changes: 46 additions & 0 deletions concerts-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,49 @@ scrapers:
layout: ["15Uhr04"]
date_location: "Europe/Berlin"
date_language: "de_DE"

- name: Komplex457
url: "https://komplex-457.ch/event/"
item: ".portfolio"
fields:
static:
- name: "location"
value: "Komplex457"
- name: "city"
value: "Zurich"
- name: "type"
value: "concert"
- name: "sourceUrl"
value: "https://komplex-457.ch/event/"
dynamic:
- name: "title"
location:
selector: ".av-masonry-entry-title"
- name: "url"
type: "url"
location:
selector: ""
- name: "date"
type: "date"
on_subpage: "url"
components:
- covers:
day: true
month: true
year: true
location:
selector: ".iconbox_content_container p"
node_index: 0
entire_subtree: true
regex_extract:
exp: "[0-9]{2}\\.[0-9]{2}\\.[0-9]{4}"
layout: ["02.01.2006"]
- covers:
time: true
location:
selector: ".iconbox_content_container p"
node_index: 3
regex_extract:
exp: "[0-9]{1,2}:[0-9]{2}"
layout: ["15:04"]
date_location: "Europe/Berlin"
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ go 1.17
require (
github.com/PuerkitoBio/goquery v1.8.0
github.com/goodsign/monday v1.0.0
github.com/ilyakaznacheev/cleanenv v1.2.6
github.com/ilyakaznacheev/cleanenv v1.3.0
golang.org/x/net v0.0.0-20220225172249-27dd8689420f
gopkg.in/yaml.v2 v2.4.0
)

require (
github.com/BurntSushi/toml v0.3.1 // indirect
github.com/BurntSushi/toml v1.1.0 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/joho/godotenv v1.3.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
github.com/joho/godotenv v1.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 // indirect
)
16 changes: 8 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/toml v1.1.0 h1:ksErzDEI1khOiGPgpwuI7x2ebx/uXQNw7xJpn9Eq1+I=
github.com/BurntSushi/toml v1.1.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/goodsign/monday v1.0.0 h1:Yyk/s/WgudMbAJN6UWSU5xAs8jtNewfqtVblAlw0yoc=
github.com/goodsign/monday v1.0.0/go.mod h1:r4T4breXpoFwspQNM+u2sLxJb2zyTaxVGqUfTBjWOu8=
github.com/ilyakaznacheev/cleanenv v1.2.6 h1:oJRaVZfAI0xdA5LJNguuKH2ldVJg44SP8GqkEn/cw7w=
github.com/ilyakaznacheev/cleanenv v1.2.6/go.mod h1:C3bB+MJ+LjECYlw2k7CSagKGfL1Ym2ywfjj40RjXJ24=
github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc=
github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg=
github.com/ilyakaznacheev/cleanenv v1.3.0 h1:RapuLclPPUbmdd5Bi5UXScwMEZA6+ZNLU5OW9itPjj0=
github.com/ilyakaznacheev/cleanenv v1.3.0/go.mod h1:i0owW+HDxeGKE0/JPREJOdSCPIyOnmh6C0xhWAkF/xA=
github.com/joho/godotenv v1.4.0 h1:3l4+N6zfMWnkbPEXKng2o2/MR5mSwTrBih4ZEkkz1lg=
github.com/joho/godotenv v1.4.0/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220225172249-27dd8689420f h1:oA4XRj0qtSt8Yo1Zms0CUlsT3KG69V2UGQWPBxujDmc=
golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
Expand All @@ -26,7 +26,7 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 h1:slmdOY3vp8a7KQbHkL+FLbvbkgMqmXojpFUO/jENuqQ=
olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3/go.mod h1:oVgVk4OWVDi43qWBEyGhXgYxt7+ED4iYNpTngSLX2Iw=
91 changes: 58 additions & 33 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package scraper

import (
"bytes"
"errors"
"fmt"
"log"
Expand Down Expand Up @@ -64,12 +65,13 @@ type RegexConfig struct {

// ElementLocation is used to find a specific string in an HTML document.
//
// Fields, as consumed by getTextString:
//   - Selector: selector passed to Find on the current selection.
//   - NodeIndex: index of the matched node to use; nothing is extracted unless
//     more than NodeIndex nodes match the selector.
//   - ChildIndex: index of the child of the selected node whose text is used.
//     The special value -1 means "check all children and take the first text
//     node that matches the regex" (0 is the default and means the first child).
//   - RegexExtract: regex configuration applied to the extracted string.
//   - Attr: when non-empty, the value of this attribute is extracted instead of
//     the node's text.
//   - MaxLength: when greater than 0 and smaller than the length of the
//     extracted string, the string is cut to this length and "..." is appended.
//   - EntireSubtree: when true, the data of every text node in the subtree of
//     the selected node is concatenated instead of looking at a single child.
type ElementLocation struct {
Selector string `yaml:"selector"`
NodeIndex int `yaml:"node_index"`
ChildIndex int `yaml:"child_index"`
RegexExtract RegexConfig `yaml:"regex_extract"`
Attr string `yaml:"attr"`
MaxLength int `yaml:"max_length"`
Selector      string      `yaml:"selector"`
NodeIndex     int         `yaml:"node_index"`
ChildIndex    int         `yaml:"child_index"`
RegexExtract  RegexConfig `yaml:"regex_extract"`
Attr          string      `yaml:"attr"`
MaxLength     int         `yaml:"max_length"`
EntireSubtree bool        `yaml:"entire_subtree"`
}

// CoveredDateParts is used to determine what parts of a date a
Expand Down Expand Up @@ -487,46 +489,69 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
fieldSelection := s.Find(t.Selector)
if len(fieldSelection.Nodes) > t.NodeIndex {
if t.Attr == "" {
fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild
currentChildIndex := 0
for fieldNode != nil {
// for the case where we want to find the correct string
// by regex (checking all the children and taking the first one that matches the regex)
// the ChildIndex has to be set to -1 to
// distinguish from the default case 0. So when we explicitly set ChildIndex to -1 it means
// check _all_ of the children.
if currentChildIndex == t.ChildIndex || t.ChildIndex == -1 {
if fieldNode.Type == html.TextNode {
fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data)
if err == nil {
if t.MaxLength > 0 && t.MaxLength < len(fieldString) {
fieldString = fieldString[:t.MaxLength] + "..."
if t.EntireSubtree {
// copied from https://github.com/PuerkitoBio/goquery/blob/v1.8.0/property.go#L62
var buf bytes.Buffer
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.TextNode {
// Keep newlines and spaces, like jQuery
buf.WriteString(n.Data)
}
if n.FirstChild != nil {
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
}
f(fieldSelection.Get(t.NodeIndex))
fieldString = buf.String()
} else {
fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild
currentChildIndex := 0
for fieldNode != nil {
// for the case where we want to find the correct string
// by regex (checking all the children and taking the first one that matches the regex)
// the ChildIndex has to be set to -1 to
// distinguish from the default case 0. So when we explicitly set ChildIndex to -1 it means
// check _all_ of the children.
if currentChildIndex == t.ChildIndex || t.ChildIndex == -1 {
if fieldNode.Type == html.TextNode {
fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data)
if err == nil {
fieldString = strings.TrimSpace(fieldString)
if t.MaxLength > 0 && t.MaxLength < len(fieldString) {
fieldString = fieldString[:t.MaxLength] + "..."
}
return fieldString, nil
} else if t.ChildIndex != -1 {
// only in case we do not (ab)use the regex to search across all children
// we want to return the err. Also, we still return the fieldString as
// this might be useful for narrowing down the reason for the error.
return fieldString, err
}
break
} else if t.ChildIndex != -1 {
// only in case we do not (ab)use the regex to search across all children
// we want to return the err. Also, we still return the fieldString as
// this might be useful for narrowing down the reason for the error.
return fieldString, err
}
}
fieldNode = fieldNode.NextSibling
currentChildIndex++
}
fieldNode = fieldNode.NextSibling
currentChildIndex++
}
} else {
// WRONG
// It could be the case that there are multiple nodes that match the selector
// and we don't want the attr of the first node...
fieldString = fieldSelection.AttrOr(t.Attr, "")
fieldString, err = extractStringRegex(&t.RegexExtract, fieldString)
if err != nil {
return fieldString, err
}
}
}
// automitcally trimming whitespaces might be confusing in some cases...
// automatically trimming whitespaces might be confusing in some cases...
fieldString = strings.TrimSpace(fieldString)
fieldString, err = extractStringRegex(&t.RegexExtract, fieldString)
if err != nil {
return fieldString, err
}
if t.MaxLength > 0 && t.MaxLength < len(fieldString) {
fieldString = fieldString[:t.MaxLength] + "..."
}
return fieldString, nil
}

Expand Down

0 comments on commit 2aa4190

Please sign in to comment.