From 75f322c7d74f9dabb131dc0e4d34b4c3736c5671 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jun 2022 18:32:47 +0000 Subject: [PATCH 1/4] Bump github.com/ilyakaznacheev/cleanenv from 1.2.6 to 1.3.0 Bumps [github.com/ilyakaznacheev/cleanenv](https://github.com/ilyakaznacheev/cleanenv) from 1.2.6 to 1.3.0. - [Release notes](https://github.com/ilyakaznacheev/cleanenv/releases) - [Commits](https://github.com/ilyakaznacheev/cleanenv/compare/v1.2.6...v1.3.0) --- updated-dependencies: - dependency-name: github.com/ilyakaznacheev/cleanenv dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 8 ++++---- go.sum | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index d49768f..dfd6d25 100644 --- a/go.mod +++ b/go.mod @@ -5,15 +5,15 @@ go 1.17 require ( github.com/PuerkitoBio/goquery v1.8.0 github.com/goodsign/monday v1.0.0 - github.com/ilyakaznacheev/cleanenv v1.2.6 + github.com/ilyakaznacheev/cleanenv v1.3.0 golang.org/x/net v0.0.0-20220225172249-27dd8689420f gopkg.in/yaml.v2 v2.4.0 ) require ( - github.com/BurntSushi/toml v0.3.1 // indirect + github.com/BurntSushi/toml v1.1.0 // indirect github.com/andybalholm/cascadia v1.3.1 // indirect - github.com/joho/godotenv v1.3.0 // indirect - gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect + github.com/joho/godotenv v1.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 // indirect ) diff --git a/go.sum b/go.sum index 8cac97e..ce1a8ae 100644 --- a/go.sum +++ b/go.sum @@ -1,15 +1,15 @@ -github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/toml v1.1.0 h1:ksErzDEI1khOiGPgpwuI7x2ebx/uXQNw7xJpn9Eq1+I= +github.com/BurntSushi/toml v1.1.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/goodsign/monday v1.0.0 h1:Yyk/s/WgudMbAJN6UWSU5xAs8jtNewfqtVblAlw0yoc= github.com/goodsign/monday v1.0.0/go.mod h1:r4T4breXpoFwspQNM+u2sLxJb2zyTaxVGqUfTBjWOu8= -github.com/ilyakaznacheev/cleanenv v1.2.6 h1:oJRaVZfAI0xdA5LJNguuKH2ldVJg44SP8GqkEn/cw7w= -github.com/ilyakaznacheev/cleanenv v1.2.6/go.mod h1:C3bB+MJ+LjECYlw2k7CSagKGfL1Ym2ywfjj40RjXJ24= -github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc= -github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg= +github.com/ilyakaznacheev/cleanenv v1.3.0 h1:RapuLclPPUbmdd5Bi5UXScwMEZA6+ZNLU5OW9itPjj0= +github.com/ilyakaznacheev/cleanenv v1.3.0/go.mod h1:i0owW+HDxeGKE0/JPREJOdSCPIyOnmh6C0xhWAkF/xA= +github.com/joho/godotenv v1.4.0 h1:3l4+N6zfMWnkbPEXKng2o2/MR5mSwTrBih4ZEkkz1lg= +github.com/joho/godotenv v1.4.0/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220225172249-27dd8689420f h1:oA4XRj0qtSt8Yo1Zms0CUlsT3KG69V2UGQWPBxujDmc= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= @@ -26,7 +26,7 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 h1:slmdOY3vp8a7KQbHkL+FLbvbkgMqmXojpFUO/jENuqQ= olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3/go.mod h1:oVgVk4OWVDi43qWBEyGhXgYxt7+ED4iYNpTngSLX2Iw= From 8304408222a435043be89a3d0392e981ba5092f2 Mon Sep 17 00:00:00 2001 From: Jakob Dhondt Date: Wed, 22 Jun 2022 18:01:57 +0200 Subject: [PATCH 2/4] wip --- concerts-config.yml | 46 ++++++++++++++++++++++++ scraper/scraper.go | 87 ++++++++++++++++++++++++++++----------------- 2 files changed, 100 insertions(+), 33 deletions(-) diff --git a/concerts-config.yml b/concerts-config.yml index c9a18dd..3b8229b 100644 --- a/concerts-config.yml +++ b/concerts-config.yml @@ -483,3 +483,49 @@ scrapers: layout: ["15Uhr04"] date_location: "Europe/Berlin" date_language: "de_DE" + + - name: Komplex457 + url: "https://komplex-457.ch/event/" + item: ".portfolio" + fields: + static: + - name: "location" + value: "Komplex457" + - name: "city" + value: "Zurich" + - name: "type" + value: "concert" + - name: "sourceUrl" + value: "https://komplex-457.ch/event/" + dynamic: + - name: "title" + location: + selector: ".av-masonry-entry-title" + - name: "url" + type: "url" + location: + selector: "" + - name: "date" + type: "date" + on_subpage: "url" + components: + - covers: + day: true + month: true + year: true + location: + selector: ".iconbox_content_container p" + node_index: 0 + entire_subtree: true + regex_extract: + exp: "[0-9]{2}\\.[0-9]{2}\\.[0-9]{4}" + layout: ["02.01.2006"] + - covers: + time: true + location: + selector: ".iconbox_content_container p" + node_index: 3 + regex_extract: + exp: "[0-9]{1,2}:[0-9]{2}" + layout: ["15:04"] + date_location: "Europe/Berlin" \ No newline at end of file diff --git a/scraper/scraper.go b/scraper/scraper.go index 7986061..ef58127 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -1,6 +1,7 @@ package scraper import ( + "bytes" "errors" "fmt" "log" @@ -63,12 +64,13 @@ type RegexConfig struct { // ElementLocation is used to find a specific string in a html document type ElementLocation struct { - Selector string `yaml:"selector"` - NodeIndex int `yaml:"node_index"` - ChildIndex int `yaml:"child_index"` - RegexExtract RegexConfig `yaml:"regex_extract"` - Attr string `yaml:"attr"` - MaxLength int `yaml:"max_length"` + Selector string `yaml:"selector"` + NodeIndex int `yaml:"node_index"` + ChildIndex int `yaml:"child_index"` + RegexExtract RegexConfig `yaml:"regex_extract"` + Attr string `yaml:"attr"` + MaxLength int `yaml:"max_length"` + EntireSubtree bool `yaml:"entire_subtree"` } // CoveredDateParts is used to determine what parts of a date a @@ -486,46 +488,65 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) { fieldSelection := s.Find(t.Selector) if len(fieldSelection.Nodes) > t.NodeIndex { if t.Attr == "" { - fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild - currentChildIndex := 0 - for fieldNode != nil { - // for the case where we want to find the correct string - // by regex (checking all the children and taking the first one that matches the regex) - // the ChildIndex has to be set to -1 to - // distinguish from the default case 0. So when we explicitly set ChildIndex to -1 it means - // check _all_ of the children. - if currentChildIndex == t.ChildIndex || t.ChildIndex == -1 { - if fieldNode.Type == html.TextNode { - fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data) - if err == nil { - if t.MaxLength > 0 && t.MaxLength < len(fieldString) { - fieldString = fieldString[:t.MaxLength] + "..." + if t.EntireSubtree { + // copied from https://github.com/PuerkitoBio/goquery/blob/v1.8.0/property.go#L62 + var buf bytes.Buffer + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.TextNode { + // Keep newlines and spaces, like jQuery + buf.WriteString(n.Data) + } + if n.FirstChild != nil { + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + } + f(fieldSelection.Get(t.NodeIndex)) + fieldString = buf.String() + } else { + fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild + currentChildIndex := 0 + for fieldNode != nil { + // for the case where we want to find the correct string + // by regex (checking all the children and taking the first one that matches the regex) + // the ChildIndex has to be set to -1 to + // distinguish from the default case 0. So when we explicitly set ChildIndex to -1 it means + // check _all_ of the children. + if currentChildIndex == t.ChildIndex || t.ChildIndex == -1 { + if fieldNode.Type == html.TextNode { + fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data) + if err == nil { + break + } else if t.ChildIndex != -1 { + // only in case we do not (ab)use the regex to search across all children + // we want to return the err. Also, we still return the fieldString as + // this might be useful for narrowing down the reason for the error. + return fieldString, err } - break - } else if t.ChildIndex != -1 { - // only in case we do not (ab)use the regex to search across all children - // we want to return the err. Also, we still return the fieldString as - // this might be useful for narrowing down the reason for the error. - return fieldString, err } } + fieldNode = fieldNode.NextSibling + currentChildIndex++ } - fieldNode = fieldNode.NextSibling - currentChildIndex++ } } else { // WRONG // It could be the case that there are multiple nodes that match the selector // and we don't want the attr of the first node... fieldString = fieldSelection.AttrOr(t.Attr, "") - fieldString, err = extractStringRegex(&t.RegexExtract, fieldString) - if err != nil { - return fieldString, err - } } } - // automitcally trimming whitespaces might be confusing in some cases... + // automatically trimming whitespaces might be confusing in some cases... fieldString = strings.TrimSpace(fieldString) + fieldString, err = extractStringRegex(&t.RegexExtract, fieldString) + if err != nil { + return fieldString, err + } + if t.MaxLength > 0 && t.MaxLength < len(fieldString) { + fieldString = fieldString[:t.MaxLength] + "..." + } return fieldString, nil } From f31f5b2ba47decf94cbfe4f212f38ca5a73835d4 Mon Sep 17 00:00:00 2001 From: jakopako Date: Tue, 28 Jun 2022 18:57:23 +0200 Subject: [PATCH 3/4] fixed bug introduced in previous version --- concerts-config.yml | 2 +- scraper/scraper.go | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/concerts-config.yml b/concerts-config.yml index 3b8229b..a3f379c 100644 --- a/concerts-config.yml +++ b/concerts-config.yml @@ -528,4 +528,4 @@ scrapers: regex_extract: exp: "[0-9]{1,2}:[0-9]{2}" layout: ["15:04"] - date_location: "Europe/Berlin" \ No newline at end of file + date_location: "Europe/Berlin" diff --git a/scraper/scraper.go b/scraper/scraper.go index ef58127..f9c07bb 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -518,7 +518,10 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) { if fieldNode.Type == html.TextNode { fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data) if err == nil { - break + if t.MaxLength > 0 && t.MaxLength < len(fieldString) { + fieldString = fieldString[:t.MaxLength] + "..." + } + return fieldString, nil } else if t.ChildIndex != -1 { // only in case we do not (ab)use the regex to search across all children // we want to return the err. Also, we still return the fieldString as From 63e9851af301804f094104fa9d6c33f12fc65558 Mon Sep 17 00:00:00 2001 From: jakopako Date: Tue, 28 Jun 2022 19:24:00 +0200 Subject: [PATCH 4/4] fixed bug --- scraper/scraper.go | 1 + 1 file changed, 1 insertion(+) diff --git a/scraper/scraper.go b/scraper/scraper.go index f9c07bb..0b39868 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -518,6 +518,7 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) { if fieldNode.Type == html.TextNode { fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data) if err == nil { + fieldString = strings.TrimSpace(fieldString) if t.MaxLength > 0 && t.MaxLength < len(fieldString) { fieldString = fieldString[:t.MaxLength] + "..." }