Skip to content

Commit

Permalink
Merge pull request #302 from jakopako/jakopako/issue264
Browse files Browse the repository at this point in the history
improve year guessing
  • Loading branch information
jakopako authored Jun 17, 2024
2 parents d2b0caf + 5db2ccd commit be251ac
Show file tree
Hide file tree
Showing 2 changed files with 219 additions and 27 deletions.
53 changes: 26 additions & 27 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,12 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
}
}

c.guessYear(items, time.Now())

return items, nil
}

func (c *Scraper) guessYear(items []map[string]interface{}, ref time.Time) {
// get date field names where we need to adapt the year
dateFieldsGuessYear := map[string]bool{}
for _, f := range c.Fields {
Expand All @@ -384,46 +390,39 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
// event websites mostly contain a list of events ordered by date. Sometimes the date does
// not contain the year. In that case we could simply set the year to the current year but
// it might happen that the list of events spans across more than one year into the next
// year. In that case we still want to set the correct year which would be current year + 1.
// year. In that case we still want to set the correct year which would be current year + n.
// Moreover, the list might not be ordered at all. In that case we also want to try to set
// the correct year.
if len(dateFieldsGuessYear) > 0 {
for i, item := range items {
for name, val := range item {
if dateFieldsGuessYear[name] {
if t, ok := val.(time.Time); ok {
now := time.Now()
yesterday := now.AddDate(0, 0, -1)
// we compare the date with yesterday, not now, to accommodate the fact that at the time we scrape
// the event might have already taken place but not yet been removed from the website. Let's see if 1 day
// is a reasonable margin.
if t.Before(yesterday) {
newT := time.Date(t.Year()+1, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
item[name] = newT
continue
}

// for the first item we compare this item's date with 'now' and try
// to find the most suitable year, ie the year that brings this item's
// date closest to now.
// for the remaining items we do the same as with the first item except
// that we compare this item's date to the previous item's date instead
// of 'now'.
if i > 0 {
if prevT, ok := items[i-1][name].(time.Time); ok {
// here we do not compare the current date directly to the previous date. There
// are cases where we wouldn't want the year to be increased by one even though
// the previous date is bigger than the current one. Such cases occur when a
// website contains a list of items that are sorted by date but within a day are
// not sorted by time. To prevent the year from being increased wrongly in that
// case we introduce a min delta of 1 day.
tmpT := prevT.AddDate(0, 0, -1)
if t.Before(tmpT) {
// probably there is still a bug here when we have a list that spans two years
// changes..
newT := time.Date(t.Year()+1, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
item[name] = newT
}
ref, _ = items[i-1][name].(time.Time)
}
diff := time.Since(time.Unix(0, 0))
newDate := t
for y := ref.Year() - 1; y <= ref.Year()+1; y++ {
tmpT := time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
if newDiff := tmpT.Sub(ref).Abs(); newDiff < diff {
diff = newDiff
newDate = time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
}
}
item[name] = newDate
}
}
}
}
}

return items, nil
}

func (c *Scraper) initializeFilters() error {
Expand Down
193 changes: 193 additions & 0 deletions scraper/scraper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,3 +648,196 @@ func TestExtractFieldDate29Feb(t *testing.T) {
t.Fatalf("expected '2024' as year of date but got '%d'", dt.Year())
}
}

// TestGuessYearSimple checks guessYear on an ordered list of dates that spans a
// change of year: the two December dates are expected to keep their year and the
// January date that follows them is expected to be moved to the next year.
func TestGuessYearSimple(t *testing.T) {
	s := &Scraper{
		Fields: []Field{
			{
				Type:      "date",
				GuessYear: true,
				Name:      "date",
			},
		},
	}
	// A nil location would make time.Date panic, so fail with a clear message
	// if the time zone cannot be loaded.
	loc, err := time.LoadLocation("CET")
	if err != nil {
		t.Fatalf("loading location CET: %v", err)
	}
	items := []map[string]interface{}{
		{"date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc)},
		{"date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc)},
	}
	expected := []time.Time{
		time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
		time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
		time.Date(2024, 1, 2, 20, 0, 0, 0, loc),
	}
	// the reference lies shortly before the first item
	s.guessYear(items, time.Date(2023, 11, 30, 20, 30, 0, 0, loc))
	for i, d := range items {
		got, ok := d["date"].(time.Time)
		if !ok {
			t.Fatalf("item %d: expected 'date' to be a time.Time but got '%T'", i, d["date"])
		}
		// compare instants with Equal; == on time.Time also compares the
		// location pointer and the monotonic clock reading.
		if !got.Equal(expected[i]) {
			t.Fatalf("item %d: expected date '%v' but got '%v'", i, expected[i], got)
		}
	}
}

// TestGuessYearUnordered checks guessYear on a list that is not perfectly ordered
// (14 December is listed before 2 December) and that spans a change of year.
// Only the trailing January date is expected to be moved to the next year.
func TestGuessYearUnordered(t *testing.T) {
	s := &Scraper{
		Fields: []Field{
			{
				Type:      "date",
				GuessYear: true,
				Name:      "date",
			},
		},
	}
	// A nil location would make time.Date panic, so fail with a clear message
	// if the time zone cannot be loaded.
	loc, err := time.LoadLocation("CET")
	if err != nil {
		t.Fatalf("loading location CET: %v", err)
	}
	items := []map[string]interface{}{
		{"date": time.Date(2023, 11, 2, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 12, 14, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc)},
		{"date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc)},
	}
	expected := []time.Time{
		time.Date(2023, 11, 2, 20, 30, 0, 0, loc),
		time.Date(2023, 12, 14, 20, 30, 0, 0, loc),
		time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
		time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
		time.Date(2024, 1, 2, 20, 0, 0, 0, loc),
	}
	// the reference lies one day before the first item
	s.guessYear(items, time.Date(2023, 11, 1, 20, 30, 0, 0, loc))
	for i, d := range items {
		got, ok := d["date"].(time.Time)
		if !ok {
			t.Fatalf("item %d: expected 'date' to be a time.Time but got '%T'", i, d["date"])
		}
		// compare instants with Equal; == on time.Time also compares the
		// location pointer and the monotonic clock reading.
		if !got.Equal(expected[i]) {
			t.Fatalf("item %d: expected date '%v' but got '%v'", i, expected[i], got)
		}
	}
}

// TestGuessYear2Years checks guessYear on a list that crosses two changes of
// year: the input dates all carry the year 2023 and the expected result runs
// from December 2023 through 2024 to February 2025.
func TestGuessYear2Years(t *testing.T) {
	s := &Scraper{
		Fields: []Field{
			{
				Type:      "date",
				GuessYear: true,
				Name:      "date",
			},
		},
	}
	// A nil location would make time.Date panic, so fail with a clear message
	// if the time zone cannot be loaded.
	loc, err := time.LoadLocation("CET")
	if err != nil {
		t.Fatalf("loading location CET: %v", err)
	}
	items := []map[string]interface{}{
		{"date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 1, 14, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 5, 2, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 9, 24, 21, 30, 0, 0, loc)},
		{"date": time.Date(2023, 2, 2, 20, 0, 0, 0, loc)},
	}
	expected := []time.Time{
		time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
		time.Date(2024, 1, 14, 20, 30, 0, 0, loc),
		time.Date(2024, 5, 2, 20, 30, 0, 0, loc),
		time.Date(2024, 9, 24, 21, 30, 0, 0, loc),
		time.Date(2025, 2, 2, 20, 0, 0, 0, loc),
	}
	// the reference lies about a month before the first item
	s.guessYear(items, time.Date(2023, 11, 1, 20, 30, 0, 0, loc))
	for i, d := range items {
		got, ok := d["date"].(time.Time)
		if !ok {
			t.Fatalf("item %d: expected 'date' to be a time.Time but got '%T'", i, d["date"])
		}
		// compare instants with Equal; == on time.Time also compares the
		// location pointer and the monotonic clock reading.
		if !got.Equal(expected[i]) {
			t.Fatalf("item %d: expected date '%v' but got '%v'", i, expected[i], got)
		}
	}
}

// TestGuessYearStartBeforeReference checks guessYear when the first items lie
// before the given reference: with a reference of 30 January 2024 the December
// dates are expected to stay in 2023 and the January date to end up in 2024.
func TestGuessYearStartBeforeReference(t *testing.T) {
	s := &Scraper{
		Fields: []Field{
			{
				Type:      "date",
				GuessYear: true,
				Name:      "date",
			},
		},
	}
	// A nil location would make time.Date panic, so fail with a clear message
	// if the time zone cannot be loaded.
	loc, err := time.LoadLocation("CET")
	if err != nil {
		t.Fatalf("loading location CET: %v", err)
	}
	items := []map[string]interface{}{
		{"date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc)},
		{"date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc)},
		{"date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc)},
	}
	expected := []time.Time{
		time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
		time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
		time.Date(2024, 1, 2, 20, 0, 0, 0, loc),
	}
	// the reference lies after all of the expected dates
	s.guessYear(items, time.Date(2024, 1, 30, 20, 30, 0, 0, loc))
	for i, d := range items {
		got, ok := d["date"].(time.Time)
		if !ok {
			t.Fatalf("item %d: expected 'date' to be a time.Time but got '%T'", i, d["date"])
		}
		// compare instants with Equal; == on time.Time also compares the
		// location pointer and the monotonic clock reading.
		if !got.Equal(expected[i]) {
			t.Fatalf("item %d: expected date '%v' but got '%v'", i, expected[i], got)
		}
	}
}

0 comments on commit be251ac

Please sign in to comment.