This repository has been archived by the owner on Jan 9, 2024. It is now read-only.
forked from efixler/scrape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.go
122 lines (113 loc) · 2.94 KB
/
scrape.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
Package scrape provides a simple interface for fetching and storing web pages
metadata and text content. The `scrape` and `scrape-server` commands provide
a command-line interface and a REST API, respectively.
*/
package scrape
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	nurl "net/url"
	"sync"

	"github.com/efixler/scrape/fetch"
	"github.com/efixler/scrape/resource"
	"github.com/efixler/scrape/store"
)
var (
	// wg tracks the asynchronous Store goroutines spawned by Fetch.
	// NOTE(review): nothing in this file ever calls wg.Wait(); confirm a
	// caller waits on shutdown, otherwise in-flight stores may be dropped.
	wg sync.WaitGroup
)
// StorageBackedFetcher fetches web page resources via a URLFetcher and
// caches them in a URLDataStore: Fetch serves a stored copy when one
// exists and otherwise fetches live and stores the result asynchronously.
type StorageBackedFetcher struct {
	Fetcher fetch.URLFetcher
	Storage store.URLDataStore
	// closed records that Close has already run, making it idempotent.
	closed bool
}
// NewStorageBackedFetcher constructs a StorageBackedFetcher from the given
// factories. The storage backend is created first; an error from either
// factory aborts construction and is returned wrapped with context.
// The returned fetcher must be initialized with Open before use.
func NewStorageBackedFetcher(
	fetcherFactory fetch.Factory,
	storageFactory store.Factory,
) (*StorageBackedFetcher, error) {
	storage, err := storageFactory()
	if err != nil {
		return nil, fmt.Errorf("creating storage: %w", err)
	}
	fetcher, err := fetcherFactory()
	if err != nil {
		return nil, fmt.Errorf("creating fetcher: %w", err)
	}
	return &StorageBackedFetcher{
		Fetcher: fetcher,
		Storage: storage,
	}, nil
}
// Open initializes the fetcher and storage components. The context passed
// to Open is handed on to the child components so that they can hook into
// it directly, specifically to close and release resources on cancellation.
// Close is also registered to run when ctx is done.
//
// The receiver is a pointer so that the AfterFunc closure below marks
// *this* instance closed; with a value receiver it would capture a copy
// and Close's idempotency flag would be set on the copy only.
func (f *StorageBackedFetcher) Open(ctx context.Context) error {
	if err := f.Fetcher.Open(ctx); err != nil {
		return err
	}
	if err := f.Storage.Open(ctx); err != nil {
		return err
	}
	// Belt-and-suspenders: the child components hook into the context
	// themselves, but ensure this fetcher is also marked closed.
	context.AfterFunc(ctx, func() {
		f.Close()
	})
	return nil
}
// Fetch returns the web page for url, preferring a stored copy and falling
// back to a live fetch. Freshly fetched pages are stored asynchronously;
// storage errors are logged rather than returned. The returned page always
// carries the originally requested (pre-cleaning) URL in OriginalURL.
//
// Fetch is the entry point for all URLs, so URL cleaning is applied here.
// Re-evaluate this: we may want storage to use the cleaned URL but always
// fetch the URL precisely as requested.
func (f StorageBackedFetcher) Fetch(url *nurl.URL) (*resource.WebPage, error) {
	originalURL := url.String()
	url = resource.CleanURL(url)
	// Look in storage first; ErrorResourceNotFound just means a cache miss.
	item, err := f.Storage.Fetch(url)
	if err != nil && !errors.Is(err, store.ErrorResourceNotFound) {
		return nil, err
	}
	// Local renamed from "resource" to avoid shadowing the resource package.
	var page *resource.WebPage
	if item != nil {
		page = &item.Data
	}
	if page == nil {
		page, err = f.Fetcher.Fetch(url)
		// Never store a resource with an error, but do return a partial
		// resource, if any, alongside the error. Guard the nil case: the
		// original deferred assignment dereferenced a nil page here.
		if err != nil {
			if page != nil {
				page.OriginalURL = originalURL
			}
			return page, err
		}
		// TODO: Check that we don't store OriginalURL with the resource
		// (the copy below is taken before OriginalURL is set).
		sd := &store.StoredUrlData{
			Data: *page,
		}
		wg.Add(1)
		go func() {
			defer wg.Done()
			key, err := f.Storage.Store(sd)
			if err != nil {
				// slog is structured: key/value pairs, no printf verbs.
				slog.Error("error storing resource", "url", url, "key", key, "error", err)
			}
		}()
	}
	page.OriginalURL = originalURL
	return page, nil
}
// Close releases the resources held by the fetcher and storage components.
// It is invoked automatically when the context passed to Open is done; if
// that context never gets cancelled, the caller must invoke Close directly.
// Close is idempotent: subsequent calls are no-ops.
func (f *StorageBackedFetcher) Close() error {
	if !f.closed {
		defer func() { f.closed = true }()
		f.Fetcher.Close()
		f.Storage.Close()
	}
	return nil
}