forked from nevali/crawl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
libcrawl.h
265 lines (235 loc) · 10.5 KB
/
libcrawl.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* Author: Mo McRoberts <mo.mcroberts@bbc.co.uk>
*
* Copyright 2014-2015 BBC
*/
/*
* Copyright 2013 Mo McRoberts.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIBCRAWL_H_
# define LIBCRAWL_H_ 1
# include <stdint.h>
# include <time.h>
# include <jansson.h>
# include <liburi.h>
/* Crawl object states */
typedef enum
{
/* An internal error occurred */
COS_ERR = -1,
/* Has not yet been crawled */
COS_NEW = 0,
/* Crawling failed */
COS_FAILED,
/* The processor or policy rejected the resource */
COS_REJECTED,
/* The resource was processed */
COS_ACCEPTED,
/* Never set by crawld itself: external processing has completed */
COS_COMPLETE,
/* The next fetch should ignore the cache */
COS_FORCE,
/* The resource was rejected in line with expectations (e.g., a 304) */
COS_SKIPPED,
/* The resource was skipped, but the object should not be
* rolled back (e.g., 301 redirect)
*/
COS_SKIPPED_COMMIT
} CRAWLSTATE;
/* A crawl context.
*
* Note that while libcrawl is thread-safe, a single crawl context cannot be
* used in multiple threads concurrently. You must either protect the context
* with a lock, or create separate contexts for each thread which will invoke
* libcrawl methods.
*/
typedef struct crawl_struct CRAWL;
/* A cache implementation.
*
* The populated structure can be passed to crawl_set_cache() to specify the
* cache implementation that will be used by the context.
*/
# define CACHE_KEY_LEN 32
typedef char CACHEKEY[CACHE_KEY_LEN+1];
/* A crawled object, returned by a cache look-up (crawl_locate) or fetch
* (crawl_fetch). The same thread restrictions apply to crawled objects
* as to the context.
*/
typedef struct crawl_object_struct CRAWLOBJ;
typedef struct crawl_cache_struct CRAWLCACHE;
typedef struct crawl_cache_impl_struct CRAWLCACHEIMPL;
struct crawl_cache_impl_struct
{
void *reserved;
unsigned long (*init)(CRAWLCACHE *cache);
unsigned long (*done)(CRAWLCACHE *cache);
FILE *(*payload_open_write)(CRAWLCACHE *cache, const CACHEKEY key);
FILE *(*payload_open_read)(CRAWLCACHE *cache, const CACHEKEY key);
int (*payload_close_rollback)(CRAWLCACHE *cache, const CACHEKEY key, FILE *f);
int (*payload_close_commit)(CRAWLCACHE *cache, const CACHEKEY key, FILE *f, CRAWLOBJ *obj);
int (*info_read)(CRAWLCACHE *cache, const CACHEKEY key, json_t **dict);
int (*info_write)(CRAWLCACHE *cache, const CACHEKEY key, const json_t *dict);
char *(*uri)(CRAWLCACHE *cache, const CACHEKEY key);
int (*set_username)(CRAWLCACHE *cache, const char *username);
int (*set_password)(CRAWLCACHE *cache, const char *password);
int (*set_endpoint)(CRAWLCACHE *cache, const char *endpoint);
};
struct crawl_cache_struct
{
/* The cache implementation */
const CRAWLCACHEIMPL *impl;
/* The crawl context */
CRAWL *crawl;
/* This pointer is owned by (and private to) the cache implementation to
* use for any purpose it wishes.
*/
void *data;
};
/* URI policy callback: invoked before a URI is fetched; returns 1 to proceed,
* 0 to skip, -1 on error.
*/
typedef CRAWLSTATE (*crawl_uri_policy_cb)(CRAWL *crawl, URI *uri, const char *uristr, void *userdata);
/* Pre-fetch callback: invoked immediately before a URI is fetched (and after
* the policy callback, if any).
*/
typedef CRAWLSTATE (*crawl_prefetch_cb)(CRAWL *crawl, URI *uri, const char *uristr, void *userdata);
/* Updated callback: invoked after a resource has been fetched and stored in
* the cache.
*/
typedef int (*crawl_updated_cb)(CRAWL *crawl, CRAWLOBJ *obj, time_t prevtime, void *userdata);
/* Failed callback: invoked after a resource fetch was attempted but was
* aborted due to a hard error.
*/
typedef int (*crawl_failed_cb)(CRAWL *crawl, CRAWLOBJ *obj, time_t prevtime, void *userdata, CRAWLSTATE state);
/* Unchanged callback: invoked after a resource was rolled back because there is no
* newer version of the resource.
*/
typedef int (*crawl_unchanged_cb)(CRAWL *crawl, CRAWLOBJ *obj, time_t prevtime, void *userdata);
/* Next callback: invoked to obtain the next URI to crawl; if *next is NULL on
* return, crawling ends. The URI returned via *next will be freed by libcrawl.
*/
typedef int (*crawl_next_cb)(CRAWL *crawl, URI **next, CRAWLSTATE *state, void *userdata);
/* Checkpoint callback: invoked after the response headers have been received
* but before the payload has been. If the callback returns a result other than
* COS_ACCEPTED, the fetch will be aborted.
*/
typedef CRAWLSTATE (*crawl_checkpoint_cb)(CRAWL *crawl, CRAWLOBJ *obj, int *status, void *userdata);
/* libcrawl-provided cache implementations */
extern const CRAWLCACHEIMPL *diskcache;
extern const CRAWLCACHEIMPL *s3cache;
/* Create a crawl context */
CRAWL *crawl_create(void);
/* Destroy a context created with crawl_create() */
void crawl_destroy(CRAWL *p);
/* Obtain a known cache implementation for a URI scheme */
const CRAWLCACHEIMPL *crawl_cache_scheme(CRAWL *crawl, const char *scheme);
/* Set the cache implementation that will be used by this context */
int crawl_set_cache(CRAWL *crawl, const CRAWLCACHEIMPL *cache);
/* Set the Accept header sent in subsequent requests */
int crawl_set_accept(CRAWL *crawl, const char *accept);
/* Set the User-Agent header sent in subsequent requests */
int crawl_set_ua(CRAWL *crawl, const char *ua);
/* Set the private user-data pointer passed to callback functions */
int crawl_set_userdata(CRAWL *crawl, void *userdata);
/* Set the verbose flag */
int crawl_set_verbose(CRAWL *crawl, int verbose);
/* Set the cache path or URI string */
int crawl_set_cache_path(CRAWL *crawl, const char *path);
/* Set the cache URI */
int crawl_set_cache_uri(CRAWL *crawl, URI *uri);
/* Retrieve the private user-data pointer previously set with crawl_set_userdata() */
void *crawl_userdata(CRAWL *crawl);
/* Set the username (or access key) used by the cache */
int crawl_set_username(CRAWL *crawl, const char *username);
/* Set the password (or secret) used by the cache */
int crawl_set_password(CRAWL *crawl, const char *password);
/* Set the endpoint used by the cache */
int crawl_set_endpoint(CRAWL *crawl, const char *endpoint);
/* Set the callback function used to apply a URI policy */
int crawl_set_uri_policy(CRAWL *crawl, crawl_uri_policy_cb cb);
/* Set the callback function invoked when an object is updated */
int crawl_set_updated(CRAWL *crawl, crawl_updated_cb cb);
/* Set the callback function invoked when an object could not be updated */
int crawl_set_failed(CRAWL *crawl, crawl_failed_cb cb);
/* Set the callback function invoked to get the next URI to crawl */
int crawl_set_next(CRAWL *crawl, crawl_next_cb cb);
/* Set the callback function invoked before the object payload is retrieved */
int crawl_set_checkpoint(CRAWL *crawl, crawl_checkpoint_cb cb);
/* Set the callback function invoked when an object is rolled back */
int crawl_set_unchanged(CRAWL *crawl, crawl_unchanged_cb cb);
/* Set the callback function invoked before an object is fetched */
int crawl_set_prefetch(CRAWL *crawl, crawl_prefetch_cb cb);
/* Set the logging function used by the crawler */
int crawl_set_logger(CRAWL *crawl, void (*logger)(int, const char *, va_list));
/* Open the payload file for a crawl object */
FILE *crawl_obj_open(CRAWLOBJ *obj);
/* Destroy an (in-memory) crawl object */
int crawl_obj_destroy(CRAWLOBJ *obj);
/* Obtain the cache key for a crawl object */
const char *crawl_obj_key(CRAWLOBJ *obj);
/* Obtain the HTTP status for a crawl object */
int crawl_obj_status(CRAWLOBJ *obj);
/* Obtain the storage timestamp for a crawl object */
time_t crawl_obj_updated(CRAWLOBJ *obj);
/* Obtain the headers for a crawl object - caller must use json_decref() */
json_t *crawl_obj_headers(CRAWLOBJ *obj, int clone);
/* Obtain the path to the payload for the crawl object */
const char *crawl_obj_payload(CRAWLOBJ *obj);
/* Obtain the size of the payload */
uint64_t crawl_obj_size(CRAWLOBJ *obj);
/* Obtain the crawl object URI */
const URI *crawl_obj_uri(CRAWLOBJ *obj);
/* Obtain the crawl object URI as a string */
const char *crawl_obj_uristr(CRAWLOBJ *obj);
/* Obtain the MIME type of the payload */
const char *crawl_obj_type(CRAWLOBJ *obj);
/* Obtain the redirect target of the resource */
const char *crawl_obj_redirect(CRAWLOBJ *obj);
/* Obtain the content-location of the resource */
const char *crawl_obj_content_location(CRAWLOBJ *obj);
/* Has this object been freshly-fetched? */
int crawl_obj_fresh(CRAWLOBJ *obj);
/* Determine the cache key for a resource */
int crawl_cache_key(CRAWL *restrict crawl, const char *restrict uri, char *restrict buf, size_t buflen);
/* Determine the cache key for a resource */
int crawl_cache_key_uri(CRAWL *restrict crawl, URI *restrict uri, char *restrict buf, size_t buflen);
/* Fetch a resource specified as a string containing a URI */
CRAWLOBJ *crawl_fetch(CRAWL *crawl, const char *uri, CRAWLSTATE state);
/* Fetch a resource specified as a URI */
CRAWLOBJ *crawl_fetch_uri(CRAWL *crawl, URI *uri, CRAWLSTATE state);
/* Locate a cached resource specified as a string */
CRAWLOBJ *crawl_locate(CRAWL *crawl, const char *uristr);
/* Locate a cached resource specified as a URI */
CRAWLOBJ *crawl_locate_uri(CRAWL *crawl, URI *uri);
/* Perform a crawling cycle */
int crawl_perform(CRAWL *crawl);
/* Memory allocation helpers */
/* These APIs are 'safe' wrappers around calloc(), strdup(), realloc() and
* free(). If allocation fails, they will log a message at LOG_CRIT (see
* MSG_C_NOMEM) and call abort().
*
* It is valid to pass NULL as the 'crawl' parameter to these functions;
* in that circumstance, any log messages will be delivered via syslog()
* rather than the crawl context's registered logging callback.
*
* crawl_alloc() will zero-initialise the returned buffer.
*
* crawl_realloc() will NOT zero-initialise any part of the buffer if the
* new buffer is larger than the original.
*/
void *crawl_alloc(CRAWL *restrict crawl, size_t nbytes);
char *crawl_strdup(CRAWL *restrict crawl, const char *src);
void *crawl_realloc(CRAWL *restrict crawl, void *restrict ptr, size_t nbytes);
void crawl_free(CRAWL *restrict crawl, void *restrict ptr);
#endif /*!LIBCRAWL_H_*/