forked from SimFin/pdf-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.example.json
64 lines (58 loc) · 3.75 KB
/
config.example.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
{
  "solo": null,
  "debug": false,
  "url_test": "http://url-test.com/",
  "domains_skip": [
    "facebook.com",
    "twitter.com",
    "typeform.com",
    "instagram.com",
    "www.my-meteo.com",
    "www.pinterest.com",
    "www.google.fr",
    "pluginsmarket.com",
    "cookiedatabase.org",
    "consent.google.fr",
    "accounts.google.com",
    "mozilla.org",
    "microsoft.com",
    "fr.mappy.com",
    "www.linkedin.com",
    "help.opera.com",
    "firefox.com",
    "soundcloud.com",
    "flickr.com",
    "youtube.com",
    "feedify.net"
  ],
  "firefox": "./firefox/firefox-bin",
  "urls": [
    {
      "url": "http://www.url1.com/",
      "method": "normal",
      "depth": 0,
      "ignore_patterns": [
        "<a href=\"mailto.*\""
      ],
      "ignore_urls": [
        "http://www.url1.com/FOO",
        ".*url1.com/.*foo.*"
      ]
    },
    {
      "url": "https://www.url2.com/",
      "method": "normal",
      "depth": 7,
      "sleep": 15,
      "ignore_patterns": [],
      "crawler_mode": "CRAWL_LIGHT"
    },
    {
      "url": "https://www.url3.com/",
      "method": "normal",
      "depth": 7,
      "sleep": 15,
      "ignore_patterns": [
        "fuse_script\\.js\\?ver=[0-9]+",
        "data-nonce=\"\\w+\"",
        "\"_nonce\":\"\\w+\"",
        "\"consent_expire_time\":\"\\d+\"",
        "\"consent_time\":\"\\d+\""
      ]
    },
    {
      "url": "https://www.url4.com/",
      "method": "normal",
      "depth": 7,
      "ignore_elements": [
        "//h2[contains(@class,'widgettitle')]",
        "//div[contains(@class,'fest_item')]",
        "//script[contains(@src,'image_zoom')]",
        "//script[contains(text(),'with_woocommerce')]"
      ],
      "ignore_patterns": [
        "class=\"collapseomatic\" id=\"\\w+\"",
        "class=\"colomat-swap\" id=\"[\\w\\-]+\"",
        "class=\"collapseomatic_content\" id=\"[\\w\\-]+\""
      ]
    },
    {
      "url": "https://www.url5.com/",
      "method": "normal",
      "depth": 7,
      "sleep": 10
    },
    {
      "url": "https://www.url6.com/",
      "method": "normal",
      "depth": 7,
      "ignore_patterns": [
        "\"nonce\":\"\\w+\"",
        "\"servertime\":\"\\d+\"",
        "\"wpdev_bk_today\":\"\\[[0-9,]+\\]\"",
        "search-form-\\w+",
        "Dynamic page generated in [0-9\\.]+ seconds",
        "Cached page generated by WP-Super-Cache on [0-9\\-:\\s]+"
      ],
      "ignore_elements": [
        "//script[contains(text(),'WP_Statistics_http')]"
      ]
    },
    {
      "url": "https://www.url7.com/",
      "method": "normal",
      "depth": 5,
      "sleep": 15,
      "use_proxy": true,
      "safe": true,
      "ignore_patterns": [
        "^\\s+$",
        "\"csrf.token\":\"\\w+\"",
        "\"spoofstring\".*\"spamstring\":\"\\w+\""
      ],
      "ignore_elements": [
        "//input[@name='cbsecuritym3']",
        "//span[contains(text(),'Cette adresse e-mail est prot')]",
        "//script[contains(text(),'addy_text')]",
        "//input[@name='cbrasitway']"
      ]
    },
    {
      "url": "http://www.url8.com/",
      "method": "normal",
      "depth": 7,
      "ignore_urls": [
        ".*url8.com.*/.*@.*"
      ]
    },
    {
      "url": "https://www.url9.com/",
      "method": "normal",
      "depth": 4,
      "ignore_elements": [
        "//meta[@property='og:url']",
        "//meta[@http-equiv='last-modified']",
        "//ol[contains(@class,'carousel-indicators')]",
        "//div[//script[contains(@id,'infogram')]]",
        "//div[@class='newspaper3c']"
      ]
    },
    {
      "url": "https://www.url10.com/",
      "method": "normal",
      "depth": 4
    }
  ],
  "searches": [
    {
      "name": "NAME1",
      "patterns": [
        "pattern1"
      ],
      "level": 3
    },
    {
      "name": "NAME2",
      "patterns": [
        "pattern2"
      ],
      "level": 3
    },
    {
      "name": "NAME3",
      "patterns": [
        "pattern3"
      ],
      "exclude_patterns": null,
      "domains": null,
      "exclude_domains": [
        "URL"
      ],
      "doc_types": null,
      "level": 1
    }
  ],
  "emails": [
    "EMAIL@DOMAIN.COM"
  ]
}