-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
spider.yaml
102 lines (86 loc) · 4.01 KB
/
spider.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# coding:utf-8
# Timeout for crawling and storing user info.
time_out: 200
# Lower and upper bound of the interval between HTTP requests.
min_crawl_interal: 10
max_crawl_interal: 20
# Time to sleep when crawling raises exceptions.
# YAML does not evaluate arithmetic: the plain scalar `5*60` is loaded as the
# string "5*60", so the product is written out as an integer here.
# NOTE(review): the unit is presumably seconds (5 minutes) - confirm against
# the code that reads this value.
excp_interal: 300
# TODO set a default value for max_value of crawling
# Upper bounds on the number of pages crawled, one per page type.
max_search_page: 50      # search pages
max_home_page: 50        # user home pages
max_comment_page: 2000   # comment pages
max_repost_page: 2000    # repost pages
max_dialogue_page: 2000  # dialogue pages
# Retry times for crawling.
max_retries: 5
# Set the two values below if you log in from an uncommon place.
# They are used for identifying verification codes.
yundama_username: xxxxxx  # yundama account
yundama_passwd: xxxxxx    # yundama password
# Only weibo (bowen) posted after this time are crawled.
# This setting only affects the home crawler.
time_after: '1970-01-01 08:00:00'
# Used to check whether the account follows the uid given here;
# if it does, rows in wbuser will have 1 in the isFan column.
samefollow_uid: ''
# running_mode accepts normal or quick.
#   normal - more stable.
#   quick  - crawls much faster, but the weibo account will almost
#            certainly be banned.
running_mode: normal
# crawling_mode accepts accurate or normal.
#   normal   - does not crawl the weibo content of "展开全文" ("expand full
#              text") when executing home crawl tasks or search crawl tasks,
#              so it is much quicker.
#   accurate - also crawls the "展开全文" info; slower, but gives more
#              details.
crawling_mode: normal
# Max number of times each cookie can be shared
# (presumably counted per host, going by the key name - TODO confirm).
# In quick mode a cookie is used until it is banned.
share_host_count: 5
# Expire time, in hours, of each weibo cookie.
cookie_expire_time: 23
# Image download switch: 1 allows downloading images, otherwise set it to 0.
images_allow: 1
# Directory for downloaded images. The default image path is
# '${user.home}/weibospider/images'; set a path here to use another directory.
images_path: ''
# Which image variant to download: large or thumbnail.
#   large     - download the large image
#   thumbnail - download the thumbnail image
image_type: large
# Database connection settings. The keys below are children of `db`; they
# must stay indented, otherwise `db` loads as null and host/port/password
# become top-level keys that clash with the redis and email settings.
db:
  host: 127.0.0.1
  port: 3306
  user: root
  # Quoted so the all-digit password is loaded as a string, not an integer.
  password: '123456'
  db_name: weibo
  db_type: mysql
# Redis connection settings. All keys up to socket_timeout are children of
# `redis`; they must stay indented, otherwise host/port/password become
# duplicate top-level keys.
redis:
  host: 127.0.0.1
  port: 6379
  password: ''
  # NOTE(review): the numbers below are presumably redis db indexes (the
  # expire_time comment refers to "redis db2", matching urls: 2) - confirm.
  cookies: 1  # store and fetch cookies
  # Store fetched urls and results, so you can decide whether to retry
  # crawling the urls or not.
  urls: 2
  broker: 5   # broker for celery
  backend: 6  # backend for celery
  # User ids and names, for repost info analysis.
  # Could be safely deleted after repost tasks.
  id_name: 8
  # expire_time (hours) for redis db2; if the entries are useless to you,
  # you can set the value smaller.
  expire_time: 48
  # Redis sentinel for HA. If you need it, add the sentinel hosts and ports
  # under the sentinel key, like this:
  #
  #   sentinel:
  #     - host: 2.2.2.2
  #       port: 26379
  #     - host: 3.3.3.3
  #       port: 26379
  #
  sentinel: ''
  # Redis sentinel master name; if you don't need it, just set master: ''
  master: ''
  # Socket timeout for redis sentinel.
  socket_timeout: 5
# Warning by email. The keys below are children of `email`; they must stay
# indented, otherwise port/password become duplicate top-level keys.
email:
  # Your email account must have the SMTP & POP3 services enabled.
  server: smtp.sina.com
  port: 587
  from: xxxxx@sina.com  # sending email account
  password: xxxx        # your email password
  # Bind a 139 email, so your phone will receive the warning message.
  to: xxxxxxxx@139.com
  subject: Warning Of Weibo Spider
  warning_info: Please find out the reason why the spider stops working