# -*- coding: utf-8 -*-
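"""Scheduled aggregation task.

Pulls hot/trending lists from the source modules imported below (Weibo, Zhihu,
Douyin, Bilibili, plus the overseas news sites) and stores each raw JSON
response, stamped with insert_time, as a document in MongoDB.
"""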
from threeDM.threeDM import get_3dm_data
from threesixKe.threesixKe import get_36kr_data
from five2pj.five2pj import fetch_52pj_data
from acfun.acFun import get_acfun_data
from anquanke.anquanke import get_anquanke_data
from baidu.baidu import get_baidu_data
from baijingchuhai.baijingchuhai import get_baijingchuhai_data
from csdn.csdn import get_csdn_data
from dianshangbao.dianshangbao import get_dianshangbao_data
from diyicaijing.diyicaijing import get_diyicaijing_data
from dongchedi.dongchedi import get_dongchedi_hot_search
from douban.douban import get_douban_movie_data
from freebuf.freebuf import get_freebuf_data
from github.github import get_github_data
from googlesearch.googlesearch import get_googlesearch_data
from hupu.hupu import get_hupu_data
from huxiu.huxiu import get_huxiu_data
from ithome.ithome import get_ithome_data
from kaiyan.openeye import get_openeye_data
from kanxue.kanxue import get_kanxue_data
from kuandaishan.kuandaishan import get_kuandaishan_data
from pmcaff.pmcaff import get_pmcaff_data
from qichezhijia.qichezhijia import get_qichezhijia_data
from qidian.qidian import get_rank_list
from shuimu.shuimu import get_shuimu_data
from sina.sina import get_sina_data
from sina.sina_sport import get_sina_sport_data
from sina.sina_news import get_sina_news
from taipingyang.taipingyang import get_taipingyang_data
from taptap.taptap import get_taptap_data
from tencent.tencent import get_tencent_data
from woshipm.woshipm import get_woshipm_data
from xueqiu.xueqiu import get_xueqiu_data
from yiche.yiche import get_yiche_data
from youshedubao.youshedubao import get_youshedubao_data
from youxiputao.youxiputao import get_youxiputao_data
from zhanku.zhanku import get_zhanku_data
from zongheng.zongheng import get_zongheng_data
from coolan.coolan import get_cool
from hacknews.hacknews import get_hacker_news
from historytoday.historyday import get_history_today
from wallstreetcn.wallstreetcn import get_wallstreetcn_data
from pengpai.pengpaihot import get_pengpai_hot
from crypto_coin.coin import get_crypto_price
from ithome.needknow import get_ithome_needknow_data
from readhub.readhub import get_readhub_data
from v2ex.v2ex import get_v2ex_data
from hostloc.hostloc import get_hostloc_data
from linuxdo.linuxdo import get_linuxdo_data
from nodeseek.nodeseek import get_nodeseek_data
from wsj.wsj import get_wsj_data
from nytimes.nytimes import get_nytimes_data
from bloomberg.bloomberg import get_bloomberg_data
from ft.ft import get_ft_data
from yna.yna import get_yna_data
from tagesschau.tagesschau import get_lemonde_data
from rt.rt import get_rt_data
from nhk.nhk import get_nhk_data
from newsau.newsau import get_newsau_data
from mumsnet.mumsnet import get_mumsnet_data
from foxnews.foxnews import get_foxnews_data
from fivech.fivech import get_5ch_data
from dailymail.dailymail import get_dailymail_data
from asahi.asahi import get_asahi_data
from dzenru.dzenru import get_dzenru_data
from pymongo import MongoClient
import time
import httpx
from curl_cffi import requests
import random
from config import MONGO_URI, MONGO_DB

client = MongoClient(MONGO_URI)
db = client[MONGO_DB]

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

def fetch(url, header):
    """Fetch a JSON API, retrying up to 5 times with a randomized delay; return None if all attempts fail."""
    retry = 5
    while retry > 0:
        try:
            res = requests.get(url, headers=header)
            if res.status_code == 200:
                data = res.json()
                data['insert_time'] = time.time()
                return data
            retry -= 1
            time.sleep(random.choice([1, 2, 3, 4, 5]) * retry)
        except Exception as err:
            retry -= 1
            print("now_time: {}, url: {}, error: {}".format(time.time(), url, str(err)))
            time.sleep(random.choice([1, 2, 3, 4, 5]) * retry)
    # All retries exhausted: signal failure to the caller.
    return None

def get_weibo_data():
    weibo_url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
    article = db["weibo_hot_search"]
    data = httpx.get(weibo_url).json()
    data['insert_time'] = time.time()
    article.insert_one(data)
    print("weibo Data inserted")

def get_zhihu_hot_data():
    zhihu_hot_list = db['zhihu_hot_list']
    zhihu_hot_list_url = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=100"
    data = fetch(zhihu_hot_list_url, headers)
    zhihu_hot_list.insert_one(data)
    print("zhihu data inserted")

def get_douyin_hot_data():
    douyin_hot = db['douyin_hot']
    session = requests.Session()
    session.headers = headers
    # Warm up the session (cookies) before requesting the hot-search list.
    session.get("https://www.douyin.com/passport/general/login_guiding_strategy/?aid=6383")
    res = session.get("https://www.douyin.com/aweme/v1/web/hot/search/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&detail_list=1&round_trip_time=50")
    if res.status_code == 200:
        data = res.json()
        data['insert_time'] = time.time()
        douyin_hot.insert_one(data)
        print("douyin data inserted")

def get_bilibili_hot_data():
    bilibili_hot_url = "https://api.bilibili.com/x/web-interface/ranking/v2"
    bilibili_hot = db['bilibili_hot']
    err = 5
    while err > 0:
        bili_headers = {}
        res = requests.get(bilibili_hot_url, headers=bili_headers)
        data = res.json()
        data_code = data.get("code", 352)
        if data_code == 0:
            data['insert_time'] = time.time()
            bilibili_hot.insert_one(data)
            print("bilibili_hot data get success")
            break
        else:
            print(data)
            err -= 1
            print("bilibili_hot data get error")
            time.sleep(3)

def get_wx_read_rank():
    url = "https://weread.qq.com/web/bookListInCategory/rising?rank=1"
    wx_read = db['wx_read_rank']
    data = fetch(url, headers)
    wx_read.insert_one(data)
    print("wx_read_rank data inserted")

def get_tieba_topic():
    url = "https://tieba.baidu.com/hottopic/browse/topicList"
    tieba = db['tieba_topic']
    data = fetch(url, headers)
    tieba.insert_one(data)
    print("tieba topic data inserted")

def get_juejin_hot():
    url = "https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot"
    juejin_hot = db['juejin_hot']
    data = fetch(url, headers)
    juejin_hot.insert_one(data)
    print("juejin_hot data inserted")

def get_toutiao_hot():
    url = "https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc"
    toutiao_hot = db['toutiao_hot']
    data = fetch(url, headers)
    toutiao_hot.insert_one(data)
    print("toutiao_hot data inserted")

def get_ssp_hot():
    url = "https://sspai.com/api/v1/article/tag/page/get?limit=50&tag=%E7%83%AD%E9%97%A8%E6%96%87%E7%AB%A0"
    shaoshupai_hot = db['shaoshupai_hot']
    data = fetch(url, headers)
    shaoshupai_hot.insert_one(data)
    print("shaoshupai data inserted")

def insert_data(collection_name, data):
    """Generic insert helper: stamp the payload with insert_time and store it in the named collection."""
    if not data:
        print(f"{collection_name} data fetch failed")
        return
    collection = db[collection_name]
    data['insert_time'] = time.time()
    collection.insert_one(data)
    print(f"{collection_name} data inserted")

if __name__ == "__main__":
    try:
        try:
            get_toutiao_hot()
        except Exception as e:
            print(f"Error fetching toutiao_hot data: {e}")
        try:
            get_juejin_hot()
        except Exception as e:
            print(f"Error fetching juejin_hot data: {e}")
        try:
            get_tieba_topic()
        except Exception as e:
            print(f"Error fetching tieba_topic data: {e}")
        try:
            get_wx_read_rank()
        except Exception as e:
            print(f"Error fetching wx_read_rank data: {e}")
        try:
            get_zhihu_hot_data()
        except Exception as e:
            print(f"Error fetching zhihu_hot data: {e}")
        try:
            get_weibo_data()
        except Exception as e:
            print(f"Error fetching weibo data: {e}")
        try:
            get_ssp_hot()
        except Exception as e:
            print(f"Error fetching shaoshupai_hot data: {e}")
        try:
            get_douyin_hot_data()
        except Exception as e:
            print(f"Error fetching douyin_hot data: {e}")
        try:
            get_bilibili_hot_data()
        except Exception as e:
            print(f"Error fetching bilibili_hot data: {e}")

        # Newer insertion path: each insert gets its own try-except.
        def safe_insert(collection_name, data_func):
            try:
                insert_data(collection_name, data_func())
            except Exception as e:
                print(f"Error inserting {collection_name} data: {e}")

        # Insert data through the safe_insert helper.
safe_insert("pengpai", get_pengpai_hot)
safe_insert("crypto_coin", get_crypto_price)
safe_insert("3dm", get_3dm_data)
safe_insert("36kr", get_36kr_data)
safe_insert("52pj", fetch_52pj_data)
safe_insert("acfun", get_acfun_data)
safe_insert("anquanke", get_anquanke_data)
safe_insert("baidu_hot_search", get_baidu_data)
safe_insert("baijingchuhai", get_baijingchuhai_data)
safe_insert("csdn", get_csdn_data)
safe_insert("dianshangbao", get_dianshangbao_data)
safe_insert("diyicaijing", get_diyicaijing_data)
safe_insert("dongchedi", get_dongchedi_hot_search)
safe_insert("douban_movie", get_douban_movie_data)
safe_insert("freebuf", get_freebuf_data)
safe_insert("github", get_github_data)
safe_insert("google_search", get_googlesearch_data)
safe_insert("hupu", get_hupu_data)
safe_insert("huxiu", get_huxiu_data)
safe_insert("ithome", get_ithome_data)
safe_insert("openeye", get_openeye_data)
safe_insert("kanxue", get_kanxue_data)
safe_insert("kuandaishan", get_kuandaishan_data)
safe_insert("pmcaff", get_pmcaff_data)
safe_insert("qichezhijia", get_qichezhijia_data)
safe_insert("qidian", get_rank_list)
safe_insert("shuimu", get_shuimu_data)
safe_insert("sina", get_sina_data)
safe_insert("sina_sport", get_sina_sport_data)
safe_insert("sina_news", get_sina_news)
safe_insert("taipingyang", get_taipingyang_data)
safe_insert("taptap", get_taptap_data)
safe_insert("tencent_news", get_tencent_data)
safe_insert("woshipm", get_woshipm_data)
# safe_insert("xueqiu", get_xueqiu_data)
safe_insert("yiche", get_yiche_data)
safe_insert("youshedubao", get_youshedubao_data)
safe_insert("youxiputao", get_youxiputao_data)
safe_insert("zhanku", get_zhanku_data)
safe_insert("zongheng", get_zongheng_data)
# safe_insert("coolan", get_cool)
safe_insert("hacknews", get_hacker_news)
safe_insert("historytoday", get_history_today)
safe_insert("wallstreetcn", get_wallstreetcn_data)
safe_insert("readhub", get_readhub_data)
safe_insert("needknow", get_ithome_needknow_data)
safe_insert("v2ex", get_v2ex_data)
safe_insert("hostloc", get_hostloc_data)
safe_insert("linuxdo", get_linuxdo_data)
safe_insert("nodeseek", get_nodeseek_data)
safe_insert("wsj", get_wsj_data)
safe_insert("nytimes", get_nytimes_data)
safe_insert("bloomberg", get_bloomberg_data)
safe_insert("ft", get_ft_data)
safe_insert("yna", get_yna_data)
safe_insert("asahi", get_asahi_data)
safe_insert("nhk", get_nhk_data)
safe_insert("foxnews", get_foxnews_data)
safe_insert("rt", get_rt_data)
safe_insert("lemonde", get_lemonde_data)
safe_insert("dailymail", get_dailymail_data)
safe_insert("mumsnet", get_mumsnet_data)
safe_insert("newsau", get_newsau_data)
safe_insert("fivech", get_5ch_data)
safe_insert("dzenru", get_dzenru_data)
finally:
client.close()