This repository has been archived by the owner on Sep 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 19
/
FakeUA.py
203 lines (184 loc) · 6.37 KB
/
FakeUA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import json
import logging
import math
import os
import re
import sys
import time
from pprint import pprint
from urllib.parse import quote, urljoin
import asks
import trio
from pyquery import PyQuery as jq
from stem import Signal
from stem.connection import connect
from termcolor import colored
from FakeUAdb import UAS
asks.init('trio')  # bind asks' HTTP client to the trio event loop

# Default forged request headers: pretend to be desktop Chrome making an XHR call.
FAKEHEADER = {
    "x-requested-with": "XMLHttpRequest",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "referer": "http://www.mafengwo.cn/",
    "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01"
}
POOLS = {}     # scraped data pool: {category: {name: {'url', 'nums', 'UA_list'}}}
TASKS = set()  # pending page-fetch tasks, encoded as "category__name__pageurl"
DATANUMS = 0   # pages completed so far
MAXNUMS = 0    # pages still queued
LIMIT = 10     # concurrency cap (connections and in-flight tasks)
PERPAGE = 50   # results per listing page on the target site

spiderSession = asks.Session(connections=LIMIT)
spiderSession.headers = FAKEHEADER
# NOTE(review): removed a bare `spiderSession.timeout_manager` attribute access
# that was a statement with no effect.

logging.basicConfig(
    format="[%(asctime)s] >>> %(levelname)s %(name)s: %(message)s", level=logging.INFO)  # log format and level
loger = logging.getLogger('FakeUA')  # module logger

# Connect to the local Tor control port; circuit rotation (NewID) depends on it.
try:
    controller = connect()
    controller.authenticate()
except Exception as e:
    loger.error(colored('请检测您的Tor端口', 'red'))
    sys.exit(1)  # safer than bare exit(), which is only provided by the site module
async def getTypesL1():
    """
    Fetch the top-level UA categories from the explore page and spawn
    one getTypesL2 crawl task per category, each filling its own slot
    in the global POOLS dict.
    """
    url = "https://developers.whatismybrowser.com/useragents/explore/"
    resp = await spiderSession.get(url)
    async with trio.open_nursery() as nursery:
        for anchor in jq(resp.text)("#listing-by-field-name > li > h2 > a").items():
            category = anchor.text().strip().replace(' ', '_').lower()
            POOLS[category] = {}
            target_url = urljoin(url, anchor.attr('href'))
            nursery.start_soon(getTypesL2, POOLS[category], category, target_url)
async def getTypesL2(target, types, href):
    """
    Fetch one category listing page and register its second-level entries.

    Args:
        target: dict inside POOLS to fill ({name: {'url', 'nums', 'UA_list'}}).
        types: normalized first-level category key.
        href: absolute URL of the category listing page.

    Queues one TASKS entry per result page (PERPAGE rows each).
    """
    loger.info(colored(f'fetching {href}', 'yellow'))
    resp = await spiderSession.get(href)
    # NOTE(review): the original wrapped this loop in a trio nursery that never
    # spawned any task; the empty nursery was a no-op and has been removed.
    for item in jq(resp.text)("body > div.content-base > section > div > table > tbody > tr").items():
        name = item(
            'td:nth-child(1)>a').text().strip().replace(' ', '_').lower()
        target[name] = {}
        url = urljoin(href, item('td:nth-child(1)>a').attr('href'))
        nums = int(item('td:nth-child(2)').text().strip())
        target[name]['url'] = url
        target[name]['nums'] = nums
        target[name]['UA_list'] = []
        # One crawl task per result page; key format is "types__name__pageurl".
        for page in range(1, math.ceil(nums / PERPAGE) + 1):
            TASKS.add('__'.join([
                types,
                name,
                f"{url}{page}"
            ]))
async def getUAs():
    """
    Crawl scheduler: repeatedly drain TASKS, fanning every queued page
    fetch out to getUAsitem under a shared capacity limiter. Failed
    tasks stay in TASKS (see getUAsitem) and are retried on the next
    pass; the loop ends when TASKS is empty.
    """
    # NOTE(review): docstring moved before `global` — placed after it, the
    # string literal was a no-op statement, not the function docstring.
    global MAXNUMS
    limit = trio.CapacityLimiter(LIMIT)
    while TASKS:
        MAXNUMS = len(TASKS)  # was len(list(TASKS)) — the copy was redundant
        loger.info(colored(f'当前任务量:{MAXNUMS}', 'red'))
        await trio.sleep(1)
        async with trio.open_nursery() as nursery:
            for item in list(TASKS):  # snapshot: tasks remove themselves on success
                nursery.start_soon(getUAsitem, item, limit)
async def getUAsitem(detals, limit):
    """
    Fetch and parse a single result page.

    Each table row becomes a UA record appended to the in-memory pool;
    the page batch is then saved to the DB. The task is removed from
    TASKS only on success — on any failure it stays queued for retry
    and the Tor circuit is rotated via NewID().

    Args:
        detals: task key "types__name__url" produced by getTypesL2.
        limit: shared trio.CapacityLimiter bounding concurrency.
    """
    global DATANUMS
    global MAXNUMS
    # maxsplit=2 keeps the URL intact even if it happens to contain "__".
    types, name, url = detals.split('__', 2)
    target = POOLS[types][name]['UA_list']
    async with limit:
        try:
            loger.info(colored(f'fetching -> {url}', 'yellow'))
            resp = await spiderSession.get(url, timeout=5, retries=3)
            LocalDatas = []
            for item in jq(resp.text)(
                    "body > div.content-base > section > div > table > tbody > tr").items():
                datas = {
                    'useragent': item('td.useragent').text(),
                    'software': item('td:nth-child(2)').attr('title'),
                    'engine': item('td:nth-child(3)').text(),
                    'types': item('td:nth-child(4)').text(),
                    'popularity': item('td:nth-child(5)').text()
                }
                # Progress line: [pages done / pages left / rows in this list].
                loger.info(
                    '[' +
                    colored(DATANUMS, 'green') +
                    '/' +
                    colored(MAXNUMS, 'yellow') +
                    '/' +
                    colored(str(len(target)), 'blue') +
                    ']' +
                    colored('->', 'blue').join([
                        colored(types, 'red'),
                        colored(name, 'red'),
                        colored(datas["useragent"], 'green')
                    ]))
                target.append(datas)
                LocalDatas.append(datas)
            SaveToDB(LocalDatas, UAS)
            TASKS.remove(detals)
            DATANUMS += 1
            MAXNUMS -= 1
        except KeyboardInterrupt:
            raise
        except Exception as e:
            # Leave the task in TASKS so the scheduler retries it; rotate circuit.
            loger.error(colored(e, 'red'))
            NewID()
def NewID():
    """Ask the Tor controller for a new identity (fresh exit circuit) and log it."""
    controller.signal(Signal.NEWNYM)
    loger.error(colored('切换线路', 'red'))
def SaveJson(datas, filename):
    """
    Dump *datas* to *filename* as pretty-printed, non-ASCII-preserving JSON.

    No-op when *datas* is empty/falsy, so an aborted run never clobbers
    a previous dump with an empty file.
    """
    if not datas:
        return
    # Bug fix: the log message previously printed a literal placeholder
    # instead of the actual target filename.
    loger.info(colored(f'文件存储至 {filename}', 'yellow'))
    # utf-8 is required: ensure_ascii=False emits raw non-ASCII characters,
    # which can fail under a locale-dependent default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(datas, f, indent=4, ensure_ascii=False)
def MakeChunk(datas, length=100):
    """Yield successive slices of *datas*, each at most *length* items long."""
    for start in range(0, len(datas), length):
        yield datas[start:start + length]
def SaveToDB(datas, model):
    """
    Bulk-insert UA records into the peewee *model* in batches of 100.

    Args:
        datas: list of row dicts matching the model's fields.
        model: peewee model class exposing insert_many().
    """
    data_source = datas
    if data_source:
        loger.info(colored(f'数据存储至DB中 [{len(data_source)}]', 'yellow'))
        # Bug fix: the original inserted the FULL data_source once per chunk,
        # duplicating every row ceil(len/100) times. Insert each chunk instead.
        for chunk in MakeChunk(data_source):
            model.insert_many(chunk).execute()
def main():
    """
    Entry point: build the task list (retrying with a fresh Tor circuit
    on failure), then run the crawl scheduler.
    """
    bootstrapped = False
    while not bootstrapped:
        try:
            trio.run(getTypesL1)
            bootstrapped = True
        except Exception:
            NewID()
    trio.run(getUAs)
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl quietly; the finally block still dumps results.
        pass
    except Exception as e:
        loger.error(colored(e, 'red'))
    finally:
        # Always persist whatever was scraped, even after an error or interrupt.
        SaveJson(POOLS, 'POOLS.json')
    # SaveToDB(POOLS, UAS)