-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathparse_detail.py
109 lines (99 loc) · 3.71 KB
/
parse_detail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import httpx
import json
import asyncio
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
import base64
import re
import markdownify
from bs4 import BeautifulSoup
async def remove_img_tags(html_content):
    """Return *html_content* with every ``<img>`` element removed.

    Falsy input (``None``, an empty string) is handed back untouched.
    Anything else is parsed with BeautifulSoup's ``html.parser`` backend,
    each ``img`` tag is decomposed, and the re-serialised document is
    returned as a string.
    """
    if html_content:
        document = BeautifulSoup(html_content, 'html.parser')
        for image in document.find_all('img'):
            image.decompose()
        return str(document)
    return html_content
async def parse_detail(needKnowList):
    """Run the matching site parser over each entry of *needKnowList*.

    Entries are visited one at a time, in order. An entry that has a
    ``hot_url`` key is passed to the first parser whose domain marker occurs
    in that URL; entries without the key, or whose URL contains none of the
    markers, are skipped. Each parser works on the entry object itself, and
    the same list object that was passed in is returned.
    """
    # Checked top to bottom; the first marker found in the URL wins.
    site_parsers = (
        ("thepaper.cn", parse_pengpai),
        ("36kr.com", parse_36kr),
        ("ithome.com", parse_ithome),
        ("sspai.com", parse_sspai),
        ("wallstreetcn.com", parse_awatmt),
    )
    for entry in needKnowList:
        if "hot_url" not in entry:
            continue
        link = entry['hot_url']
        for marker, parser in site_parsers:
            if marker in link:
                await parser(entry)
                break
    return needKnowList
async def fetch(url):
    """GET *url* with a short-lived ``httpx.AsyncClient`` and return the body text.

    A fresh client is created for the single request and closed again before
    the function returns. The response status code is not inspected.
    """
    client = httpx.AsyncClient()
    async with client:
        reply = await client.get(url)
        body = reply.text
    return body
async def parse_pengpai(needKnow):
    """Fetch a thepaper.cn article and store its body as Markdown.

    The page at ``needKnow['hot_url']`` is downloaded and the article
    container is looked up: first a ``div`` whose class starts with
    ``index_cententWrap``, otherwise a ``div`` sibling that follows a
    ``div`` whose class starts with ``header_videoWrap``. Images are
    stripped from the container, the remaining HTML is converted to Markdown,
    and the stripped result is written to ``needKnow['content']``.

    Args:
        needKnow: Mapping with a ``hot_url`` entry; it is updated in place.

    Returns:
        The same ``needKnow`` object. It comes back unchanged (no ``content``
        written) when the selector lookup raises or when neither container
        is present in the page.
    """
    page = await fetch(needKnow['hot_url'])
    soup = BeautifulSoup(page, 'html.parser')
    try:
        # NOTE(review): "cententWrap" is kept exactly as the selector was
        # written; presumably it mirrors the site's own class name - confirm
        # before "correcting" the spelling.
        container = soup.select_one("div[class^='index_cententWrap']")
        if container is None:
            container = soup.select_one("div[class^='header_videoWrap'] ~ div")
    except Exception:
        # Best effort: a failed lookup leaves the entry as it was. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return needKnow
    if container is None:
        # Nothing to convert; handing None on to markdownify would fail.
        return needKnow
    html = await remove_img_tags(str(container))
    needKnow['content'] = markdownify.markdownify(html).strip()
    return needKnow
async def parse_36kr(needKnow):
    """Fetch a 36kr.com article and store its body as Markdown.

    The page at ``needKnow['hot_url']`` is searched for a
    ``window.initialState`` assignment flagged ``"isEncrypt":true``. Its
    ``state`` string is base64-decoded, decrypted with AES in ECB mode using
    a hard-coded key, unpadded, and parsed as JSON. The article HTML is read
    from ``articleDetail -> articleDetailData -> data -> widgetContent``,
    images are stripped, and the Markdown conversion is written to
    ``needKnow['content']``.

    Args:
        needKnow: Mapping with a ``hot_url`` entry; it is updated in place.

    Returns:
        The same ``needKnow`` object. It comes back unchanged when the page
        does not contain the encrypted state assignment.

    Raises:
        KeyError: If the decrypted state lacks the expected nested keys.
        ValueError: If decoding, unpadding or JSON parsing of the state fails.
    """
    page = await fetch(needKnow['hot_url'])
    found = re.search('window.initialState={"state":"(.*?)","isEncrypt":true}', page)
    if found is None:
        # No encrypted state in this page; indexing an empty findall() result
        # would raise IndexError, so leave the entry untouched instead.
        return needKnow

    # ljust guarantees a 16-byte key even if the literal were shorter.
    key = "efabccee-b754-4c".encode('utf-8').ljust(16, b'\0')
    cipher = AES.new(key, AES.MODE_ECB)
    ciphertext = base64.b64decode(found.group(1))
    plaintext = unpad(cipher.decrypt(ciphertext), AES.block_size)
    state = json.loads(plaintext.decode('utf-8'))

    html = state['articleDetail']['articleDetailData']['data']['widgetContent']
    html = await remove_img_tags(html)
    needKnow['content'] = markdownify.markdownify(html).strip()
    return needKnow
async def parse_ithome(needKnow):
    """Fetch an ithome.com article and store its body as Markdown.

    The page at ``needKnow['hot_url']`` is downloaded and the first element
    matching ``.news-content`` is taken as the article body. Images are
    stripped, the HTML is converted to Markdown, and the stripped result is
    written to ``needKnow['content']``.

    Args:
        needKnow: Mapping with a ``hot_url`` entry; it is updated in place.

    Returns:
        The same ``needKnow`` object. It comes back unchanged (no ``content``
        written) when the page has no ``.news-content`` element.
    """
    page = await fetch(needKnow['hot_url'])
    container = BeautifulSoup(page, 'html.parser').select_one(".news-content")
    if container is None:
        # Nothing to convert; handing None on to markdownify would fail.
        return needKnow
    html = await remove_img_tags(str(container))
    needKnow['content'] = markdownify.markdownify(html).strip()
    return needKnow
async def parse_sspai(needKnow):
    """Fetch an sspai.com article and store its body as Markdown.

    The page at ``needKnow['hot_url']`` is downloaded and the first element
    matching ``div.content`` is taken as the article body. Images are
    stripped, the HTML is converted to Markdown, and the stripped result is
    written to ``needKnow['content']``.

    Args:
        needKnow: Mapping with a ``hot_url`` entry; it is updated in place.

    Returns:
        The same ``needKnow`` object. It comes back unchanged (no ``content``
        written) when the page has no ``div.content`` element.
    """
    page = await fetch(needKnow['hot_url'])
    container = BeautifulSoup(page, 'html.parser').select_one("div.content")
    if container is None:
        # Nothing to convert; handing None on to markdownify would fail.
        return needKnow
    html = await remove_img_tags(str(container))
    needKnow['content'] = markdownify.markdownify(html).strip()
    return needKnow
async def parse_awatmt(needKnow):
    """Fetch a wallstreetcn.com article through its JSON API and store it as Markdown.

    The article id is taken from ``needKnow['hot_url']`` (last path segment,
    query string dropped) and used to build the ``api-one-wscn.awtmt.com``
    article endpoint. The response is parsed as JSON, the HTML under
    ``data -> content`` has its images stripped, and the Markdown conversion
    is written to ``needKnow['content']``. The same ``needKnow`` object is
    returned.
    """
    # Drop everything from the first "?" on, then keep the final path segment.
    path_only = needKnow['hot_url'].partition("?")[0]
    article_id = path_only.rsplit("/", 1)[-1]
    api_url = f"https://api-one-wscn.awtmt.com/apiv1/content/articles/{article_id}?extract=0"
    payload = json.loads(await fetch(api_url))
    html = await remove_img_tags(payload['data']['content'])
    needKnow['content'] = markdownify.markdownify(html).strip()
    return needKnow