forked from TomoeMami/S1PlainTextBackup

autorun.py
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup, UnicodeDammit
import re
import time
import sys
import io
import os
import json
#2022-09-19
# old_number = 2095520
# old_number = 2095520++int((int(time.time())-1664434472)/86400)*175
#2021-01-01
# dead_number = 1980710
def parse_title(title):
    # Strip HTML tags and square brackets, then replace characters that
    # are illegal in Windows filenames with their fullwidth equivalents
    # so the title can safely be used as a file name.
    titles = re.sub(r'<.+?>', '', str(title))
    titles = re.sub(r'[\]\[]', '', titles)
    titles = re.sub(r'\|', '|', titles)
    titles = re.sub(r'/', '/', titles)
    titles = re.sub(r'\\', '\', titles)
    titles = re.sub(r':', ':', titles)
    titles = re.sub(r'\*', '*', titles)
    titles = re.sub(r'\?', '?', titles)
    titles = re.sub(r'"', '"', titles)
    titles = re.sub(r'<', '<', titles)
    titles = re.sub(r'>', '>', titles)
    titles = re.sub(r'\.\.\.', '…', titles)
    titles = re.sub(r'\n', ' ', titles)
    titles = '[' + titles + ']'
    return titles
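# Example (hypothetical input): parse_title('<a>Foo: Bar?</a>')
# -> '[Foo: Bar?]' (tags stripped, ':' and '?' widened, wrapped in brackets)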
def parse_html(html, threadict):
    # Scan one forum index page and record threads that span more than
    # one page and were replied to within the last 24 hours.
    # soup = BeautifulSoup(html, from_encoding="utf-8", features="lxml")
    soup = BeautifulSoup(html, 'html.parser')
    namelist = soup.find_all(name="tbody")
    for i in namelist:
        threadids = re.search(r'normalthread_\d{7}', str(i))
        if threadids:
            levels = re.search(r'\d{1,5}</a></span>', str(i))
            replycounts = i.find_all("a", "xi2")
            replycount = re.sub(r'<.+?>', '', str(replycounts[0]))
            titles = i.find_all("a", "s xst")
            title = re.sub(r'<.+?>', '', str(titles[0]))
            threadid = re.sub(r'normalthread_', '', str(threadids.group(0)))
            if levels:
                # decide here whether the thread should be picked up
                # if int(threadid) > old_number:
                level = re.sub(r'</a></span>', '', str(levels.group(0)))
                lastreplytime = re.findall(r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}', str(i))
                replytime = time.mktime(time.strptime(str(lastreplytime[1]), "%Y-%m-%d %H:%M"))
                if int(level) > 1 and (int(time.time()) - replytime) < 86400:
                    # if (int(time.time()) - replytime) < 43200:
                    # replied to within the last day
                    threadict[threadid] = {}
                    threadict[threadid]['replytime'] = int(replytime)
                    threadict[threadid]['title'] = parse_title(title)
                    # the index page's reply count is 1 less than the actual floor count
                    threadict[threadid]['replycount'] = int(replycount) + 1
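# Each threadict entry ends up shaped like this (keys taken from the
# assignments above):
#   {'<threadid>': {'replytime': <unix ts>, 'title': '[...]', 'replycount': <int>}}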
if __name__ == '__main__':
    blacklist = [2104652, 2056385, 1842868, 334540, 1971007, 1915305, 2023780, 2085181, 2105283, 2045161, 2106883, 2068300, 2098345, 2120403, 2119905, 2096884, 2143473, 2100013, 2171889]
    high_level = 10
    # switch stdout's default encoding to UTF-8
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
    # the cookie string copied from the browser after logging in
    with open('/home/riko/s1cookie-1.txt', 'r', encoding='utf-8') as f:
        cookie_str1 = f.read()
    # cookie_str1 = os.getenv('S1_COOKIE')
    cookie_str = repr(cookie_str1)[1:-1]
    # parse the cookie string into a dict for later use; the file is expected
    # to hold a browser-style cookie string, e.g. "key1=value1; key2=value2"
    cookies = {}
    for line in cookie_str.split(';'):
        key, value = line.split('=', 1)
        cookies[key] = value
    # set the request headers
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    forumdict = {'外野': '75', '虚拟主播区专楼': '151', '游戏区': '4', '漫区': '6', '手游专楼': '135'}
    # forumdict = {'外野': '75', '游戏区': '4', '漫区': '6'}
    rootdir = "./"
    with open(rootdir + 'RefreshingData.json', "r", encoding='utf-8-sig') as f:
        thdata = json.load(f)
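    # RefreshingData.json maps thread ids to records; the shape below is
    # inferred from the fields read and written later in this script:
    #   {'<threadid>': {'totalreply': int, 'title': str, 'newtitle': str,
    #                   'lastedit': int, 'category': str,
    #                   'active': bool, 'update': bool}}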
    # for i in thdata.keys():
    #     thdata[i]['update'] = False
    session = requests.session()
    for k in forumdict.keys():
        threadict = {}
        # only scan the first page of each forum
        for i in range(1, 2):
            RURL = 'https://bbs.saraba1st.com/2b/forum-' + forumdict[k] + '-' + str(i) + '.html'
            s1 = session.get(RURL, headers=headers, cookies=cookies)
            # print(s1.encoding)
            # s1 = requests.get(RURL, headers=headers)
            # s1.encoding = 'utf-8'
            # s1encoding = s1.encoding
            # data = UnicodeDammit(s1.content)
            parse_html(s1.text, threadict)
        # with open(rootdir + 'RefreshingData.json', "r", encoding='utf-8') as f:
        #     thdata = json.load(f)
        ids = thdata.keys()
        for l in threadict.keys():
            if int(l) not in blacklist:
                if l in ids:
                    if thdata[l]['totalreply'] < threadict[l]['replycount']:
                        print(str(thdata[l]['totalreply']) + " : " + str(threadict[l]['replycount']))
                        thdata[l]['active'] = True
                        thdata[l]['update'] = True
                        thdata[l]['lastedit'] = threadict[l]['replytime']
                else:
                    thdata[l] = {}
                    thdata[l]['totalreply'] = 0
                    thdata[l]['title'] = threadict[l]['title']
                    thdata[l]['newtitle'] = threadict[l]['title']
                    thdata[l]['lastedit'] = threadict[l]['replytime']
                    thdata[l]['category'] = k
                    thdata[l]['active'] = True
                    thdata[l]['update'] = True
                    print('add:' + l)
    with open(rootdir + 'RefreshingData.json', "w", encoding='utf-8-sig') as f:
        f.write(json.dumps(thdata, indent=2, ensure_ascii=False))
    # drop stale entries: keep a thread if it is active, was edited within
    # the last 14 days, or is long enough to count as a high-level thread
    activethdata = {}
    with open(rootdir + 'RefreshingData.json', "r", encoding='utf-8-sig') as f:
        thdata = json.load(f)
    for i in thdata.keys():
        # if thdata[i]['active'] or (int(time.time()) - thdata[i]['lastedit']) < 1209600 or (int(i) > old_number) or (thdata[i]['totalreply'] // 30 > high_level):
        if thdata[i]['active'] or (int(time.time()) - thdata[i]['lastedit']) < 1209600 or (thdata[i]['totalreply'] // 40 > high_level):
            # cache threads for 14 days
            activethdata[i] = thdata[i]
    with open(rootdir + 'RefreshingData.json', "w", encoding='utf-8-sig') as f:
        f.write(json.dumps(activethdata, indent=2, ensure_ascii=False))
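# A possible way to schedule this (hypothetical crontab entry; the script
# itself only assumes it runs at least daily, per the 24-hour reply window
# in parse_html):
#   0 */6 * * * cd /path/to/repo && python3 autorun.py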