forked from NoCLin/rss-monitor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
73 lines (53 loc) · 1.95 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from bs4 import BeautifulSoup
from simplediff import html_diff
def html_diff2(original, edited):
    """Return a character-level diff of two strings as inline HTML.

    Unchanged characters are emitted as-is, inserted runs are wrapped in
    ``<ins>...</ins>`` and deleted runs in ``<del>...</del>``. Consecutive
    characters sharing the same status are merged into one run so the
    output contains one tag pair per changed run rather than per character.

    The text is NOT HTML-escaped; callers that feed untrusted input into an
    HTML context must escape it themselves.

    Args:
        original: The old text.
        edited: The new text.

    Returns:
        The diff as an HTML fragment; an empty string when both inputs
        are empty.
    """
    import difflib
    from itertools import groupby

    # Compare character by character: no word segmentation and no
    # paragraph splitting. Each entry produced by Differ has the form
    # "<flag> <element>" where flag is " ", "+", "-" or "?".
    entries = (
        entry
        for entry in difflib.Differ().compare(original, edited)
        # "?" lines are intraline hints, not part of either input; they do
        # not occur for single-character elements but are ignored for safety.
        if not entry.startswith("?")
    )

    pieces = []
    # groupby collapses adjacent entries with the same flag into one run and
    # simply yields nothing for an empty diff (the old code indexed diff[0]
    # and crashed when both inputs were empty).
    for flag, run in groupby(entries, key=lambda entry: entry[0]):
        text = "".join(entry[2:] for entry in run)
        if flag == " ":
            pieces.append(text)
        elif flag == "+":
            pieces.append(f"<ins>{text}</ins>")
        else:
            pieces.append(f"<del>{text}</del>")
    return "".join(pieces)
def html_diff_to_markdown(txt):
    """Convert ``<ins>``/``<del>`` diff markup into bold Markdown markers.

    Insertions become `` **[+...+]** `` and deletions `` **[-...-]** ``.

    Args:
        txt: Text containing ``<ins>``/``</ins>`` and ``<del>``/``</del>`` tags.

    Returns:
        The text with every such tag replaced by its Markdown marker.
    """
    # Each marker carries a space on its outer side; without it Markdown
    # parsers tend to mis-handle the bold run.
    substitutions = (
        ("<ins>", " **[+"),
        ("</ins>", "+]** "),
        ("<del>", " **[-"),
        ("</del>", "-]** "),
    )
    converted = txt
    for tag, marker in substitutions:
        converted = converted.replace(tag, marker)
    return converted
def shorten(s, length):
    """Truncate *s* to at most *length* characters, marking any cut with "...".

    Args:
        s: The string to shorten.
        length: Maximum number of characters of *s* to keep.

    Returns:
        *s* unchanged when it already fits within *length* characters;
        otherwise its first *length* characters followed by ``"..."``.
    """
    # Use <= so a string of exactly `length` characters, from which nothing
    # would be removed, does not get a misleading ellipsis appended.
    if len(s) <= length:
        return s
    return s[:length] + "..."
def html_to_text(html):
    """Return the text content of an HTML string with markup removed.

    Args:
        html: HTML source to parse.

    Returns:
        The result of BeautifulSoup's ``get_text()`` on the parsed document.
    """
    # Parse with the "html.parser" backend, then extract the text.
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text()
if __name__ == '__main__':
    # Manual demo: diff two versions of a notice title with both diff
    # implementations and show the HTML and Markdown renderings.
    old = "【全部公告本科生院 研究生院关于2020-2021学年秋冬学期课程调整安排的通知】 各学院(系),行政各部门,各校区管委会,直属各单位,各任课教师、各位同学:"
    new = "【全部公告研究生院、本科生院 关于2020-2021学年秋冬学期课程调整安排的通知】 各学院(系),行政各部门,各校区管委会,直属各单位,各任课教师、各位同学:"

    simplediff_html = html_diff(old, new)
    char_diff_html = html_diff2(old, new)
    simplediff_markdown = html_diff_to_markdown(simplediff_html)
    char_diff_markdown = html_diff_to_markdown(char_diff_html)

    # Inputs first, then the HTML diffs, then their Markdown forms.
    for line in (
        old,
        new,
        simplediff_html,
        char_diff_html,
        simplediff_markdown,
        char_diff_markdown,
    ):
        print(line)