-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathhtml.py
111 lines (88 loc) · 3.08 KB
/
html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
import sys
from html.entities import name2codepoint
from html.parser import HTMLParser
if sys.version_info < (3, 8):
from cgi import escape as html_escape
else:
from html import escape as html_escape
class HTMLParseError(Exception):
    """Error raised when the HTML parser reports a problem with its input."""
# http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class HTMLTagStripper(HTMLParser):
    """
    HTML parser that accumulates the text of a document and drops its markup.

    Text found inside any tag listed in ``strippedTags`` is discarded.
    Each ``<br>`` and ``<div>`` start tag contributes a single space so
    that the words around it do not run together.

    Feed markup with ``feed()``, call ``close()`` to flush anything the
    parser is still buffering, then read the result with ``get_data()``.
    """

    # Tags whose whole content is dropped from the output.
    strippedTags = ["title", "script", "style"]

    def __init__(self):
        # HTMLParser.__init__ calls reset() itself, so no explicit reset
        # is needed before it.
        super().__init__()
        # Text fragments collected so far, in document order.
        self.fed = []
        # True while the parser is inside one of the stripped tags.
        self.strip_tag_contents_mode = False

    def handle_starttag(self, tag, attrs):
        """Record a space for br/div and enter strip mode for stripped tags."""
        tag = tag.lower()

        # Replace <br>, <div> tags by spaces
        if tag in ("br", "div"):
            self.fed.append(" ")

        # Everything up to the matching end tag is thrown away.
        if tag in self.strippedTags:
            self.strip_tag_contents_mode = True

    def handle_endtag(self, tag):
        """Leave strip mode, but only when a stripped tag is being closed."""
        # Depending on the Python version the parser may report tags nested
        # inside <title>; closing one of those (e.g. </b>) must not switch
        # the text collection back on before </title> is reached.
        if tag.lower() in self.strippedTags:
            self.strip_tag_contents_mode = False

    def handle_data(self, d):
        """Keep a text chunk unless it belongs to a stripped tag."""
        if not self.strip_tag_contents_mode:
            self.fed.append(d)

    if (3,) <= sys.version_info < (3, 10):

        def error(self, message):
            """Turn a parser error report into an HTMLParseError."""
            raise HTMLParseError(message)

    def handle_entityref(self, d):
        """Append the character for a named entity; ignore unknown names."""
        # Per the html.parser documentation this hook is not called when
        # convert_charrefs is true (entities then arrive through
        # handle_data already decoded).
        try:
            val = chr(name2codepoint[d])
        except KeyError:
            return
        self.fed.append(val)

    def get_data(self) -> str:
        """Return all text collected so far as one string."""
        return "".join(self.fed)
def strip_tags(html: str) -> str:
    """
    Return textual content of HTML.

    Remove title, script and style altogether. Replace br and div
    with space. Expand HTML entities.

    This function can potentially raise HTMLParseError if fed invalid html.
    You are responsible for handling it in the calling function.
    """
    s = HTMLTagStripper()
    s.feed(html)
    # feed() can hold back trailing text (for example after an "&" that is
    # not followed by ";" or whitespace) in case more input arrives;
    # close() forces the parser to process whatever is still buffered.
    s.close()
    return s.get_data()
# https://djangosnippets.org/snippets/19/
# Tokens of plain text that need special treatment when rendered as HTML.
# The alternatives are tried in this order at every position.
re_string = re.compile(
    # characters that must be HTML-escaped
    r"(?P<htmlchars>[<&>])"
    # run of spaces/tabs at the start of a line
    r"|(?P<space>^[ \t]+)"
    # line break
    r"|(?P<lineend>\n)"
    # http/https/ftp URL preceded by line start or whitespace and followed
    # by whitespace or line end (that trailing part is the last group)
    r"|(?P<protocol>(^|\s)((https?|ftp)://.*?))(\s|$)",
    re.DOTALL | re.MULTILINE | re.IGNORECASE | re.UNICODE,
)
def plaintext2html(text: str, tabstop: int = 4) -> str:
    """
    Convert plain text into a simple HTML fragment.

    The characters <, & and > are escaped. Spaces and tabs at the start
    of a line become non-breaking space entities, a tab counting for
    ``tabstop`` of them. Line breaks become <br>. http, https and ftp
    URLs are turned into links. Chunks of text separated by an empty
    line are each wrapped in <p>...</p> and the paragraphs are joined
    with a newline.

    ``text`` must use bare newlines as line endings: an AssertionError
    is raised if it contains a carriage return.
    """
    assert "\r" not in text, "newlines not normalized"

    # Explicit entity so the indentation survives HTML whitespace
    # collapsing and any later whitespace normalisation of the markup.
    nbsp = "&nbsp;"

    def do_sub(m):
        # Exactly one of the named alternatives of re_string matched.
        parts = m.groupdict()
        if parts["htmlchars"]:
            return html_escape(parts["htmlchars"], quote=False)
        if parts["lineend"]:
            return "<br>"
        if parts["space"]:
            # Tabs first: their expansion contains no plain space, so the
            # second replace only touches the original spaces.
            return parts["space"].replace("\t", nbsp * tabstop).replace(" ", nbsp)

        # Otherwise the URL alternative matched. The named group may start
        # with the single whitespace character that preceded the URL; keep
        # it in front of the link rather than inside the href.
        matched = m.group("protocol")
        url = matched.lstrip()
        prefix = matched[: len(matched) - len(url)]

        # The last group is the whitespace (or empty end of line) that
        # terminated the URL; it was consumed by the match, so put it back.
        trailer = m.groups()[-1]
        if trailer in ("\n", "\r"):
            trailer = "<br>"

        # The URL is copied from the input text: escape it so quotes or
        # angle brackets in it cannot break out of the attribute or inject
        # markup.
        safe_url = html_escape(url, quote=True)
        return f'{prefix}<a href="{safe_url}">{safe_url}</a>{trailer}'

    paragraphs = text.split("\n\n")
    return "\n".join(f"<p>{re_string.sub(do_sub, p)}</p>" for p in paragraphs)