-
Notifications
You must be signed in to change notification settings - Fork 5
/
bte.py
184 lines (153 loc) · 5.58 KB
/
bte.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python
"""
This module implements Finn's BTE (Body Text Extraction) algorithm for
extracting the main body text of a web page and avoiding the surrounding
irrelevant information. The description of the algorithm can be found
in A. Finn, N. Kushmerick, and B. Smyth. Fact or Fiction: Content
classification for digital libraries. In DELOS Workshop: Personalisation
and Recommender System in Digital Libraries, 2001.
Python implementation by Jan Pomikalek <xpomikal@fi.muni.cz>
"""
import re
def html2text(html_text, preserve_par=False, preserve_head_list_par=False):
    """
    Convert HTML to plain text with boilerplate removed.

    If preserve_par is True, paragraph mark-up is preserved. If
    preserve_head_list_par is True, paragraph mark-up is preserved and
    headers and list items are marked with <h> and <l> tags respectively.
    """
    tokens = tokenise(preclean(html_text))
    start, end = bte(tokens)
    body_tokens = find_paragraphs(tokens[start:end + 1],
                                  preserve_head_list_par)
    keep_markers = preserve_par or preserve_head_list_par
    # Group word tokens into paragraph blocks; every marker token
    # (<p>/<h>/<l>, value <= 0) flushes the current block.
    blocks = []
    current = []
    for token in body_tokens:
        if token_value(token) > 0:
            current.append(token)
        else:
            if current:
                blocks.append(" ".join(current))
            current = []
            if keep_markers:
                # keep the marker as the first "word" of the next block
                current.append(token)
    if current:
        blocks.append(" ".join(current))
    # paragraphs are separated by newlines in the output
    return "\n".join(blocks)
def preclean(html_text):
    """
    HTML preprocessing -- strip everything outside <body>, empty out
    <script> and <style> elements, and replace common HTML entities.

    Note: raw strings are used for the patterns; the original used plain
    strings containing '\\s', an invalid escape sequence (SyntaxWarning
    on modern Python).
    """
    # keep only the contents of <body>...</body>
    cleaned_text = re.sub(r'^.*<body(\s+[^>]*)?>', '', html_text,
                          flags=re.S | re.I)
    cleaned_text = re.sub(r'</body>.*$', '', cleaned_text,
                          flags=re.S | re.I)
    # empty out scripts; with re.S the dot already matches newlines, so
    # the original '(.|\s)*?' alternation is redundant (and slower)
    cleaned_text = re.sub(r'<script(\s+[^>]*)?>.*?</script>',
                          '<script></script>', cleaned_text,
                          flags=re.S | re.I)
    # empty out styles
    cleaned_text = re.sub(r'<style(\s+[^>]*)?>.*?</style>',
                          '<style></style>', cleaned_text,
                          flags=re.S | re.I)
    # replace the most common HTML entities
    return html_entities(cleaned_text)
def html_entities(html_text):
    """
    Substitute the most commonly used HTML entities with their plain-text
    equivalents: &quot; -> '"', &nbsp; -> ' ', &#039; -> "'".
    """
    # The previous code substituted each character with itself (a no-op):
    # the entity names had been rendered down to their literal characters.
    # Only these three entities are handled deliberately -- decoding e.g.
    # &lt; to '<' would make later tokenisation mistake text for a tag.
    html_text = html_text.replace('&quot;', '"')
    html_text = html_text.replace('&nbsp;', ' ')
    html_text = html_text.replace('&#039;', "'")
    return html_text
def tokenise(html_text):
    """
    Tokenise an HTML document into a sequence of HTML tags and strings of
    non-whitespace characters (words).
    """
    # [^>] already matches whitespace, so the original ([^>]|\s)
    # alternation collapses to a single character class; with no capture
    # group, findall returns the matched strings directly (the original
    # needed a comprehension to pull out group 1). Raw string avoids the
    # invalid '\s' escape in a plain string literal.
    return re.findall(r'<[^>]+>|[^\s<]+', html_text)
def bte(tokens):
    """
    BTE algorithm. Expects a sequence of HTML tags and words as input
    parameter. Outputs a pair of indices which indicate the beginning and
    end of the main body.
    """
    # Collapse the token stream into runs of equal token value; each run
    # is recorded as (index of its last token, sum of its token values).
    runs = []
    last_value = None
    run_total = 0
    for idx, tok in enumerate(tokens):
        value = token_value(tok)
        # value is always +1 or -1, so a truthiness test on last_value is
        # equivalent to "not the first token"
        if last_value is not None and value != last_value:
            runs.append((idx - 1, run_total))
            run_total = 0
        run_total += value
        last_value = value
    runs.append((len(tokens) - 1, run_total))
    # Examine every contiguous range of runs and keep the range whose
    # summed value is maximal (word tokens score +1, tags -1).
    best_score = 0
    best_start = 0
    best_end = 0
    for i in range(len(runs)):
        score = runs[i][1]
        if score > best_score:
            best_score = score
            best_start = runs[i - 1][0] + 1 if i > 0 else 0
            best_end = runs[i][0]
        for j in range(i + 1, len(runs)):
            score += runs[j][1]
            if score > best_score:
                best_score = score
                best_start = runs[i - 1][0] + 1 if i > 0 else 0
                best_end = runs[j][0]
    return (best_start, best_end)
def token_value(token):
    """Return -1 if the token is an HTML tag, +1 otherwise (a word)."""
    return -1 if token.startswith('<') else 1
def find_paragraphs(tokens, tag_h_l=False):
    """
    Mark paragraph blocks with <p>. If tag_h_l is set to True, headers and
    list items are also detected and marked with <h> and <l> respectively.

    Expects the token sequence produced by tokenise(); returns a new token
    list in which paragraph-breaking tags become <p>/<h>/<l> markers and
    all other tags are dropped. Note that only opening tags break a
    paragraph: for a closing tag like '</p>' the extracted name is '/p',
    which matches none of the tag sets.
    """
    # sets give O(1) membership tests (the original used lists)
    PAR_FIND_TAGS = {'p', 'div', 'hr', 'blockquote', 'table'}
    PAR_REPLACE_TAG = '<p>'
    HEADER_FIND_TAGS = {'h1', 'h2', 'h3'}
    HEADER_REPLACE_TAG = '<h>'
    LIST_FIND_TAGS = {'li'}
    LIST_REPLACE_TAG = '<l>'
    # compiled once and hoisted out of the loop; raw string avoids the
    # invalid '\s' escape of the original plain-string pattern
    tag_name_re = re.compile(r'^<([^\s>]+)')
    result = [PAR_REPLACE_TAG]
    in_paragraph = False
    for token in tokens:
        if not token.startswith('<'):
            # word token: part of the current paragraph
            result.append(token)
            in_paragraph = True
        else:
            # a tag only emits a marker if it interrupts a running paragraph
            if not in_paragraph:
                continue
            m = tag_name_re.search(token)
            if not m:
                continue
            tag = m.group(1).lower()
            if tag in PAR_FIND_TAGS:
                result.append(PAR_REPLACE_TAG)
                in_paragraph = False
            if tag in HEADER_FIND_TAGS:
                result.append(HEADER_REPLACE_TAG if tag_h_l
                              else PAR_REPLACE_TAG)
                in_paragraph = False
            if tag in LIST_FIND_TAGS:
                result.append(LIST_REPLACE_TAG if tag_h_l
                              else PAR_REPLACE_TAG)
                in_paragraph = False
    return result