-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfix_html.py
30 lines (24 loc) · 927 Bytes
/
fix_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#coding: utf-8
import re;
# Compiled pattern that finds URLs embedded in free text. Group 1 spans the
# whole URL. The structure appears to follow John Gruber's "liberal" URL
# regex (TODO confirm provenance).
#
# The literal parentheses and square brackets MUST stay backslash-escaped:
# without the escapes the final character class closes early, the
# "balanced parentheses" alternatives can match the empty string, and
# trailing punctuation such as a sentence-ending "." is swallowed into the
# match.
match_urls = re.compile(
    # Start of a URL: a scheme ("http://", "mailto:x"), a "www." style
    # host prefix, or a bare "domain.tld/".
    r"""((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)"""
    # Body: runs of characters that are not whitespace, parentheses or
    # angle brackets, or parenthesised groups nested up to two levels.
    r"""(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"""
    # End: either a parenthesised group or one character that is not
    # whitespace or common punctuation, so "see http://x.com/a." leaves the
    # final "." outside the match.
    r"""(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
    re.DOTALL,
)
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text.

    Feed markup in with ``feed()`` and read the accumulated text back with
    ``get_data()``. Only text delivered through ``handle_data`` is kept.
    """

    def __init__(self):
        # Initialise through the base class instead of calling reset()
        # directly, so every attribute the parser sets up in its own
        # constructor exists before feed() is used. The explicit unbound
        # call (rather than super()) also works when HTMLParser is an
        # old-style class.
        HTMLParser.__init__(self)
        # Text fragments, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into one string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of ``html`` with the markup removed.

    A fresh ``MLStripper`` is created for every call, so no text carries
    over from one call to the next.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
def fix_docs(docs):
    """Clean a collection of documents for further processing.

    Each document is trimmed of surrounding whitespace (documents that are
    empty after trimming are dropped), has every ``<br/>`` turned into a
    newline, has its HTML tags stripped, and has every URL replaced with
    the literal token ``HTTP``.

    Returns a new list of cleaned strings in the original order.
    """
    # Stage 1: trim, drop blanks, and turn <br/> into real line breaks
    # before the tags are removed.
    trimmed_docs = []
    for raw in docs:
        trimmed = raw.strip()
        if trimmed:
            trimmed_docs.append(trimmed.replace('<br/>', '\n'))

    # Stage 2: remove the remaining markup.
    plain_docs = list(map(strip_tags, trimmed_docs))

    # Stage 3: collapse every URL to a fixed placeholder token.
    return [match_urls.sub('HTTP', text) for text in plain_docs]