-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_text.py
37 lines (29 loc) · 1.05 KB
/
parse_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
import urllib
# Default markers used by parse_from_url to cut the body out of a text:
# the body starts at START_STRING and stops before END_STRING.
START_STRING = "CHAPTER 1"
END_STRING = "End of Project Gutenberg's"
# Default heading word; parse_from_url removes "<PART> <number>" headings.
PART = "CHAPTER"
# Plain-text file 174 on Project Gutenberg (presumably "The Picture of
# Dorian Gray", going by the constant's name).
DORIAN_GRAY = "http://www.gutenberg.org/files/174/174.txt"
def parse_from_url(url,
                   start_str=START_STRING,
                   end_str=END_STRING,
                   unit=PART):
    '''
    Download a text and flatten it into one ' <eos> '-separated string.

    The text between the first occurrence of start_str and the last
    occurrence of end_str is kept, unit headings (unit followed by a space
    and a number) are removed, line breaks are replaced by spaces, and the
    result is split into sentences on '.', '!', '?' and '"'. Every
    non-empty sentence is stripped and its first character lower-cased.

    :url: str, url for the text to parse
    :start_str: str, Starting sentence for the text; when it does not
        occur, the text is kept from its beginning
    :end_str: str, End sentence for the text; when it does not occur, the
        text is kept up to its end
    :unit: str, single unit to divide the text (e.g. chapters); it is
        inserted into a regular expression as-is, so regex metacharacters
        in it are interpreted as such
    :returns: str, the sentences joined with ' <eos> '
    '''
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        raw = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # find/rfind return -1 for a missing marker; used directly as slice
    # bounds, -1 would silently keep only the last character (start) or
    # drop the last character (end), so fall back to the full extent.
    start = raw.find(start_str)
    if start == -1:
        start = 0
    end = raw.rfind(end_str)
    if end == -1:
        end = len(raw)
    raw = raw[start:end]

    # Drop headings such as "CHAPTER 12".
    raw = re.sub(unit + ' [0-9]+', '', raw)
    # Join wrapped lines; accept both CRLF and bare LF line endings.
    raw = re.sub(r'\r?\n', ' ', raw)

    pieces = [piece.strip() for piece in re.split(r'[.!?"]', raw)]
    return ' <eos> '.join(s[:1].lower() + s[1:] for s in pieces if s)