-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsimple_tokenizer.py
53 lines (45 loc) · 1.46 KB
/
simple_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
A VERY simple tokenizer. Just tokenizes on space and a few common punctuation marks.
Works for simple applications and some languages though.
"""
def remove_spaces(line):
    """
    Collapse runs of spaces in a string.

    Leading whitespace is stripped first. After that, a space is dropped
    whenever the character right after it is another space or a newline.
    The final character is always kept, whatever it is.

    param: line  the string to clean up
    :returns the cleaned string
    """
    stripped = line.lstrip()
    last_index = len(stripped) - 1
    # The last-position check comes first so stripped[pos + 1] is never
    # evaluated past the end of the string.
    kept = [
        char
        for pos, char in enumerate(stripped)
        if pos == last_index
        or char != " "
        or stripped[pos + 1] not in (" ", "\n")
    ]
    return "".join(kept)
def simple_tokenize(file_name):
    """
    Tokenize the content of a file with only one sentence in it.

    The whole file is read as one string and passed to
    simple_tokenize_list as a one-element list, so the content is
    handled as a single line.

    param: file_name  path of the text file to read
    :returns the result of simple_tokenize_list for that single string
    """
    # The context manager closes the file even if read() raises.
    with open(file_name) as f:
        text = f.read()
    return simple_tokenize_list([text])
def simple_tokenize_list(lines):
    """
    Tokenize every non-blank string in a list of strings.

    Each string is stripped, tabs become spaces, and the listed
    punctuation marks are padded with a space on both sides so they
    become separate tokens. Repeated spaces are then collapsed with
    remove_spaces and the result is split on single spaces.
    Strings that are empty after stripping are skipped.

    param: lines  a list of strings
    :returns a list with one list of tokens per non-blank input string
    """
    punctuation = ('.', ',', '!', '?', '%', ":", ";", '"', "'", "-", "/", "(", ")")
    # One translation table does the tab replacement and the punctuation
    # padding in a single pass over the string.
    table = {ord(mark): " " + mark + " " for mark in punctuation}
    table[ord("\t")] = " "

    tokenized = []
    for raw in lines:
        text = raw.strip()
        if not text:
            continue
        padded = text.translate(table).strip()
        tokenized.append(remove_spaces(padded).split(" "))
    return tokenized
#simple_tokenize("2000_more_other.txt", "2000_more_other_tokenized.txt")