baseline_tokenizer.py
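"""Baseline comparison of NLTK's word_tokenize with the word_tokenize provided by the
tokenizer module, run on the Brown corpus (brown.txt). The script reports timing, total
and unique token counts, and how many special characters, hyphenated words, contractions,
prefixes/abbreviations, and plain words each tokenizer produced."""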
from nltk.tokenize import word_tokenize as tokenize
from tokenizer import word_tokenize
from time import time
from os import stat
import re
def file_size(f): #Function returns file size in bytes
    fileInfo = stat(f)
    return fileInfo.st_size
def speed(nltkTime,tokenizerTime): #Function returns a tuple (faster implementation, slower implementation, difference in time taken)
    if nltkTime<tokenizerTime:
        return ("NLTK","tokenizer",abs(nltkTime-tokenizerTime))
    else:
        return ("tokenizer","NLTK",abs(nltkTime-tokenizerTime))
def num_tokens(nltkTokens,tokenizerTokens): #Function returns a tuple (implementation with more tokens, difference in number of tokens, implementation with fewer tokens)
    nltk_num = len(nltkTokens)
    tokenizer_num = len([j for i in tokenizerTokens for j in i])
    if nltk_num<tokenizer_num:
        return ("tokenizer",abs(nltk_num-tokenizer_num),"NLTK")
    else:
        return ("NLTK",abs(nltk_num-tokenizer_num),"tokenizer")
def unique_tokens(nltkTokens,tokenizerTokens): #Function returns the number of tokens unique to each implementation as (unique to tokenizer, unique to NLTK)
    tokenizer_tokens = [j for i in tokenizerTokens for j in i]
    return (len(set(tokenizer_tokens)-set(nltkTokens)),len(set(nltkTokens)-set(tokenizer_tokens)))
def special_char(nltkTokens,tokenizerTokens): #Function returns tuple containing number of special characters tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"[\W]") #Token begins with a non-word (special) character
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def hyphenated(nltkTokens,tokenizerTokens): #Function returns tuple containing number of hyphenated words tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"(?=\S*[-])([\w-]+)") #Token of word characters that contains a hyphen
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def contractions(nltkTokens,tokenizerTokens): #Function returns tuple containing number of contractions tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"(?=\S*[']([\w'])+)") #Token containing an apostrophe followed by word characters
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def prefixes(nltkTokens,tokenizerTokens): #Function returns tuple containing number of prefixes and abbreviations tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"\w+\.(?=\S)") #Word characters followed by a period and further non-space text
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def words(nltkTokens,tokenizerTokens): #Function returns tuple containing number of words tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"\b(?<!\W)\w+(?!\W)\b") #Token consisting entirely of word characters
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
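# Compare NLTK's word_tokenize with tokenizer's word_tokenize on the Brown corpus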
print ("Size of Brown corpus in bytes: ",file_size("brown.txt"))
with open("brown.txt","r") as brownFile: #Read the Brown corpus
    text = brownFile.read()
t0 = time()
nltkTokens = tokenize(text) #Tokenize with NLTK
t1 = time()
nltkTime = t1-t0
print ("Time taken by NLTK's word_tokenize to tokenize text: ",nltkTime)
print ("Number of tokens generated by NLTK's word_tokenize: ",len(nltkTokens))
t2 = time()
tokenizerTokens = word_tokenize(text) #Tokenize with tokenizer
t3 = time()
tokenizerTime = t3-t2
print ("Time taken by tokenizer's word_tokenize to tokenize text: ",tokenizerTime)
print ("Number of tokens generated by tokenizer's word_tokenize: ",len([j for i in tokenizerTokens for j in i]))
functionSpeed = speed(nltkTime,tokenizerTime)
print (functionSpeed[0],"is faster than",functionSpeed[1],"by",functionSpeed[2],"seconds")
numberOfTokens = num_tokens(nltkTokens,tokenizerTokens)
print (numberOfTokens[0],"generated",numberOfTokens[1],"more tokens than",numberOfTokens[2])
uniqueTokens = unique_tokens(nltkTokens,tokenizerTokens)
print (uniqueTokens[0],"tokens are unique to tokenizer and",uniqueTokens[1],"tokens are unique to NLTK")
numberOfSpecialChars = special_char(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfSpecialChars[0],"special charcters and tokenizer tokenized",numberOfSpecialChars[1],"special characters")
numberOfHyphenated = hyphenated(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfHyphenated[0],"hyphenated words and tokenizer tokenized",numberOfHyphenated[1],"hyphenated words")
numberOfContractions = contractions(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfContractions[0],"contractions and tokenizer tokenized",numberOfContractions[1],"contractions")
numberOfPrefixes = prefixes(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfPrefixes[0],"prefixes/abbreviations and tokenizer tokenized",numberOfPrefixes[1],"prefixes/abbreviations")
numberOfWords = words(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfWords[0],"words and tokenizer tokenized",numberOfWords[1],"words")
tokenizer_tokens = [j for i in tokenizerTokens for j in i]
uniqueTokenizerTokens = set(tokenizer_tokens)-set(nltkTokens)
print ("Among the additional tokens generated by tokenizer",special_char(uniqueTokenizerTokens,[[]])[0],"were special characters")
print ("Among the additional tokens generated by tokenizer",hyphenated(uniqueTokenizerTokens,[[]])[0],"were hyphenated words")
print ("Among the additional tokens generated by tokenizer",contractions(uniqueTokenizerTokens,[[]])[0],"were contractions")
print ("Among the additional tokens generated by tokenizer",prefixes(uniqueTokenizerTokens,[[]])[0],"were prefixes/abbreviations")
print ("Among the additional tokens generated by tokenizer",words(uniqueTokenizerTokens,[[]])[0],"were words")