baseline_tokenizer.py
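"""Baseline comparison of NLTK's word_tokenize with the word_tokenize provided by the
tokenizer module, run on the Brown corpus (brown.txt). The script reports timing, total
and unique token counts, and how many special characters, hyphenated words, contractions,
prefixes/abbreviations, and plain words each tokenizer produced."""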
from nltk.tokenize import word_tokenize as tokenize
from tokenizer import word_tokenize
from time import time
from os import stat
import re
def file_size(f): #Function returns file size in bytes
    fileInfo = stat(f)
    return fileInfo.st_size
def speed(nltkTime,tokenizerTime): #Function returns a tuple (faster implementation, slower implementation, difference in time taken)
    if nltkTime<tokenizerTime:
        return ("NLTK","tokenizer",abs(nltkTime-tokenizerTime))
    else:
        return ("tokenizer","NLTK",abs(nltkTime-tokenizerTime))
def num_tokens(nltkTokens,tokenizerTokens): #Function returns a tuple (implementation with more tokens, difference in number of tokens, implementation with fewer tokens)
    nltk_num = len(nltkTokens)
    tokenizer_num = len([j for i in tokenizerTokens for j in i])
    if nltk_num<tokenizer_num:
        return ("tokenizer",abs(nltk_num-tokenizer_num),"NLTK")
    else:
        return ("NLTK",abs(nltk_num-tokenizer_num),"tokenizer")
def unique_tokens(nltkTokens,tokenizerTokens): #Function returns the number of tokens unique to each implementation as (unique to tokenizer, unique to NLTK)
    tokenizer_tokens = [j for i in tokenizerTokens for j in i]
    return (len(set(tokenizer_tokens)-set(nltkTokens)),len(set(nltkTokens)-set(tokenizer_tokens)))
def special_char(nltkTokens,tokenizerTokens): #Function returns tuple containing number of special characters tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"[\W]") #Token begins with a non-word (special) character
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def hyphenated(nltkTokens,tokenizerTokens): #Function returns tuple containing number of hyphenated words tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"(?=\S*[-])([\w-]+)") #Token of word characters that contains a hyphen
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def contractions(nltkTokens,tokenizerTokens): #Function returns tuple containing number of contractions tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"(?=\S*[']([\w'])+)") #Token containing an apostrophe followed by word characters
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def prefixes(nltkTokens,tokenizerTokens): #Function returns tuple containing number of prefixes and abbreviations tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"\w+\.(?=\S)") #Word characters followed by a period and further non-space text
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
def words(nltkTokens,tokenizerTokens): #Function returns tuple containing number of words tokenized by each tokenizer
    tokenizerTokensList = [j for i in tokenizerTokens for j in i]
    nltkCount = 0
    tokenizerCount = 0
    regexp = re.compile(r"\b(?<!\W)\w+(?!\W)\b") #Token consisting entirely of word characters
    for i in nltkTokens:
        if regexp.match(i):
            nltkCount+=1
    for i in tokenizerTokensList:
        if regexp.match(i):
            tokenizerCount+=1
    return (nltkCount,tokenizerCount)
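# Compare NLTK's word_tokenize with tokenizer's word_tokenize on the Brown corpus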
print ("Size of Brown corpus in bytes: ",file_size("brown.txt"))
with open("brown.txt","r") as brownFile: #Read the Brown corpus
    text = brownFile.read()
t0 = time()
nltkTokens = tokenize(text) #Tokenize with NLTK
t1 = time()
nltkTime = t1-t0
print ("Time taken by NLTK's word_tokenize to tokenize text: ",nltkTime)
print ("Number of tokens generated by NLTK's word_tokenize: ",len(nltkTokens))
t2 = time()
tokenizerTokens = word_tokenize(text) #Tokenize with tokenizer
t3 = time()
tokenizerTime = t3-t2
print ("Time taken by tokenizer's word_tokenize to tokenize text: ",tokenizerTime)
print ("Number of tokens generated by tokenizer's word_tokenize: ",len([j for i in tokenizerTokens for j in i]))
functionSpeed = speed(nltkTime,tokenizerTime)
print (functionSpeed[0],"is faster than",functionSpeed[1],"by",functionSpeed[2],"seconds")
numberOfTokens = num_tokens(nltkTokens,tokenizerTokens)
print (numberOfTokens[0],"generated",numberOfTokens[1],"more tokens than",numberOfTokens[2])
uniqueTokens = unique_tokens(nltkTokens,tokenizerTokens)
print (uniqueTokens[0],"tokens are unique to tokenizer and",uniqueTokens[1],"tokens are unique to NLTK")
numberOfSpecialChars = special_char(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfSpecialChars[0],"special charcters and tokenizer tokenized",numberOfSpecialChars[1],"special characters")
numberOfHyphenated = hyphenated(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfHyphenated[0],"hyphenated words and tokenizer tokenized",numberOfHyphenated[1],"hyphenated words")
numberOfContractions = contractions(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfContractions[0],"contractions and tokenizer tokenized",numberOfContractions[1],"contractions")
numberOfPrefixes = prefixes(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfPrefixes[0],"prefixes/abbreviations and tokenizer tokenized",numberOfPrefixes[1],"prefixes/abbreviations")
numberOfWords = words(nltkTokens,tokenizerTokens)
print ("NLTK tokenized",numberOfWords[0],"words and tokenizer tokenized",numberOfWords[1],"words")
tokenizer_tokens = [j for i in tokenizerTokens for j in i]
uniqueTokenizerTokens = set(tokenizer_tokens)-set(nltkTokens)
print ("Among the additional tokens generated by tokenizer",special_char(uniqueTokenizerTokens,[[]])[0],"were special characters")
print ("Among the additional tokens generated by tokenizer",hyphenated(uniqueTokenizerTokens,[[]])[0],"were hyphenated words")
print ("Among the additional tokens generated by tokenizer",contractions(uniqueTokenizerTokens,[[]])[0],"were contractions")
print ("Among the additional tokens generated by tokenizer",prefixes(uniqueTokenizerTokens,[[]])[0],"were prefixes/abbreviations")
print ("Among the additional tokens generated by tokenizer",words(uniqueTokenizerTokens,[[]])[0],"were words")