# author: Ian Laird
# file name: CorpusReader_TFIDF.py
# class: NLP
# instructor: Dr Lin
# due date: February 11, 2019
# date last modified: February 12, 2019
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer, SnowballStemmer
from scipy import spatial
import math
import copy
# CorpusReader_TFIDF
# This class adds tf-idf functionality to a given corpus reader.
# It computes and stores a tf-idf vector for every document in the corpus.
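#
# tf-idf background (the standard definition, for reference): the weight of a
# term t in a document d is tf(t, d) * idf(t); under the 'base' setting below,
# idf(t) = log2(N / df(t)), where N is the number of documents in the corpus
# and df(t) is the number of documents containing t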
class CorpusReader_TFIDF:
    # custom constructor
    #
    # param:
    #   corpus: the corpus that documents will be grabbed from
    #   tf: indicates how the term frequency will be calculated
    #       raw -- the raw count is used
    #       log -- the count is log normalized
    #       binary -- the score is binary:
    #           1 indicates that the term is present in the document
    #           0 indicates that the term is not present in the document
    #   idf: indicates how the inverse document frequency will be calculated
    #       base: the standard inverse document frequency
    #       smooth: the frequency is smoothed
    #       prob: probabilistic inverse document frequency
    #   stopword: where the stopwords come from
    #       standard -- use the default English stopwords
    #       none -- do not remove stopwords
    #       anything else -- a file name to read the stopwords from
    #   stemmer: the stemmer for the words
    #       the default is the Porter stemmer; the strings 'porter' and
    #       'snowball' are also accepted
    #   ignorecase: indicates whether case should be ignored
    #       no -- do not ignore case
    #       yes -- ignore case
    #
    # This method computes a vector for each document in the corpus and stores it
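    #
    # a minimal construction sketch (illustrative; assumes the NLTK
    # 'inaugural' and 'stopwords' data packages are downloaded):
    #   from nltk.corpus import inaugural
    #   reader = CorpusReader_TFIDF(inaugural, tf='log', idf='smooth', stemmer='snowball')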
    def __init__(self, corpus, tf='raw', idf='base', stopword='standard', stemmer=PorterStemmer(), ignorecase='yes'):
        if isinstance(stemmer, str):
            if stemmer.lower() == 'porter':
                stemmer = PorterStemmer()
            elif stemmer.lower() == 'snowball':
                stemmer = SnowballStemmer("english")
            else:
                raise ValueError("unknown stemmer option: " + stemmer)
wordStorage = dict()
termFrequency = dict()
fileFrequencyStorage = dict()
        dictClone = dict()
self.corpus = corpus
self.tf = tf
self.idf = idf
self.stopword = stopword
self.actualStopWords = set()
self.stemmer = stemmer
self.ignorecase = ignorecase
self.vectors = dict()
self.dimensions = list()
self.wordDocumentFrequency = dict()
# depending on preferences read in the stopwords
# if 'none' do not remove any stopwords
        if self.stopword != 'none':
            if self.stopword == 'standard':
self.actualStopWords = set(stopwords.words('english'))
else:
#read in the stopwords from a file
with open(stopword) as file:
fileContents = file.read()
self.actualStopWords = set(nltk.word_tokenize(fileContents))
self.actualStopWords = self.lowerCaseCheckConversion(self.actualStopWords)
# where all words in any document will be stored
self.allWords = list()
#calculate the vectors for the documents
for fileId in corpus.fileids():
tempWords = corpus.words(fileId)
filteredWords = self.lowerCaseCheckConversion(tempWords)
#now filter out the stopwords
filteredWords = self.filterWords(filteredWords)
filteredWords = self.stemWords(filteredWords)
self.allWords.extend(filteredWords)
#now need to store the read words so that the vector can be calculated later
wordStorage[fileId] = filteredWords
self.allWords = set(self.allWords)
# initially each term is present in no documents
for word in self.allWords:
self.wordDocumentFrequency[word] = 0
termFrequency[word] = 0
self.dimensions.append(word)
dictClone = copy.deepcopy(termFrequency)
# iterate through every file and its contents
for file, wordList in wordStorage.items():
# count up how many times each term occurs in each file
for word in wordList:
termFrequency[word] = termFrequency[word] + 1
for word, wordFrequency in termFrequency.items():
if wordFrequency > 0:
self.wordDocumentFrequency[word] = self.wordDocumentFrequency[word] + 1
fileFrequencyStorage[file] = termFrequency
termFrequency = copy.deepcopy(dictClone)
# now actually calculate the tf-idf values
for fileId in corpus.fileids():
vector = list()
frequencyMap = fileFrequencyStorage[fileId]
for word in self.allWords:
                # calculate the tf-idf value using both the term frequency of the term
                # and the number of documents it appears in
vector.append(self.calculateTfIdfValue(frequencyMap[word], self.wordDocumentFrequency[word]))
del fileFrequencyStorage[fileId]
self.vectors[fileId] = vector
# calculateTfIdfValue
#
    # generates the tf-idf value for a given tf and idf. The formula varies
    # according to how the tf and idf preferences were set in the constructor.
#
# param:
# tf: the term frequency
# idf: the inverse document frequency
#
# return:
    #   the calculated tf value times the calculated idf value
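    #
    # worked example (hypothetical numbers): in a 4-document corpus with
    # tf='log' and idf='base', a term occurring 8 times in a document and
    # appearing in 2 of the 4 documents scores
    #   (1 + log2(8)) * log2(4 / 2) = 4 * 1 = 4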
def calculateTfIdfValue(self, tf, idf):
idf_val = 1
tf_val = 1
try:
            if tf == 0:
tf_val = 0
else:
if self.tf == 'raw':
tf_val = tf
elif self.tf == 'log':
tf_val = 1 + math.log(tf, 2)
elif self.tf == 'binary':
tf_val = 1
if self.idf == 'base':
idf_val = math.log(len(self.corpus.fileids()) / idf, 2)
elif self.idf == 'smooth':
idf_val = math.log(1 + (len(self.corpus.fileids()) / idf), 2)
elif self.idf == 'prob':
idf_val = math.log((len(self.corpus.fileids()) - idf) / idf, 2)
return tf_val * idf_val
except ZeroDivisionError:
return 0.0
except ValueError:
return 0.0
# fileids
# returns all fileids of the contained corpus
# input: none
def fileids(self):
return self.corpus.fileids()
# raw
# returns the raw text of the indicated fileids in the corpus
# input:
    #   fileids: a list of the fileids whose raw text to return
def raw(self, fileids = []):
if len(fileids) == 0:
return self.corpus.raw()
return self.corpus.raw(fileids)
# words
# returns all words in the desired fileids
# input:
    #   fileids: a list of the fileids to return words from
def words(self, fileids=[]):
if len(fileids) == 0:
return self.corpus.words()
return self.corpus.words(fileids)
# open
# opens the desired fileid
# input:
# fileid: the file to open
def open(self, fileid):
return self.corpus.open(fileid)
# abspath
# returns the absolute path of the file
def abspath(self, fileid):
return self.corpus.abspath(fileid)
# tf_idf
    # returns the tf-idf vectors for the specified documents
    # param:
    #   filelist: a fileid or a list of fileids; if empty, all vectors are returned
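    #
    # e.g. (illustrative inaugural-corpus fileids):
    #   reader.tf_idf()                          # every document vector
    #   reader.tf_idf('2009-Obama.txt')          # the vector for one fileid
    #   reader.tf_idf(['1789-Washington.txt', '2009-Obama.txt'])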
def tf_idf(self, filelist=[]):
# see if all documents are desired
if isinstance(filelist, list) and len(filelist) == 0:
return [x for x in self.vectors.values()]
# see if only one fileId is given
if isinstance(filelist, str):
return self.vectors[filelist]
        # if a list of fileids is given, return the vectors
        # for the corresponding files in the order given
        return [self.vectors[x] for x in filelist]
# tf_idf_dim
# returns a list showing what word each dimension of the vector corresponds to
def tf_idf_dim(self):
"""show in what order the words are in the vector"""
return self.dimensions
# td_idf_new
#
# creates a vector corresponding to the given document
#
# param:
# words: the words that are to be treated as a document
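    #
    # e.g. (illustrative): reader.td_idf_new(['the', 'economy', 'grew'])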
def td_idf_new(self, words = []):
"""finds a new vector based on the existing idf values"""
return self.getVector(self.stemWords(self.filterWords(self.lowerCaseCheckConversion(words))))
    # cosine_sim
    # returns the cosine similarity of the two given files
def cosine_sim(self, fileId=[]):
"""finds the cosine similarity of two files"""
if len(fileId) == 2:
return 1 - spatial.distance.cosine(self.vectors[fileId[0]], self.vectors[fileId[1]])
        else:
            raise ValueError("cosine_sim expects exactly two fileids")
# cosine_sim_new
# returns the cosine similarity between a new document and one of the documents in the corpus
# param:
# words: the words that are to be treated as a document
# fileid: the existing document that it is to be compared with
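    #
    # e.g. (illustrative words and fileid):
    #   reader.cosine_sim_new(['economy', 'jobs', 'growth'], '2009-Obama.txt')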
def cosine_sim_new(self, words = [] , fileid = 0):
new_list = self.td_idf_new(words)
list_2 = self.vectors[fileid]
return 1 - spatial.distance.cosine(new_list, list_2)
# getVector
# gets the vector corresponding to given list of words
    def getVector(self, words):
        # need to find the frequency of every word in words
        freq = Counter(words)
        # treat words like a document
        returnVector = list()
        for word in self.allWords:
            # calculate the tf-idf value using both the term frequency of the term
            # and the number of documents it appears in
            returnVector.append(self.calculateTfIdfValue(freq[word], self.wordDocumentFrequency.get(word, 0)))
        return returnVector
# filterWords
# filters all stop words out of the collection of words
# return: the filtered words
def filterWords(self, words):
if isinstance(words, set):
return {x for x in words if x not in self.actualStopWords}
return [x for x in words if x not in self.actualStopWords]
    # lowerCaseCheckConversion
    # converts the words to lower case when ignorecase is 'yes'
def lowerCaseCheckConversion(self, words):
if self.ignorecase == 'yes':
if isinstance(words, set):
return {x.lower() for x in words}
else:
return [x.lower() for x in words]
return words
    # stemWords
    # stems all words in the collection using the configured stemmer
def stemWords(self, words):
if self.stemmer is None:
return words
if isinstance(words, set):
return {self.stemmer.stem(x) for x in words}
return [self.stemmer.stem(x) for x in words]
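
# A minimal usage sketch (not part of the original assignment); assumes the
# NLTK data packages 'inaugural' and 'stopwords' have been downloaded, e.g.
# via nltk.download('inaugural') and nltk.download('stopwords').
if __name__ == "__main__":
    from nltk.corpus import inaugural
    # build log-tf / smoothed-idf vectors for every inaugural address
    reader = CorpusReader_TFIDF(inaugural, tf='log', idf='smooth')
    ids = reader.fileids()
    print(ids[:3])                    # a few document ids
    print(reader.tf_idf_dim()[:10])   # the words behind the first ten dimensions
    # cosine similarity between the first two documents in the corpus
    print(reader.cosine_sim([ids[0], ids[1]]))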