# author: Andrew Walker, Ian Laird
# file name: AspectDetector.py
# class: NLP
# instructor: Dr Lin
# due date: May 10, 2019
# date last modified: May 10, 2019
import nltk
from nltk.collocations import *
from CorpusReader_TFIDF import CorpusReader_TFIDF
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict
import pickle
from collections import Counter
import string
class AspectDetector:

    TF_IDF_ASPECT_PERCENT = .01
    NUMBER_OF_TWO_GRAMS_TO_CONSIDER = 10

    # constructor
    #
    # param:
    #   trainingCorpus: the corpus to train TF-IDF on
    #   reviewCorpus: the corpus containing the reviews to analyze
    #
    # This method creates and initializes the AspectDetector
    def __init__(self, trainingCorpus, reviewCorpus):
        print("Creating AspectDetector ... ")
        self.trainingCorpus = trainingCorpus
        self.reviewCorpus = reviewCorpus
        self.potentialAspects = None
        self.reviewWords = None
        try:
            # try to reuse aspects cached from a previous run
            with open('data/potentialAspects.dat', 'rb') as handle:
                self.potentialAspects = pickle.load(handle)
            with open('data/wordsFromReviewCorpus.dat', 'rb') as handle2:
                self.reviewWords = set(pickle.load(handle2))
            # the cache is only valid if it was built from the same review corpus
            if self.reviewWords != set(self.reviewCorpus.words()):
                self.reviewWords = None
        except FileNotFoundError:
            self.potentialAspects = None
        if self.potentialAspects is None or self.reviewWords is None:
            print(" ... Creating TF-IDF CorpusReader")
            # train the tf-idf model on the training corpus
            self.tf_idf_Model = CorpusReader_TFIDF(trainingCorpus, stemmer=None)
        print("Done creating AspectDetector")

    # run
    #
    # param: none
    #
    # This method finds the potential aspects from the reviewCorpus
    def run(self):
        # run only if the aspects could not be loaded from a file or if the
        # cached file was built from a different review corpus
        if self.potentialAspects is None or self.reviewWords is None:
            self.reviewWords = self.reviewCorpus.words()
            # get the tf-idf vectors for all documents in the review corpus
            vectors = list()
            for file in self.reviewCorpus.fileids():
                newVec = self.tf_idf_Model.td_idf_new(self.reviewCorpus.words(file))
                vectors.append(newVec)
            # sum the tf-idf weight of each dimension across all documents
            sumVect = [0.0] * len(vectors[0])
            for vector in vectors:
                count = 0
                for value in vector:
                    sumVect[count] += value
                    count += 1
            # associate each word in the corpus with its accumulated tf-idf value
            averageVect = dict(zip(self.tf_idf_Model.tf_idf_dim(), sumVect))
            # we now have an ordering of all the words in the corpus
            biggestWords = list(sorted(averageVect.keys(), key=averageVect.get, reverse=True))
            # take the highest-weighted words as the preliminary aspects,
            # which will be narrowed down further below
            # right now this keeps the top 1%, although the cutoff can be tuned
            self.potentialAspects = biggestWords[:int(AspectDetector.TF_IDF_ASPECT_PERCENT * len(self.tf_idf_Model.tf_idf_dim()))]
            # now consider collocations of the corpus
            bigram_measures = nltk.collocations.BigramAssocMeasures()
            finder = BigramCollocationFinder.from_words(self.reviewCorpus.words())
            # find the most common bigrams of the corpus
            bigrams = finder.nbest(bigram_measures.pmi, AspectDetector.NUMBER_OF_TWO_GRAMS_TO_CONSIDER)
            # we now have both the "best" bigrams and the best unigrams (from the tf-idf model)
            # if both halves of a strong bigram are also strong unigrams
            # (e.g. "battery" and "life" are both popular unigrams and "battery life"
            # is a common bigram), drop the word that appears later in the bigram
            # n-grams longer than 2 could also be considered, at a likely performance cost
            for bigram_1, bigram_2 in bigrams:
                if bigram_1 in self.potentialAspects and bigram_2 in self.potentialAspects:
                    self.potentialAspects.remove(bigram_2)
            # cache the preliminary aspects and the corpus they were built from
            with open('data/potentialAspects.dat', 'wb') as handle:
                pickle.dump(self.potentialAspects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open('data/wordsFromReviewCorpus.dat', 'wb') as handle2:
                pickle.dump(self.reviewWords, handle2, protocol=pickle.HIGHEST_PROTOCOL)
        # POS-tag every sentence in the review corpus
        tagged_sents = []
        sents = sent_tokenize(self.reviewCorpus.raw())
        for sent in sents:
            tokens = [e1.lower() for e1 in word_tokenize(sent)]
            tagged_sent = nltk.pos_tag(tokens, tagset='universal')
            tagged_sents.append(tagged_sent)
        # count how often each token is seen with each POS tag
        gram_dict = {}
        for tagged_sent in tagged_sents:
            for gram, pos in tagged_sent:
                if gram not in gram_dict:
                    gram_dict[gram] = defaultdict(int)
                gram_dict[gram][pos] += 1
        # keep only the aspects whose most frequent POS tag is NOUN;
        # iterate over a copy so items can be removed from the list safely
        for gram in list(self.potentialAspects):
            if gram in gram_dict:
                pos_dict = gram_dict[gram]
                max_pos = max(pos_dict, key=pos_dict.get)
                if max_pos != "NOUN":
                    self.potentialAspects.remove(gram)
            else:
                self.potentialAspects.remove(gram)
        # drop aspects that occur more than three times in the raw review text
        tokens = [e1.lower() for e1 in word_tokenize(self.reviewCorpus.raw())]
        freq_raw = Counter(tokens)
        for aspect in list(self.potentialAspects):
            if freq_raw[aspect] > 3:
                self.potentialAspects.remove(aspect)
        # drop aspects that contain punctuation or digits
        nonAllowed = string.punctuation + string.digits
        for aspect in list(self.potentialAspects):
            if any(ch in aspect for ch in nonAllowed):
                self.potentialAspects.remove(aspect)
        return self.potentialAspects
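

# A minimal usage sketch, not part of the original module: it assumes the
# reviews live in a hypothetical 'reviews/' directory of plain-text files,
# that any NLTK-style corpus exposing words()/fileids()/raw() (here, brown)
# can serve as the training corpus, and that a 'data/' cache directory exists.
if __name__ == "__main__":
    from nltk.corpus import brown
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    reviewCorpus = PlaintextCorpusReader('reviews/', r'.*\.txt')
    detector = AspectDetector(brown, reviewCorpus)
    # run() returns the list of candidate aspect words
    print(detector.run())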