-
Notifications
You must be signed in to change notification settings - Fork 0
/
tfidf.py
85 lines (61 loc) · 2.24 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from sklearn.naive_bayes import MultinomialNB
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import re
# Review folders, given relative to the current working directory.
# The train/test and pos/neg split is encoded in the path itself.
directoryPos = "./aclImdb/train/pos"
directoryNeg = "./aclImdb/train/neg"
testPosDirectory = "./aclImdb/test/pos"
testNegDirectory = "./aclImdb/test/neg"
# Module-level accumulators filled by addToCorpus(); test() rebinds both to
# fresh lists before loading each evaluation batch.
corpus = []  # cleaned review texts, one string per file
review_actual_val = []  # rating string parsed from each file name, aligned with corpus
def addToCorpus(path, reviewName):
    """Load review files from *path* into the module-level accumulators.

    For every name in *reviewName* that ends in ".txt", the run of digits
    directly after the first underscore of the file name is appended to
    ``review_actual_val`` and the cleaned file text is appended to ``corpus``.
    Other names are skipped.

    Cleaning lower-cases the text and collapses every run of characters that
    are not ASCII letters or apostrophes into a single space.

    Args:
        path: Directory that contains the review files.
        reviewName: Iterable of file names located inside *path*.

    Raises:
        AttributeError: If a ".txt" name contains no underscore.
        OSError: If a review file cannot be opened or read.
    """
    for name in reviewName:
        if not name.endswith(".txt"):
            continue
        rating = re.search(r"(?<=_)\d*", name).group()
        # The context manager closes each file promptly instead of leaking
        # one descriptor per review.
        with open(path + "/" + name, "r", encoding="utf8") as review_file:
            text_review = review_file.read().lower()
        # Remove special characters, leaving only words and apostrophes.
        text_review = re.sub("[^A-Za-z']+", ' ', text_review)
        # Append label and text together, only after a successful read, so
        # the two lists can never get out of step.
        review_actual_val.append(rating)
        corpus.append(text_review)
    return
def result(arr1, arr2):
    """Print how many positions of *arr1* and *arr2* hold equal values.

    The two iterables are walked in lockstep, stopping at the shorter one.
    The number of equal pairs and the number of pairs compared are written
    to stdout on a single line. Nothing is returned.
    """
    pairs = list(zip(arr1, arr2))
    matches = sum(1 for expected, predicted in pairs if expected == predicted)
    print("res: ", matches, ", total: ", len(pairs))
    return
def test(mnb, testPos, testNeg, firstIndex, lastIndex):
    """Score *mnb* on one slice of the test reviews and print the outcome.

    The module-level ``corpus`` and ``review_actual_val`` lists are rebound
    to fresh lists, then refilled from the files at positions
    ``[firstIndex:lastIndex]`` of each directory listing. The texts are
    transformed with the module-level ``tfidf`` vectorizer, and the
    predictions are compared with the parsed ratings by ``result``.

    Args:
        mnb: Fitted classifier exposing ``predict``.
        testPos: Directory of the first group of test reviews.
        testNeg: Directory of the second group of test reviews.
        firstIndex: Start of the slice taken from each directory listing.
        lastIndex: End (exclusive) of that slice.
    """
    global corpus, review_actual_val
    corpus = []
    review_actual_val = []
    # os.listdir() returns names in arbitrary order; sorting first makes the
    # index-based batches disjoint across calls and reproducible across runs.
    posReviewName = sorted(os.listdir(testPos))[firstIndex:lastIndex]
    addToCorpus(testPos, posReviewName)
    negReviewName = sorted(os.listdir(testNeg))[firstIndex:lastIndex]
    addToCorpus(testNeg, negReviewName)
    tfidf_test = tfidf.transform(corpus)
    testResult = mnb.predict(tfidf_test)
    result(review_actual_val, testResult)
    return
# Build the training corpus from the first 2000 files of each training
# folder. The listings are sorted because os.listdir() returns names in
# arbitrary order, which would make the chosen subset vary between machines.
posReviewName = sorted(os.listdir(directoryPos))[0:2000]
addToCorpus(directoryPos, posReviewName)
negReviewName = sorted(os.listdir(directoryNeg))[0:2000]
addToCorpus(directoryNeg, negReviewName)

# Fit the vectorizer on the training texts. test() reuses this same
# module-level `tfidf` object to transform the evaluation batches.
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(corpus)

# The class labels are the rating strings parsed from the file names.
MNB = MultinomialNB(alpha = 1)
MNB.fit(tfidf_train, review_actual_val)

# Evaluate in five consecutive batches of 1000 files per test folder.
for batchStart in range(0, 5000, 1000):
    test(MNB, testPosDirectory, testNegDirectory, batchStart, batchStart + 1000)