-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathspam_ham.py
161 lines (104 loc) · 5.15 KB
/
spam_ham.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# conda install -c conda-forge textblob
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re

# Load the labelled message data set (columns used below: `type`, `text`).
# NOTE(review): this CSV path is Unix-style while the stop-word path below is
# Windows-style -- confirm which environment the script is meant to run on.
email_data = pd.read_csv("~/Downloads/Data Science/data set/ham_spam.csv",encoding = "ISO-8859-1")

# Load stop words, one per line.  splitlines() avoids the spurious empty
# entry that split("\n") produced when the file ends with a newline.
# NOTE(review): stop_words is loaded but never actually used by
# cleaning_text below -- confirm whether stop-word removal was intended.
with open("D:\\Assignment\\words_data\\stop.txt") as f:
    stop_words = f.read().splitlines()
def cleaning_text(i):
    """Normalise a raw message for bag-of-words modelling.

    Replaces every run of characters that are not letters or spaces with a
    single space (this also removes digits), lowercases the result, and
    keeps only tokens longer than 3 characters.  Returns the kept tokens
    joined by single spaces ("" when nothing survives).
    """
    # One substitution is enough: the original second pass for "[0-9 ]+"
    # was dead w.r.t. the return value, since digits are already gone and
    # the empty tokens left by repeated spaces fail the length filter.
    i = re.sub("[^A-Za-z ]+", " ", i).lower()
    return " ".join(word for word in i.split(" ") if len(word) > 3)
"This is Awsome 1231312 $#%$# a i he yu nwj".split(" ")
cleaning_text("This is Awsome 1231312 $#%$# a i he yu nwj")
# testing above function with sample text => removes punctuations, numbers
cleaning_text("Hope you are having a good week. Just checking in")
cleaning_text("hope i can understand your feelings 123121. 123 hi how .. are you?")
email_data.text = email_data.text.apply(cleaning_text)
# removing empty rows
email_data.shape
email_data = email_data.loc[email_data.text != " ",:]
# CountVectorizer
# Convert a collection of text documents to a matrix of token counts
# TfidfTransformer
# Transform a count matrix to a normalized tf or tf-idf representation
# creating a matrix of token counts for the entire text document
def split_into_words(i):
    """Tokeniser for CountVectorizer: split a message on single spaces."""
    # The original list comprehension merely copied str.split's output.
    return i.split(" ")
# Split into train (70%) and test (30%) sets.
# NOTE(review): no random_state is passed, so the split -- and every
# accuracy/shape figure quoted below -- changes on each run; pass
# random_state=<int> for reproducibility.
from sklearn.model_selection import train_test_split
email_train,email_test = train_test_split(email_data,test_size=0.3)

# Learn the bag-of-words vocabulary over ALL messages and build
# token-count matrices from it.
# NOTE(review): fitting the vectorizer on the full corpus leaks test-set
# vocabulary into training; fitting on email_train.text only would be the
# stricter protocol.
emails_bow = CountVectorizer(analyzer=split_into_words).fit(email_data.text)
# Toy illustration of the encoding:
#   doc 0: ["mailing","body","texting"]
#   doc 1: ["mailing","awesome","good"]
#   vocab: ["mailing","body","texting","good","awesome"]
#            "mailing" "body" "texting" "good" "awesome"
#   doc 0      1        1        1        0       0
#   doc 1      1        0        0        1       1

# Count matrix for every message
all_emails_matrix = emails_bow.transform(email_data.text)
all_emails_matrix.shape # (5559,6661) on one observed run
# Count matrix for the training messages
train_emails_matrix = emails_bow.transform(email_train.text)
train_emails_matrix.shape # (3891,6661)
# Count matrix for the test messages
test_emails_matrix = emails_bow.transform(email_test.text)
test_emails_matrix.shape # (1668,6661)
####### Without TFIDF matrices ########################
# Train two Naive Bayes classifiers on the raw token-count matrices.
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

# Multinomial Naive Bayes (consumes the sparse count matrices directly)
classifier_mb = MB()
classifier_mb.fit(train_emails_matrix,email_train.type)
train_pred_m = classifier_mb.predict(train_emails_matrix)
accuracy_train_m = np.mean(train_pred_m==email_train.type) # ~98% on one observed run
test_pred_m = classifier_mb.predict(test_emails_matrix)
accuracy_test_m = np.mean(test_pred_m==email_test.type) # ~96%

# Gaussian Naive Bayes
classifier_gb = GB()
# GaussianNB cannot consume scipy sparse matrices, so the COUNT matrix
# (not tf-idf, despite the original comment) is densified with .toarray()
# first -- memory-heavy for large vocabularies.
classifier_gb.fit(train_emails_matrix.toarray(),email_train.type.values)
train_pred_g = classifier_gb.predict(train_emails_matrix.toarray())
accuracy_train_g = np.mean(train_pred_g==email_train.type) # ~90%
test_pred_g = classifier_gb.predict(test_emails_matrix.toarray())
accuracy_test_g = np.mean(test_pred_g==email_test.type) # ~83%
#########################################################
# Learn tf-idf term-weighting/normalisation statistics from the count
# matrix of the ENTIRE corpus.
# NOTE(review): fitting on all_emails_matrix (train + test) leaks test-set
# document frequencies into the IDF weights; fitting on
# train_emails_matrix only would be the stricter protocol.
tfidf_transformer = TfidfTransformer().fit(all_emails_matrix)
# tf-idf matrix for the training emails
train_tfidf = tfidf_transformer.transform(train_emails_matrix)
train_tfidf.shape # (3891, 6661) on one observed run
# tf-idf matrix for the test emails
test_tfidf = tfidf_transformer.transform(test_emails_matrix)
test_tfidf.shape # (1668, 6661)
# Train the same two Naive Bayes models, this time on tf-idf features.
# NOTE(review): duplicate imports -- MB and GB were already imported above;
# harmless, left in place.
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

# Multinomial Naive Bayes on tf-idf features.
# (Rebinds classifier_mb and the *_pred_m / accuracy_*_m names from the
# count-matrix experiment above.)
classifier_mb = MB()
classifier_mb.fit(train_tfidf,email_train.type)
train_pred_m = classifier_mb.predict(train_tfidf)
accuracy_train_m = np.mean(train_pred_m==email_train.type) # ~96% on one observed run
test_pred_m = classifier_mb.predict(test_tfidf)
accuracy_test_m = np.mean(test_pred_m==email_test.type) # ~95%

# Gaussian Naive Bayes on tf-idf features.
classifier_gb = GB()
# GaussianNB needs a dense array, so the sparse tf-idf matrix is densified
# with .toarray() -- memory-heavy for large vocabularies.
classifier_gb.fit(train_tfidf.toarray(),email_train.type.values)
train_pred_g = classifier_gb.predict(train_tfidf.toarray())
accuracy_train_g = np.mean(train_pred_g==email_train.type) # ~91%
test_pred_g = classifier_gb.predict(test_tfidf.toarray())
accuracy_test_g = np.mean(test_pred_g==email_test.type) # ~85%
# The raw count matrices (train_emails_matrix / test_emails_matrix) could be
# used here in place of the tf-idf matrices.