-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_messages_classifier.py
57 lines (46 loc) · 2.16 KB
/
preprocess_messages_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import csv
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk import NaiveBayesClassifier, classify
import pickle
def preprocess(sentence):
    """Tokenize *sentence* and lemmatize each lower-cased token.

    The text is split with ``word_tokenize``; every token is lower-cased and
    passed through the module-level ``wordnet_lemmatizer``, which must be
    defined before this function is first called.

    Returns:
        A list with one lemmatized string per token, in token order.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        lowered = token.lower()
        lemmas.append(wordnet_lemmatizer.lemmatize(lowered))
    return lemmas
def get_features(text, setting):
    """Build a feature dictionary for *text*, leaving out stop words.

    Args:
        text: Raw message text; it is run through ``preprocess`` first.
        setting: ``'bow'`` maps each remaining token to the number of times
            it occurs; any other value maps each remaining token to ``True``.

    Returns:
        A dict keyed by token. Tokens found in the module-level ``stoplist``
        are excluded.
    """
    tokens = preprocess(text)

    if setting != 'bow':
        # Presence-only features: every kept token simply maps to True.
        return dict.fromkeys(
            (token for token in tokens if token not in stoplist), True
        )

    # Bag-of-words features: keep the per-token occurrence counts.
    features = {}
    for token, occurrences in Counter(tokens).items():
        if token in stoplist:
            continue
        features[token] = occurrences
    return features
def train(features, samples_proportion):
    """Split *features* positionally and train a Naive Bayes classifier.

    The first ``int(len(features) * samples_proportion)`` items form the
    training set and the remaining items form the test set; the input is
    not shuffled here. Both set sizes are printed.

    Args:
        features: Sequence of labelled feature sets, passed as-is to
            ``NaiveBayesClassifier.train``.
        samples_proportion: Fraction of *features* used for training.

    Returns:
        A ``(train_set, test_set, classifier)`` tuple.
    """
    split_at = int(len(features) * samples_proportion)
    train_set = features[:split_at]
    test_set = features[split_at:]

    print('Training set size = ', len(train_set), ' messages', sep='')
    print('Test set size = ', len(test_set), ' messages', sep='')

    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
# Report how well the classifier does on both data splits.
def evaluate(train_set, test_set, classifier):
    """Print the accuracy of *classifier* on the training set and the test set.

    Accuracy is computed with ``classify.accuracy``; nothing is returned.
    """
    train_accuracy = classify.accuracy(classifier, train_set)
    print('Accuracy on the training set = ', train_accuracy, sep='')

    test_accuracy = classify.accuracy(classifier, test_set)
    print('Accuracy of the test set = ', test_accuracy, sep='')
# --- Load the labelled messages -------------------------------------------
# Each row of export.csv is read as: column 0 = message text, column 1 = label.
training_data = []
training_labels = []
with open('export.csv', newline='') as csvfile:
    csv_object = csv.reader(csvfile, delimiter=';', quotechar='|')
    for row in csv_object:
        if not row:
            # csv.reader yields an empty list for a blank line (for example a
            # trailing newline at the end of the file); skip it instead of
            # failing with IndexError on row[0].
            continue
        training_data.append(row[0])
        training_labels.append(row[1])

# Module-level helpers read by preprocess() and get_features().
# The stop words are stored in a set because get_features() performs one
# membership test per token.
stoplist = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

# --- Extract features, train and evaluate -----------------------------------
all_features = [
    (get_features(data, 'bow'), label)
    for data, label in zip(training_data, training_labels)
]
train_set, test_set, classifier = train(all_features, 0.8)
evaluate(train_set, test_set, classifier)
classifier.show_most_informative_features(20)

# --- Persist the trained classifier -----------------------------------------
# The context manager closes the file even if pickle.dump raises.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier, protocol=2)

# --- Smoke test: print P('on') and P('off') for two sample inputs -----------
fs = get_features("python is a great programming language", 'bow')
label = classifier.prob_classify(fs)
print(label.prob('on'), label.prob('off'))

fs2 = get_features("women", 'bow')
label2 = classifier.prob_classify(fs2)
print(label2.prob('on'), label2.prob('off'))