
Commit 83ba5e6

Add files via upload
1 parent 6616d30 commit 83ba5e6

File tree

4 files changed: +719 -0 lines changed

Diff for: ML_classifiers.py

import matplotlib.pyplot as plt

from nltk.stem.porter import PorterStemmer
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# joblib moved out of sklearn.externals in scikit-learn 0.23; the
# standalone package is the drop-in replacement.
import joblib


if __name__ == '__main__':

    # train_file = open(sys.argv[1], 'r')
    # test_file = open(sys.argv[2], 'r')

    sizes = []
    f1_scores_nb = []
    f1_scores_svm = []
    f1_scores_lr = []
    f1_scores_rf = []

    # Load the training and test splits of the selected 20 Newsgroups subset.
    train_data = datasets.load_files("Selected 20NewsGroup/Training",
                                     decode_error='ignore', encoding='utf-8',
                                     shuffle=True)
    test_data = datasets.load_files("Selected 20NewsGroup/Test",
                                    decode_error='ignore', encoding='utf-8')
    docs_test = test_data.data

    # Remove the first three header lines from each training document.
    for i in range(len(train_data.data)):
        train_data.data[i] = "\n".join(train_data.data[i].split("\n")[3:])

    # Standalone feature extraction; the pipelines below recompute these
    # features, so this block is unused by the classifiers.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(train_data.data)

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Stem the training documents with the Porter stemmer.
    stemmer = PorterStemmer()
    st = []
    for doc in train_data.data:
        singles = [stemmer.stem(word) for word in doc.split(" ")]
        st.append(' '.join(singles))

    # Naive Bayes
    print("Naive Bayes\n")
    text_clf_nb = Pipeline([('vect', CountVectorizer(stop_words='english')),
                            ('tfidf', TfidfTransformer()),
                            ('clf', MultinomialNB())])
    text_clf_1 = text_clf_nb.fit(st, train_data.target)
    predicted1 = text_clf_1.predict(docs_test)
    print(metrics.classification_report(test_data.target, predicted1,
                                        target_names=test_data.target_names))

    # SVM: a linear SVM trained with SGD (hinge loss, L2 penalty).
    # A non-linear alternative would be sklearn.svm.SVC(kernel='rbf').
    print("SVM Classifier\n")
    text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='hinge', penalty='l2'))])
    text_clf_2 = text_clf_svm.fit(st, train_data.target)
    predicted2 = text_clf_2.predict(docs_test)
    print(metrics.classification_report(test_data.target, predicted2,
                                        target_names=test_data.target_names))

    # Logistic Regression
    print("Logistic Regression\n")
    text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words='english')),
                            ('tfidf', TfidfTransformer()),
                            ('clf', LogisticRegression())])
    text_clf_3 = text_clf_lr.fit(st, train_data.target)
    predicted3 = text_clf_3.predict(docs_test)
    print(metrics.classification_report(test_data.target, predicted3,
                                        target_names=test_data.target_names))

    # Random Forest
    print("Random Forest\n")
    text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                            ('tfidf', TfidfTransformer()),
                            ('clf', RandomForestClassifier())])
    text_clf_4 = text_clf_rf.fit(st, train_data.target)
    predicted4 = text_clf_4.predict(docs_test)
    print(metrics.classification_report(test_data.target, predicted4,
                                        target_names=test_data.target_names))

    # Training-set sizes for the learning curves: 20%, 40%, 60% and 80%
    # of the full training set.
    for frac in (0.2, 0.4, 0.6, 0.8):
        sizes.append(frac * len(train_data.data))

    # Refit each classifier on each training prefix and record macro F1.
    for s in sizes:
        train = train_data.data[0:int(s)]
        train_target = train_data.target[0:int(s)]

        # Naive Bayes
        text_clf_split_nb = text_clf_nb.fit(train, train_target)
        predicted_nb = text_clf_split_nb.predict(docs_test)
        f1_scores_nb.append(metrics.f1_score(test_data.target, predicted_nb,
                                             average='macro'))

        # SVM
        text_clf_split_svm = text_clf_svm.fit(train, train_target)
        predicted_svm = text_clf_split_svm.predict(docs_test)
        f1_scores_svm.append(metrics.f1_score(test_data.target, predicted_svm,
                                              average='macro'))

        # Logistic Regression
        text_clf_split_lr = text_clf_lr.fit(train, train_target)
        predicted_lr = text_clf_split_lr.predict(docs_test)
        f1_scores_lr.append(metrics.f1_score(test_data.target, predicted_lr,
                                             average='macro'))

        # Random Forest
        text_clf_split_rf = text_clf_rf.fit(train, train_target)
        predicted_rf = text_clf_split_rf.predict(docs_test)
        f1_scores_rf.append(metrics.f1_score(test_data.target, predicted_rf,
                                             average='macro'))

    # Plot all four learning curves on a single set of axes.
    plt.ylabel("F1-scores")
    plt.xlabel("Training Sizes")
    plt.plot(sizes, f1_scores_nb, label="Naive Bayes")
    plt.plot(sizes, f1_scores_svm, label="SVM")
    plt.plot(sizes, f1_scores_lr, label="Logistic Regression")
    plt.plot(sizes, f1_scores_rf, label="Random Forest")
    plt.grid(True)
    plt.legend(loc='best')
    plt.title("Training Size vs F1-score")
    plt.savefig("Legend plots.png")
    plt.close()

    # Code to dump and load a fitted pipeline:
    # joblib.dump(text_clf_2, 'classifier.pkl')
    # classifier = joblib.load('classifier.pkl')
    # predicted_temp = classifier.predict(docs_test)
    # print("Loading.........")
    # print(metrics.classification_report(test_data.target, predicted_temp,
    #                                     target_names=test_data.target_names))
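One quirk worth noting: the script strips headers and Porter-stems the training documents, but the test documents in docs_test are predicted on raw. A minimal sketch of one way to align the two splits, assuming the same treatment is intended for both (the preprocess helper below is ours, not part of the commit):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def preprocess(doc, skip_header_lines=3):
    # Hypothetical helper: drop the header lines, then Porter-stem every
    # whitespace-separated token, mirroring the training-side path.
    body = "\n".join(doc.split("\n")[skip_header_lines:])
    return ' '.join(stemmer.stem(word) for word in body.split(" "))

# st = [preprocess(d) for d in train_data.data]
# docs_test = [preprocess(d) for d in test_data.data]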

Diff for: dsfb.py

import sys
import heapq
from itertools import count
import time


class PriorityQueue:
    def __init__(self):
        self._queue = []
        self.counter = count()  # tie-breaker so equal priorities pop FIFO

    def put(self, item, priority):
        heapq.heappush(self._queue, (priority, next(self.counter), item))

    def get(self):
        return heapq.heappop(self._queue)[2]

    def empty(self):
        return len(self._queue) == 0

    def __str__(self):
        return str(self._queue)


class GraphColor:

    def __init__(self, graph, values):
        self.graph = graph
        self.values = values
        self.variables = list(self.graph.keys())
        self.domains = {var: list(self.values) for var in self.variables}
        self.curr_domains = None
        self.search = 0
        self.pruning = 0

    # Consistency check against the pruned domains: a neighbour conflicts
    # only when its domain has been narrowed to exactly this color (the
    # original compared a color to a whole domain list, which never matches).
    def isConsistent(self, var, color):
        self.load_domain()
        for neigh in self.graph[var]:
            if self.curr_domains[neigh] == [color]:
                return False
        return True

    def isConsistentPlain(self, var, color, assign):
        for neigh in self.graph[var]:
            if neigh in assign and color == assign[neigh]:
                return False
        return True

    # Constraint for arc consistency: adjacent variables must differ.
    def isconstraint(self, X, x, Y, y):
        return x != y

    def select_unassign_variable(self, assign):
        for var in self.variables:
            if var not in assign:
                return var

    # Minimum-remaining-values heuristic: pick the unassigned variable
    # with the smallest current domain.
    def select_unassigned_variable(self, assign):
        self.load_domain()
        unassign = [v for v in self.variables if v not in assign]
        return min(unassign, key=lambda v: len(self.curr_domains[v]))

    # Least-constraining-value ordering: try values that appear least
    # often in the neighbours' domains first.
    def order_domain_values(self, assign, var):
        self.load_domain()
        res = []
        queue = PriorityQueue()
        # Flatten the neighbours' domains so occurrences can be counted
        # (the original sum([], lis) left the list of lists unflattened).
        k = sum([self.curr_domains[neigh] for neigh in self.graph[var]], [])
        for val in self.curr_domains[var]:
            queue.put(val, k.count(val))
        while not queue.empty():
            res.append(queue.get())
        return res

    def load_domain(self):
        if self.curr_domains is None:
            self.curr_domains = {v: list(self.domains[v]) for v in self.variables}

    # Narrow var's domain to a single value; return the removed
    # (variable, value) pairs so they can be restored on backtrack.
    def remove(self, var, value):
        self.load_domain()
        rem = [(var, a) for a in self.curr_domains[var] if a != value]
        self.curr_domains[var] = [value]
        return rem

    # Restore a removal list.
    def restore(self, rem):
        for var, color in rem:
            self.curr_domains[var].append(color)

    # Remove values of xi that have no support in xj's domain. Iterate
    # over a copy, since the domain list is mutated inside the loop.
    def remove_inconsistent_values(self, xi, xj, rem):
        removed = False
        for x in list(self.curr_domains[xi]):
            if all(not self.isconstraint(xi, x, xj, y)
                   for y in self.curr_domains[xj]):
                self.curr_domains[xi].remove(x)
                if rem is not None:
                    rem.append((xi, x))
                removed = True
        return removed

    # AC-3 arc consistency
    def ac3(self, rem):
        queue = [(a, b) for a in self.variables for b in self.graph[a]]
        while len(queue) != 0:
            (xi, xj) = queue.pop()
            if self.remove_inconsistent_values(xi, xj, rem):
                self.pruning += 1
                for xk in self.graph[xi]:
                    queue.append((xk, xi))

    # Plain depth-first search with backtracking (DFSB)
    def dsfb_plain(self, assign):
        if len(assign) == len(self.variables):
            return assign
        var = self.select_unassign_variable(assign)
        for color in self.domains[var]:
            if self.isConsistentPlain(var, color, assign):
                assign[var] = color
                res = self.dsfb_plain(assign)
                if res != "fail":
                    return res
                del assign[var]
        return "fail"

    # Improved DFSB (DFSB++): MRV variable selection, LCV value
    # ordering, and AC-3 pruning after each assignment.
    def dsfb_improved(self, assign):
        self.search += 1
        if len(assign) == len(self.variables):
            print("Search calls are {}".format(self.search))
            print("Arc pruning calls are {}".format(self.pruning))
            return assign
        var = self.select_unassigned_variable(assign)
        for color in self.order_domain_values(assign, var):
            if self.isConsistent(var, color):
                assign[var] = color
                rem = self.remove(var, color)
                self.ac3(rem)
                res = self.dsfb_improved(assign)
                if res != "fail":
                    return res
                self.restore(rem)
                del assign[var]
        return "fail"


if __name__ == '__main__':

    in_file = open(sys.argv[1], 'r')
    out_file = open(sys.argv[2], 'w')
    mode = sys.argv[3]
    lis = []
    graph = {}
    for line in in_file.readlines():
        lis.append(line.rstrip().split())
    in_file.close()

    # First line: variable count, edge count, color count.
    for i in range(0, int(lis[0][0])):
        graph[i] = []

    # Construct the adjacency list of the given input.
    for edge in lis[1:]:
        graph[int(edge[0])].append(int(edge[1]))
        graph[int(edge[1])].append(int(edge[0]))

    val = range(int(lis[0][2]))
    assignment = {}
    g = GraphColor(graph, val)
    if mode == '0':
        t = time.time()
        asgn = g.dsfb_plain(assignment)
        print("Time taken: {} ms".format((time.time() - t) * 1000))
    else:
        t = time.time()
        asgn = g.dsfb_improved(assignment)
        print("Time taken: {} ms".format((time.time() - t) * 1000))

    if asgn == "fail":
        print("No Answer")
        out_file.write("No Answer")
    else:
        # Write one color per line in variable order; the dict's insertion
        # order follows the search order, which under MRV need not match.
        for var in sorted(asgn):
            out_file.write(str(asgn[var]) + "\n")
    out_file.close()
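For reference, our reading of the input format that the __main__ block parses (it is not documented in the commit): the first line gives the variable, edge, and color counts, and each later line is one undirected edge. A triangle colored with three colors would look like:

3 3 3
0 1
1 2
0 2

With hypothetical file names, python dsfb.py input.txt output.txt 0 runs plain DFSB, while any other mode value (e.g. 1) runs DFSB++ with MRV, LCV, and AC-3. On success the output file holds one color index per variable, one per line, with adjacent variables receiving different colors.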

0 commit comments
