-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSVM_base.py
176 lines (127 loc) · 5.1 KB
/
SVM_base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import csv
import pdb
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.metrics import accuracy_score
# Shared seed so every randomized step in this script is reproducible.
SEED_VAL = 1991

# Load the cleaned engineering-jobs dataset.
df = pd.read_csv('data/cleaned_data_ENG.csv')

# Eyeball the salary distributions: plot each salary column in ascending
# order (an S-curve) to judge where bucket boundaries should go.
min_sal_sorted_df = df.sort_values(by=['min_salary'])
max_sal_sorted_df = df.sort_values(by=['max_salary'])
plt.plot(list(min_sal_sorted_df['min_salary']), 'bo')
plt.plot(list(max_sal_sorted_df['max_salary']), 'ro')
#plt.savefig("salary_scurve.png")

# Bucket edges for min salary: 50k..200k in 10k steps. Salaries below the
# first edge / at-or-above the last edge get two extra open-ended buckets,
# hence the +1 on the bucket count.
min_sal_buckets = range(50000, 210000, 10000)
num_buckets = len(min_sal_buckets) + 1
def bucket_to_range(bucket, buckets=None):
    """Return a human-readable salary-range label for a bucket index.

    Bucket 0 is the open-ended "below the lowest edge" bucket, bucket
    ``len(buckets)`` is the open-ended "at/above the highest edge" bucket,
    and bucket ``i`` (0 < i < len(buckets)) covers
    ``[buckets[i-1], buckets[i])``.

    Args:
        bucket: integer bucket index in ``[0, len(buckets)]``.
        buckets: ascending sequence of bucket edges; defaults to the
            module-level ``min_sal_buckets``.

    Returns:
        A label string such as ``"<50000"``, ``"50000-60000"``, or
        ``">200000"``.
    """
    if buckets is None:
        buckets = min_sal_buckets
    if bucket == 0:
        return "<" + str(buckets[0])
    if bucket == len(buckets):
        return ">" + str(buckets[-1])
    return str(buckets[bucket - 1]) + "-" + str(buckets[bucket])
def min_class_bucket(row, buckets=None):
    """Map a row's ``min_salary`` to its salary-bucket index.

    Returns 0 for salaries below the lowest edge, ``len(buckets)`` for
    salaries at/above the highest edge, and ``i`` for salaries in
    ``[buckets[i-1], buckets[i])``.

    Args:
        row: object with a numeric ``min_salary`` attribute (e.g. a
            DataFrame row supplied by ``df.apply(..., axis=1)``).
        buckets: ascending sequence of bucket edges; defaults to the
            module-level ``min_sal_buckets``.

    Returns:
        int bucket index in ``[0, len(buckets)]``.
    """
    if buckets is None:
        buckets = min_sal_buckets
    salary = row.min_salary
    if salary < buckets[0]:
        return 0
    if salary >= buckets[-1]:
        return len(buckets)
    for i in range(1, len(buckets)):
        # Chained comparison replaces the original `x >= a and x < b`;
        # the unused `proper_bucket = -1` sentinel has been dropped.
        if buckets[i - 1] <= salary < buckets[i]:
            return i
# Label every row with its min-salary bucket and persist the labeled data.
df['salary_bucket'] = df.apply(min_class_bucket, axis=1)
df = df.astype({'salary_bucket': int})
df.to_csv("data/train_data_with_salary_buckets_ENG.csv")

# Feature matrix: binary company / location / role indicators.
# Target: the salary bucket.
df_XforSVM = df.filter(['is_acquired', 'is_public', 'remote_ok', 'NYC',
                        'LA', 'SF', 'SEA', 'senior', 'back_end',
                        'full_stack', 'front_end'], axis=1)
df_YforSVM = df.filter(['salary_bucket'], axis=1)

# Hold out 500 rows for testing. Use the shared SEED_VAL for the split —
# the original hard-coded 1991 here, duplicating the constant's value.
clf = LinearSVC(random_state=0, tol=1e-5, max_iter=10000)
X_train, X_test, Y_train, Y_test = train_test_split(
    df_XforSVM.values, df_YforSVM.values.ravel(),
    test_size=500, random_state=SEED_VAL)
clf.fit(X_train, Y_train)

# Also try a non-linear SVM with an RBF kernel for comparison.
rbf_clf = SVC(gamma='scale')
rbf_clf.fit(X_train, Y_train)

# Evaluate both models on the held-out test set.
Y_test_pred = clf.predict(X_test)
Y_test_rbf = rbf_clf.predict(X_test)
test_perf = accuracy_score(Y_test, Y_test_pred)
test_perf_rbf = accuracy_score(Y_test, Y_test_rbf)
print("Linear Performance on test set: " + str(test_perf))
print("RBF Performance on test set: " + str(test_perf_rbf))
# Performance on the train set.
Y_train_pred = clf.predict(X_train)
train_perf = accuracy_score(Y_train, Y_train_pred)

# How far off are the misclassifications? Tally signed and absolute bucket
# differences between truth and prediction.
dif_matrix = Y_train - Y_train_pred
unique, counts = np.unique(dif_matrix, return_counts=True)
my_class_counts = dict(zip(unique, counts))
dif_matrix_abs = np.abs(dif_matrix)
unique, counts = np.unique(dif_matrix_abs, return_counts=True)
my_class_counts_abs = dict(zip(unique, counts))

# np.unique returns sorted values, so these lists are in ascending
# error order. Iterate the dicts directly instead of .keys().
x_count = [int(v) for v in my_class_counts]
y_count = [my_class_counts[v] for v in my_class_counts]
x_count_abs = [int(v) for v in my_class_counts_abs]
y_count_abs = [my_class_counts_abs[v] for v in my_class_counts_abs]

plt.clf()
plt.bar(x_count, y_count)
plt.xlabel('Error Misclassification')
plt.ylabel('Number of Occurences')
plt.title('Number of Buckets SVM Was Off By')
#plt.show()

plt.clf()
plt.bar(x_count_abs, y_count_abs)
plt.xlabel('Magnitude of Error Misclassification')
plt.ylabel('Number of Occurences')
plt.title('Number of Buckets SVM Was Off By')
#plt.show()

# Fraction of training rows predicted within one bucket of the truth.
# Select by error VALUE (<= 1) rather than by list position — the original
# summed y_count_abs[0:2], which is wrong if error 0 or 1 never occurs.
total_rec = sum(y_count_abs)
zero_to_one_bucket = sum(c for err, c in my_class_counts_abs.items() if err <= 1)
print("Percentage off by 1 bucket or less: " + str(zero_to_one_bucket / total_rec))
## Perform the same off-by-N analysis for the test set.
dif_matrix_abs_test = np.abs(Y_test - Y_test_pred)
unique, counts = np.unique(dif_matrix_abs_test, return_counts=True)

plt.clf()
plt.bar(unique, counts)
plt.xlabel('Magnitude of Error Misclassification on Test Set')
plt.ylabel('Number of Occurences')
plt.title('Number of Buckets SVM Was Off By on Test Set')
#plt.show()

# Select counts by error VALUE (<= 1) via a boolean mask — the original
# summed counts[0:2] by position, which is wrong if error 0 or 1 is absent.
total_rec = counts.sum()
zero_to_one_bucket = counts[unique <= 1].sum()
print("Percentage off by 1 bucket or less on test set: " + str(zero_to_one_bucket / total_rec))

# Training-set accuracy for the RBF model, printed next to the linear one.
Y_train_pred_rbf = rbf_clf.predict(X_train)
train_perf_rbf = accuracy_score(Y_train, Y_train_pred_rbf)
print("Linear performance on train set: " + str(train_perf))
print("RBF performance on train set: " + str(train_perf_rbf))
# Where exactly are we doing poorly? Count how often each
# (true_class, predicted_class) pair occurs on the training set.
#pdb.set_trace()
df_results = pd.DataFrame({'ytrain': Y_train, 'ypred': Y_train_pred})
df_count = (
    df_results
    .groupby(df_results.columns.tolist())
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)
ybase = df_count['ytrain']
ypd = df_count['ypred']
ypairs = list(zip(ybase, ypd))

# Bar chart: one bar per observed (truth, prediction) pair.
plt.clf()
y_pos = np.arange(len(ypairs))
plt.bar(y_pos, df_count['count'])
plt.xticks(y_pos, ypairs)
plt.xticks(rotation=90)
plt.xlabel("Pair (true_class,predicted_class)")
plt.ylabel("Count")
plt.title("Error Analysis for Multi-Class SVM")
plt.savefig("SVM_error_analysis")
#plt.show()

# Notes:
# Only predicting really into 3 classes for anything 0,3,4,6 and sometimes 8.
# does that mean I should try fewer buckets?
# 1) try to incorporate company size; filter out cases with no size
print("Done!")