titanic.py
"""
Before using this script you have to download test.csv and train.csv from this site: https://www.kaggle.com/c/titanic/data
"""
import csv
import numpy as np

# load the csv file and read it as text
csv_file_object = csv.reader(open('train.csv', 'r', newline=''))
# we want to store the csv data inside a numpy array
data = np.array(next(csv_file_object))
data = np.expand_dims(data, axis=0)
# save each row to our new data array
for row in csv_file_object:
    data = np.append(data, [row], axis=0)
header = data[0]
data = np.delete(data, 0, 0)
print("header: ", header)

hd = {}
i = 0
for h in header:
    hd[h] = i
    i += 1
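# hd now maps each train.csv column name (e.g. 'Survived', 'Pclass', 'Sex', 'Age') to its index in a row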

def sigmoid(x):
    return 1.0/(1 + np.exp(-x))
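# sigmoid maps any real number into (0, 1), e.g. sigmoid(0) == 0.5; predict() below treats that value as the survival probability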
"""
Predicts the outcome for the input x and the weights w
x_0 is 1 and w_0 is the bias
"""
def predict(inp,weights):
return sigmoid(np.dot(inp.T,weights.T))
"""
get the gradient for the inputs x,weights w and the outputs y
"""
def gradient(x,w,y):
# create an empty gradient array which has the same length as the array of weights
grads = np.empty((len(w)))
# compute the gradient for each weight
for j in range(len(w)):
grads[j] = np.mean(np.sum([x[i,j]*(predict(x[i],w)-y[i]) for i in range(len(x))]))
return grads
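# grads[j] ends up being the usual logistic-regression gradient: the sum over all training rows of
# x[i, j] * (predicted probability - true label); the outer np.mean is a no-op on that scalar sum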
"""
get the new weights based on the old ones w, learning rate a and the gradients
"""
def getWeights(w,a,grads):
return w-a*grads
"""
Determine the cost of the prediction (pred)
"""
def cost(real,pred):
return np.sqrt(1.0/2*np.mean(np.power(real-pred,2)))
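# cost() is a root-mean-square style error; the evaluation loop below feeds it the hard 0/1 prediction,
# so per example it reports sqrt(1/2) for a wrong prediction and 0 for a correct one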

def getInputRepresentation(m, entry, test=False):
    # test.csv has no 'Survived' column, so every index after PassengerId shifts down by one
    if test:
        t = 1
    else:
        t = 0
    inp = np.zeros(m+1)
    inp[0] = 1
    inp[1] = entry[hd['Sex']-t] == "female"
    pClass = entry[hd['Pclass']-t].astype(float)
    inp[2] = pClass == 1
    inp[3] = pClass == 2
    inp[4] = pClass == 3
    # we can also add the age column and divide it into 5 groups
    if entry[hd['Age']-t] == '':
        inp[5:10] = [0.1, 0.3, 0.2, 0.2, 0.2]
    else:
        age = entry[hd['Age']-t].astype(float)
        inp[5] = (age <= 15)
        inp[6] = (age > 15) & (age <= 25)
        inp[7] = (age > 25) & (age <= 32)
        inp[8] = (age > 32) & (age <= 41)
        inp[9] = (age > 41)
    sibSp = entry[hd['SibSp']-t].astype(float)
    inp[10] = (sibSp <= 0)
    inp[11] = (sibSp > 0) & (sibSp <= 1)
    inp[12] = (sibSp > 1)
    parch = entry[hd['Parch']-t].astype(float)
    inp[13] = (parch <= 0)
    inp[14] = (parch > 0) & (parch <= 1)
    inp[15] = (parch > 1)
    if entry[hd['Fare']-t] == '':
        inp[16:21] = [0.2, 0.2, 0.2, 0.2, 0.2]
    else:
        fare = entry[hd['Fare']-t].astype(float)
        inp[16] = (fare <= 8)
        inp[17] = (fare > 8) & (fare <= 11)
        inp[18] = (fare > 11) & (fare <= 22)
        inp[19] = (fare > 22) & (fare <= 40)
        inp[20] = (fare > 40)
    title = entry[hd['Name']-t].split(", ")[1].split(" ")[0]
    inp[21] = title == "Mr."
    inp[22] = title == "Mrs."
    inp[23] = title == "Miss."
    inp[24] = title == "Master."
    inp[25] = title == "Dr."
    inp[26] = title == "Sir."
    if np.count_nonzero(inp[21:27]) == 0:
        inp[27] = 1
    return inp
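# Feature layout of inp (28 entries): 0 bias, 1 female flag, 2-4 one-hot passenger class,
# 5-9 age bucket, 10-12 SibSp bucket, 13-15 Parch bucket, 16-20 fare bucket,
# 21-26 title (Mr./Mrs./Miss./Master./Dr./Sir.), 27 "other title"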

inputs = []
outputs = []
m = 27  # number of features, not counting the bias/threshold term
for entry in data:
    inp = getInputRepresentation(m, entry)
    inputs.append(inp)
    outputs.append(entry[hd['Survived']])
inputs = np.array(inputs).astype(float)
outputs = np.array(outputs).astype(float)

weights = np.random.rand(m+1)  # one extra weight for the threshold
alpha = 0.001
epochs = 100
train_size = int((3*len(inputs))/4)
trainX = inputs[0:train_size]
trainY = outputs[0:train_size]
testX = inputs[train_size:]
testY = outputs[train_size:]
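# the first 3/4 of train.csv (in file order, no shuffling) is used for fitting,
# the remaining 1/4 is held out for the evaluation below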

for t in range(epochs):
    weights = getWeights(weights, alpha, gradient(trainX, weights, trainY))

sum_costs = 0
for inp, outp in zip(testX, testY):
    prediction = predict(inp, weights)
    last_cost = cost(outp, 0 if prediction < 0.5 else 1)
    sum_costs += last_cost
print(weights)
print(sum_costs/(len(inputs)-train_size))
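# the last print is the average cost over the len(inputs) - train_size held-out rows;
# since cost() returns sqrt(1/2) per misclassified example, this is effectively a scaled error rate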

# First, read in test.csv
test_file = open('test.csv', 'r', newline='')
test_file_object = csv.reader(test_file)
header = next(test_file_object)

# Write out the PassengerId and the prediction.
predictions_file = open("prediction.csv", "w", newline='')
predictions_file_object = csv.writer(predictions_file)
# write the column headers
predictions_file_object.writerow(["PassengerId", "Survived"])
# For each row in the test file, build the feature vector and predict
for row in test_file_object:
    inp = getInputRepresentation(m, np.array(row), test=True)
    prediction = predict(inp, weights)
    predictions_file_object.writerow([row[0], "0" if prediction < 0.5 else "1"])
# Close the files.
test_file.close()
predictions_file.close()
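# Running the script end to end fits the weights on train.csv and writes prediction.csv
# with one (PassengerId, Survived) row per passenger in test.csv, the format Kaggle expects for submission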