-
Notifications
You must be signed in to change notification settings - Fork 0
/
s_knn.py
211 lines (172 loc) · 6.08 KB
/
s_knn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#simple machine learning projects with Korean and Japanese money
%matplotlib inline
import numpy as np
import scipy as sp
import os, sys
from scipy import ndimage
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from PIL import Image
#width and height we are going to work with for the .png pictures
width = 322
height = 137
num_of_pics = 9
#paths for folders
#korean currency
path_k_1000 = 'Korea/1000/'
path_k_5000 = 'Korea/5000/'
path_k_10000 = 'Korea/10000/'
path_k_50000 = 'Korea/50000/'
#japanese currency
path_j_1000 = 'Japan/1000_yen/'
path_j_5000 = 'Japan/5000_yen/'
path_j_10000 = 'Japan/10000_yen/'
#list
paths = [path_k_1000, path_k_5000, path_k_10000, path_k_50000, path_j_1000, path_j_5000, path_j_10000]
file_name = ['1000_', '5000_', '10000_', '50000_', 'j_1000_', 'j_5000_', 'j_10000_']
#TESTING
img = Image.open('Korea/1000/1000_9.png') # image extension *.png,*.jpg
new_width = 322
new_height = 187
img = img.resize((new_width, new_height), Image.ANTIALIAS)
img.save('test.png') # format may what u want ,*.png,*jpg,*.gif
string = str(10)
type(string)
file_name
#function that takes image and converts it into one dimensional array and append
def img_to_array(array, img_path):
#open image and convert
png = Image.open(img_path)
png.load()
background = Image.new("RGB", png.size, (255, 255, 255))
background.paste(png, mask=png.split()[3]) # 3 is the alpha channel
background
test = np.array(background)
test = test.flatten()
#use numpy stack
array = np.vstack((array, test))
return array
#now we want to normalizae all the .png data into same size. So we will have same number of features
#there are total of 9 * 7 = 63 images, and each folder has 9 images
index = 0
data = np.zeros(132342) #give initial first row, and get rid of this row later
for path in paths:
#if path is Korean currency
if path[0] == 'K':
num_pics = 9
start = 0
while start < num_pics:
#get the path for the image
pic_num = start + 1
path_image = path + file_name[index] + str(pic_num) + '.png'
new_path_image = path + file_name[index] + str(pic_num)+'_N' + '.png'
img = Image.open(path_image)
img = img.resize((width, height), Image.ANTIALIAS)
img.save(new_path_image)
#insert this information into the numpy array
data = img_to_array(data,new_path_image)
#update index
start = start + 1
#if path is Japanese currency
elif path[0] == 'J':
num_pics = 9
start = 0
while start < num_pics:
#get the path for the image
pic_num = start + 1
path_image = path + file_name[index] + str(pic_num) + '.png'
new_path_image = path + file_name[index] + str(pic_num)+'_N' + '.png'
img = Image.open(path_image)
img = img.resize((width, height), Image.ANTIALIAS)
img.save(new_path_image)
#insert this information into the numpy array
data = img_to_array(data,new_path_image)
#update index
start = start + 1
index = index + 1
data = np.delete(data, 0, 0)
#Displaying image from array -> picture
STANDARD_SIZE = (322, 137)#standardized pixels in image.
def get_image(mat):
size = STANDARD_SIZE[0]*STANDARD_SIZE[1]*3
r,g,b = mat[0:size:3], mat[1:size:3],mat[2:size:3]
rgbArray = np.zeros((STANDARD_SIZE[1],STANDARD_SIZE[0], 3), 'uint8')#3 channels
rgbArray[..., 0] = r.reshape((STANDARD_SIZE[1], STANDARD_SIZE[0]))
rgbArray[..., 1] = b.reshape((STANDARD_SIZE[1], STANDARD_SIZE[0]))
rgbArray[..., 2] = g.reshape((STANDARD_SIZE[1], STANDARD_SIZE[0]))
return rgbArray
def display_image(mat):
with sns.axes_style("white"):
plt.imshow(get_image(mat))
plt.xticks([])
plt.yticks([])
display_image(data[0])
#example of displying an image
#Machine learning part (PCA and kNN)
from sklearn.decomposition import PCA
pca = PCA(n_components=60)
X = pca.fit_transform(data)
print pca.explained_variance_ratio_.sum()
pca
#Above code tells us that 99percent of variance is explained by 60 features. Let's create pandas dataframe with all the labels, and features
#Create DataFrame
DF = pd.DataFrame(columns={"Label"})
type_bills = np.empty([63,], dtype = '|S24')
type_KJ = ['K_1000', 'K_5000', 'K_10000', 'K_50000', 'J_1000', 'J_5000', 'J_10000']
start = 0
stop = 9
count = 0
while count < 63:
type_bills[count] = type_KJ[count/9]
count = count + 1
#save each pc value to coressponding variable
df = pd.DataFrame({"Label":type_bills})
for i in range(pca.explained_variance_ratio_.shape[0]):
df["pc%i" % (i+1)] = X[:,i]
df.head()
#function that will help displaying component of PCA
def normit(a):
a=(a - a.min())/(a.max() -a.min())
a=a*256
return np.round(a)
def getNC(pc, j):
size=322*137*3
r=pc.components_[j][0:size:3]
g=pc.components_[j][1:size:3]
b=pc.components_[j][2:size:3]
r=normit(r)
g=normit(g)
b=normit(b)
return r,g,b
def display_component(pc, j):
r,g,b = getNC(pc,j)
rgbArray = np.zeros((137,322,3), 'uint8')
rgbArray[..., 0] = r.reshape(137,322)
rgbArray[..., 1] = g.reshape(137,322)
rgbArray[..., 2] = b.reshape(137,322)
plt.imshow(rgbArray)
plt.xticks([])
plt.yticks([])
#Testing
display_component(pca,23)
#Graph kNN
k_mark = 'o'
j_mark = 's'
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for label, color in zip(df['Label'].unique(), colors):
mask = df['Label']==label
if(label[0] == 'K'):
plt.scatter(df[mask]['pc1'], df[mask]['pc2'], c=color, label=label, marker=k_mark, s = 150)
elif(label[0] == 'J'):
plt.scatter(df[mask]['pc1'], df[mask]['pc2'], c=color, label=label, marker=j_mark, s = 150)
plt.title("kNN between Korean currency and Japanese currency")
plt.legend()