-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_strm_subclass.py
317 lines (285 loc) · 13.9 KB
/
data_strm_subclass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
import numpy as np
import scipy.linalg as la
from matplotlib import pyplot as plt
from scipy import sparse as sp
from time import time
import scipy.sparse.linalg as spla
from math import sqrt
import streaming_subclass as stsb
#####################################################################
# Obtain the necessary data and data values
def get_bagX(filename, Acc=True):
    '''
    Reads in bag of words data and returns it as a sparse csr matrix.

    Inputs:
    --------------------------------------------------------------------
    filename: str, name of the file containing the bag of words data.
        Expected format: three header lines (number of documents, number
        of words, number of nonzeros) followed by one "doc word count"
        triple per line, with 1-based doc/word indices.
    Acc: optional bool, default True. If True, also return the squared
        frobenius norm of the dataset. Note that if you want to compute
        the explained variance for your streaming PCA algorithm, you need
        the squared frobenius norm of the dataset.

    Outputs:
    -------------------------------------------------------------------
    n: int, the number of samples in the dataset (here, documents)
    d: int, the number of features in the dataset (here, words)
    nnz: int, the number of nonzero values in the dataset
    density: float between 0 and 1, the density of the dataset (indicates
        sparsity)
    SparseX: sparse n x d csr matrix where each row is a document and each
        column is a word
    norm2: optional output (returned if Acc=True). The frobenius norm
        squared of the dataset.
    '''
    # Header: D (number of docs, n), W (vocabulary size, d), N (nonzeros).
    DWN = np.genfromtxt(filename, max_rows=3)
    # Cast to int: genfromtxt yields floats, but these are counts/sizes
    # and are used as matrix dimensions below.
    n, d, nnz = int(DWN[0]), int(DWN[1]), int(DWN[2])
    density = nnz / (n * d)
    Data = np.loadtxt(filename, skiprows=3, dtype=int)
    # Pass the shape explicitly: if the highest-numbered documents or
    # words carry no entries, the inferred shape would otherwise come out
    # smaller than (n, d).  Bag of words uses 1-based indexing.
    SparseX = sp.csr_matrix((Data[:, 2], (Data[:, 0] - 1, Data[:, 1] - 1)),
                            shape=(n, d))
    if Acc:
        norm2 = spla.norm(SparseX, ord='fro')**2
        return n, d, nnz, density, SparseX, norm2
    else:
        return n, d, nnz, density, SparseX
def get_bagXblocks(filename, B, Acc=True, block_total=1000):
    '''
    Reads in bag of words data and returns the dataset properties as well
    as a list of sparse blocks of B documents each.

    Inputs:
    --------------------------------------------------------------------
    filename: str, name of the file containing the bag of words data.
        Expected format: three header lines (n, d, nnz) followed by one
        "doc word count" triple per line, with 1-based doc/word indices.
    B: int, the number of rows (documents) in each block
    Acc: optional bool, indicates whether or not the accuracy will be
        measured for this dataset. If True, returns the squared norm of
        the dataset as well.
    block_total: optional int, default 1000. Reading stops early once this
        many blocks have been produced.

    Outputs:
    -------------------------------------------------------------------
    n: int, the number of samples in the dataset (here, documents)
    d: int, the number of features in the dataset (here, words)
    nnz: int, the number of nonzero values in the dataset
    density: float between 0 and 1, the density of the dataset (indicates
        sparsity)
    Xblocks: list of sparse B x d csr matrices whose rows stack to the
        dataset (the last block is zero-padded to B rows)
    norm2: optional output (returned if Acc=True). The frobenius norm
        squared of the blocks read.
    '''
    def _finish(blocks):
        # Shared exit path: optionally compute the squared frobenius norm
        # of the blocks actually read.
        if Acc:
            norm2 = sum(spla.norm(Xb, ord='fro')**2 for Xb in blocks)
            return n, d, nnz, density, blocks, norm2
        else:
            return n, d, nnz, density, blocks

    Xblocks = []
    with open(filename, 'r') as f:
        n = int(f.readline())
        d = int(f.readline())
        nnz = int(f.readline())
        density = nnz / (n * d)
        blocknum = 1
        row, col, data = [], [], []
        for i in range(nnz):
            entry = list(map(int, f.readline().split()))
            # Bag of words uses 1-based indexing; convert to 0-based.
            docidx = entry[0] - 1
            if docidx < blocknum * B:
                # Entry belongs to the current block of B documents.
                row.append(docidx % B)
                col.append(entry[1] - 1)
                data.append(entry[2])
            else:
                # Current block is complete; store it.
                Xblocks.append(sp.csr_matrix((data, (row, col)), shape=(B, d)))
                blocknum += 1
                if blocknum > block_total:
                    return _finish(Xblocks)
                # Bug fix: documents with no words can make an entry skip
                # one or more whole blocks.  Emit empty blocks for the gap
                # so row indices stay aligned with their true documents
                # (previously the entry landed in the wrong block's row).
                while docidx >= blocknum * B:
                    Xblocks.append(sp.csr_matrix((B, d)))
                    blocknum += 1
                    if blocknum > block_total:
                        return _finish(Xblocks)
                # Start the new block with the current entry.
                row, col, data = [docidx % B], [entry[1] - 1], [entry[2]]
        # Final (possibly partial) block, zero-padded to B rows.
        Xblocks.append(sp.csr_matrix((data, (row, col)), shape=(B, d)))
    return _finish(Xblocks)
#########################################################################################
# Run the dataset simultaneously for multiple algorithms
# Currently: Oja with learning rates c/t and c/sqrt(t), AdaOja, and HPCA
def run_sim_bag(filename, k, methods=['AdaOja', 'HPCA', 'SPM'], tol=.005, b0=1e-5, p=None, B=10, m=1, gamma=.9, beta_1 = 0.9, beta_2 = 0.999, delta=1e-8, eta=1e-3, Sparse=True, Acc=True, X=None, xnorm2=None, num_acc=100, Time=True, bias_correction=False, b0_dim=1):
    '''
    This runs several streaming PCA algorithms simultaneously on bag of
    words data, streaming the file from disk one block of B documents at a
    time.

    Inputs:
    ----------------------------------------------------------------------------
    filename: str, the name of the file containing the bag-of-words data.
        Format: three header lines (n, d, nnz) followed by one
        "doc word count" triple per line, with 1-based indices.
    k: int, the number of top eigenvectors to compute using the streaming
        PCA algorithms
    methods: optional list of str, which algorithms to run. Recognized
        values: 'AdaOja', 'HPCA', 'SPM', 'RMSProp', 'ADAM', 'WindOja'.
    tol: optional float, tolerance parameter passed to WindOja
    b0: optional float > 0, default 1e-5. The initial "guess" for the
        learning rate parameter for adagrad
    p: optional int, default None (which initializes to k). p >= k, the
        number of vectors used in the SPM method.
    B: optional int, the batch size for the streaming methods. Default 10.
    m: optional int > 0, default 1. The number of convergence iterations
        per block for HPCA
    gamma, eta: optional floats, RMSProp decay rate and step size
    beta_1, beta_2, delta, bias_correction: optional ADAM hyperparameters
    Sparse: optional bool, default True. Indicates whether the samples are
        added in as sparse or dense arrays.
    Acc: optional bool, default True. Indicates whether the accuracy, here
        the explained variance, is computed at each block step.
    X: NoneType, nxd array, or list of Bval x d blocks Xi s.t. Xi make up
        the rows of X (note the last block in X may not be of length Bval,
        but all other blocks are assumed to have the same number of rows).
        X must be provided if Acc=True.
    xnorm2: optional float, the squared frobenius norm of X.
    num_acc: optional number of accuracy readings to take out of all
        possible block samples. num_acc <= int(n/B).
    Time: optional bool, default True. Indicates whether or not to time
        the implementation.
    b0_dim: optional int, dimensionality of the b0 parameter.

    Outputs:
    ----------------------------------------------------------------------------
    spca_objects: list of the streaming-PCA objects that were run, in the
        order AdaOja, HPCA, SPM, RMSProp, ADAM, WindOja (restricted to
        those requested in `methods`), each updated on every block.
    '''
    with open(filename, 'r') as f:
        n = int(f.readline())
        d = int(f.readline())
        nnz = int(f.readline())
        spca_objects = []
        # Initialize one streaming object per requested method.
        if 'AdaOja' in methods:
            adaoja = stsb.AdaOja(d, k, b0=b0, B=B, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=X, num_acc=num_acc, Time=Time, b0_dim=b0_dim)
            spca_objects.append(adaoja)
        if 'HPCA' in methods:
            hpca = stsb.HPCA(d, k, B=B, m=m, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=X, num_acc=num_acc, Time=Time)
            spca_objects.append(hpca)
        if 'SPM' in methods:
            spm = stsb.SPM(d, k, p=p, B=B, Sparse=Sparse, Acc=Acc, X=X, xnorm2=xnorm2, num_acc=num_acc, Time=Time)
            spca_objects.append(spm)
        if 'RMSProp' in methods:
            rmsp = stsb.RMSProp(d, k, gamma=gamma, b0=b0, eta=eta, B=B, Sparse=Sparse, Acc=Acc, X=X, xnorm2=xnorm2, num_acc=num_acc, Time=Time, b0_dim=b0_dim)
            spca_objects.append(rmsp)
        if 'ADAM' in methods:
            # Bug fix: delta was previously hardcoded to 1e-8 here, which
            # silently ignored the caller's delta argument.
            adam = stsb.ADAM(d, k, beta_1=beta_1, beta_2=beta_2, delta=delta, eta=eta, B=B, Sparse=Sparse, Acc=Acc, X=X, xnorm2=xnorm2, num_acc=num_acc, Time=Time, bias_correction=bias_correction, b0_dim=b0_dim)
            spca_objects.append(adam)
        if 'WindOja' in methods:
            woja = stsb.WindOja(d, k, b0=b0, B=B, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=X, num_acc=num_acc, Time=Time, b0_dim=b0_dim, tol=tol)
            spca_objects.append(woja)
        blocknum = 1
        row, col, data = [], [], []
        for i in range(nnz):
            entry = list(map(int, f.readline().split()))
            # Bag of words uses 1-based indexing: entry = (doc, word, count).
            if entry[0] - 1 < blocknum * B:
                # The entry belongs to the current block of B documents.
                row.append((entry[0] - 1) % B)
                col.append(entry[1] - 1)
                data.append(entry[2])
            else:
                # Current block is complete: add it to each model.
                if Sparse:
                    Xi = sp.csr_matrix((data, (row, col)), shape=(B, d))
                else:
                    Xi = np.zeros((B, d))
                    Xi[row, col] = data
                for spca in spca_objects:
                    spca.add_block(Xi)
                blocknum += 1
                # NOTE(review): if B consecutive documents all have zero
                # words, this entry may belong to a later block than
                # `blocknum` and its row would be misaligned — confirm the
                # dataset has no such gaps.
                # Start the new block with the current entry.
                row, col, data = [(entry[0] - 1) % B], [entry[1] - 1], [entry[2]]
        # Insert the final (possibly partial) block; its height is taken
        # from the largest row index seen.
        if Sparse:
            Xi = sp.csr_matrix((data, (row, col)), shape=(max(row) + 1, d))
        else:
            Xi = np.zeros((max(row) + 1, d))
            Xi[row, col] = data
        for spca in spca_objects:
            spca.add_block(Xi, final_sample=True)
    return spca_objects
def run_sim_fullX(X, k, methods=['AdaOja', 'HPCA', 'SPM'], tol=.005, b0=1e-5, gamma=.9, beta_1 = 0.9, beta_2 = 0.999, eta=1e-3, delta=1e-8, p=None, B=10, m=1, Sparse=True, Acc=True, xnorm2=None, num_acc=100, Time=True, num_samples=None, bias_correction=False, b0_dim=1):
    '''
    This runs several streaming PCA algorithms simultaneously on data that
    is provided as a full n x d array X, feeding it to the models in
    blocks of B rows.

    Inputs (hyperparameters are as in run_sim_bag):
    ----------------------------------------------------------------------------
    X: n x d array (sparse or dense) of samples, one per row.
    k: int, the number of top eigenvectors to compute.
    num_samples: optional int. If given, only the first num_samples rows
        of X are streamed.

    Outputs:
    ----------------------------------------------------------------------------
    spca_objects: list of the streaming-PCA objects that were run, in the
        order AdaOja, HPCA, SPM, RMSProp, ADAM, WindOja (restricted to
        those requested in `methods`), each updated on every block.
    '''
    n, d = X.shape
    if num_samples is not None:
        # Presumably rescales the reading count to keep the same density
        # of accuracy readings over the truncated stream — verify against
        # how streaming_subclass interprets num_acc.
        num_acc = int(n / num_samples * num_acc)
        nblock = int(num_samples / B)
        endBsize = num_samples - nblock * B
    else:
        nblock = int(n / B)
        endBsize = n - nblock * B
    spca_objects = []
    # Initialize one streaming object per requested method.
    if 'AdaOja' in methods:
        adaoja = stsb.AdaOja(d, k, b0=b0, B=B, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=X, num_acc=num_acc, Time=Time, b0_dim=b0_dim)
        spca_objects.append(adaoja)
    if 'HPCA' in methods:
        hpca = stsb.HPCA(d, k, B=B, m=m, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=X, num_acc=num_acc, Time=Time)
        spca_objects.append(hpca)
    if 'SPM' in methods:
        spm = stsb.SPM(d, k, p=p, B=B, Sparse=Sparse, Acc=Acc, X=X, xnorm2=xnorm2, num_acc=num_acc, Time=Time)
        spca_objects.append(spm)
    if 'RMSProp' in methods:
        rmsp = stsb.RMSProp(d, k, gamma=gamma, b0=b0, eta=eta, B=B, Sparse=Sparse, Acc=Acc, X=X, xnorm2=xnorm2, num_acc=num_acc, Time=Time, b0_dim=b0_dim)
        spca_objects.append(rmsp)
    if 'ADAM' in methods:
        # Bug fix: delta was previously hardcoded to 1e-8 here, which
        # silently ignored the caller's delta argument.
        adam = stsb.ADAM(d, k, beta_1=beta_1, beta_2=beta_2, delta=delta, eta=eta, B=B, Sparse=Sparse, Acc=Acc, X=X, xnorm2=xnorm2, num_acc=num_acc, Time=Time, bias_correction=bias_correction, b0_dim=b0_dim)
        spca_objects.append(adam)
    if 'WindOja' in methods:
        woja = stsb.WindOja(d, k, b0=b0, B=B, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=X, num_acc=num_acc, Time=Time, b0_dim=b0_dim, tol=tol)
        spca_objects.append(woja)
    # Stream the full-sized blocks.
    for i in range(0, nblock * B, B):
        Xi = X[i:i + B]
        if endBsize == 0 and i == (nblock - 1) * B:
            # No partial block remains, so this full block is the last.
            for spca in spca_objects:
                spca.add_block(Xi, final_sample=True)
        else:
            for spca in spca_objects:
                spca.add_block(Xi)
    # Stream the trailing partial block, if any.
    if endBsize > 0:
        if num_samples is not None:
            Xi = X[nblock * B:num_samples]
        else:
            Xi = X[nblock * B:]
        for spca in spca_objects:
            spca.add_block(Xi, final_sample=True)
    return spca_objects
def run_sim_blocklist(Xlist, k, methods=['AdaOja', 'HPCA', 'SPM'], tol=.005, b0=1e-5, gamma=.9, beta_1 = 0.9, beta_2 = 0.999, eta=1e-3, delta=1e-8, bias_correction=False, b0_dim=1, p=None, m=1, Sparse=True, Acc=True, xnorm2=None, num_acc=100, Time=True):
    '''
    This runs several streaming PCA methods simultaneously on a dataset
    provided as a list of B x d blocks.

    Inputs (hyperparameters are as in run_sim_bag):
    ----------------------------------------------------------------------------
    Xlist: list of B x d blocks whose rows stack to the dataset. All
        blocks except possibly the last are assumed to share the shape of
        Xlist[0].
    k: int, the number of top eigenvectors to compute.

    Outputs:
    ----------------------------------------------------------------------------
    spca_objects: list of the streaming-PCA objects that were run, in the
        order AdaOja, HPCA, SPM, RMSProp, ADAM, WindOja (restricted to
        those requested in `methods`), each updated on every block.
    '''
    # Block size and feature dimension are read off the first block.
    B, d = Xlist[0].shape
    spca_objects = []
    # Bug fix: these constructors previously referenced an undefined name
    # `X` (NameError whenever any method was requested). The dataset here
    # is Xlist, which is exactly the "list of blocks" form the accuracy
    # computation accepts.
    if 'AdaOja' in methods:
        adaoja = stsb.AdaOja(d, k, b0=b0, B=B, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=Xlist, num_acc=num_acc, Time=Time, b0_dim=b0_dim)
        spca_objects.append(adaoja)
    if 'HPCA' in methods:
        hpca = stsb.HPCA(d, k, B=B, m=m, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=Xlist, num_acc=num_acc, Time=Time)
        spca_objects.append(hpca)
    if 'SPM' in methods:
        spm = stsb.SPM(d, k, p=p, B=B, Sparse=Sparse, Acc=Acc, X=Xlist, xnorm2=xnorm2, num_acc=num_acc, Time=Time)
        spca_objects.append(spm)
    if 'RMSProp' in methods:
        rmsp = stsb.RMSProp(d, k, gamma=gamma, b0=b0, eta=eta, B=B, Sparse=Sparse, Acc=Acc, X=Xlist, xnorm2=xnorm2, num_acc=num_acc, Time=Time, b0_dim=b0_dim)
        spca_objects.append(rmsp)
    if 'ADAM' in methods:
        adam = stsb.ADAM(d, k, beta_1=beta_1, beta_2=beta_2, delta=delta, eta=eta, B=B, Sparse=Sparse, Acc=Acc, X=Xlist, xnorm2=xnorm2, num_acc=num_acc, Time=Time, bias_correction=bias_correction, b0_dim=b0_dim)
        spca_objects.append(adam)
    if 'WindOja' in methods:
        woja = stsb.WindOja(d, k, b0=b0, B=B, Sparse=Sparse, Acc=Acc, xnorm2=xnorm2, X=Xlist, num_acc=num_acc, Time=Time, b0_dim=b0_dim, tol=tol)
        spca_objects.append(woja)
    nblocks = len(Xlist)
    # Feed every block but the last, then flag the last as final so the
    # models can finalize their accuracy/timing readings.
    for i in range(nblocks - 1):
        for spca in spca_objects:
            spca.add_block(Xlist[i])
    for spca in spca_objects:
        spca.add_block(Xlist[-1], final_sample=True)
    return spca_objects