This repository has been archived by the owner on Oct 19, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
lasso.jl
291 lines (228 loc) · 9.02 KB
/
lasso.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""
    lasso_reg(x, y, λ; kwargs...) -> FISTAResults

Solve one LASSO problem, `0.5*||y - x*b||^2 + λ*||b||_1`, with FISTA
(Nesterov-accelerated proximal gradient descent) plus a backtracking
line search on the step size.

# Arguments
- `x`: design matrix with `n` rows (samples) and `p` columns (predictors).
- `y`: response vector of length `n`.
- `λ`: ℓ1 regularization parameter.

# Keywords
- `K`: number of path values; only used to size the default workspace `v`.
- `v`: preallocated `FISTAVariables` workspace; `lasso_path` reuses one
  workspace across many calls to avoid reallocation.
- `max_iter`: maximum number of FISTA iterations.
- `tol`: convergence tolerance on the absolute change in loss.
- `mu`: initial step size; shrunk by `backtrack_size` each time the
  backtracking condition fails (the reduction persists across iterations).
- `backtrack_size`: multiplicative step-size reduction factor in (0,1).
- `quiet`: suppress per-iteration progress output when `true`.

Returns a `FISTAResults{T,V}` holding execution time, final loss,
iteration count, and the coefficient estimate.
"""
function lasso_reg{T <: AbstractFloat, V <: AbstractVector}(
    x :: AbstractMatrix{T},
    y :: V,
    λ :: T;
    K :: Int = 100,
    v :: FISTAVariables{T,V} = FISTAVariables(x, y, K),
    max_iter :: Int = 1000,
    tol :: T = convert(T, 1e-4),
    mu :: T = one(T),
    backtrack_size :: T = convert(T, 0.5),
    quiet :: Bool = true
) :: FISTAResults{T,V}  # BUGFIX: the `::` was missing, making this a syntax error
    n,p = size(x)
    @assert n == length(y) "Arguments x and y must have same number of rows"
    loss  = convert(T, Inf)
    loss0 = loss
    iter  = 0
    # start timing
    tic()
    for iter = 1:max_iter
        quiet || println("Iter = ", iter)
        # save previous coefficients and fitted values for the momentum step
        copy!(v.b0, v.b)
        copy!(v.xb0, v.xb)
        loss0 = loss
        steps = 0
        # apply Nesterov acceleration: extrapolate from the last two iterates
        k = iter / (iter + 3)
        v.z .= v.b .+ k.*(v.b .- v.b0)
        while true
            steps += 1
            quiet || println("steps = ", steps)
            # compute estimated response xb0 .= x*z at the extrapolated point
            A_mul_B!(v.xb0, x, v.z)
            # compute residuals
            v.r .= y .- v.xb0
            lossz = sumabs2(v.r) / 2
            # compute gradient: df .= -x'*r (sign folded into the update below)
            At_mul_B!(v.df, x, v.r)
            # gradient step followed by the soft-thresholding proximal step
            v.b .= v.z .+ mu.*v.df
            v.b .= softthreshold.(v.b, λ)
            # recompute estimated response xb .= x*b
            A_mul_B!(v.xb, x, v.b)
            # update residuals
            v.r .= y .- v.xb
            # update half the residual sum of squares
            loss = sumabs2(v.r) / 2
            # difference of estimates
            v.bdiff .= v.b .- v.z
            # check the backtracking (sufficient-decrease) condition; break if satisfied
            loss <= lossz - dot(v.df, v.bdiff) + one(T) / (2*mu) * sumabs2(v.bdiff) && break
            # otherwise keep backtracking: shrink the step size and repeat
            mu *= backtrack_size
        end
        quiet || println("loss = ", loss)
        # converged when the loss change falls below tolerance
        abs(loss - loss0) < tol && break
    end
    # get execution time
    exec_time = toq()
    return FISTAResults(exec_time, loss, iter, v.b)
end
"""
    lasso_path(x, y; K=100, lambdas=..., ...) -> SparseMatrixCSC

Fit LASSO models along a grid of `K` regularization parameters `lambdas`,
returning the estimated coefficients as a sparse `p × K` matrix with one
column per λ. If a model on the path becomes completely dense, the path is
cut short and only the columns computed so far are returned.
"""
function lasso_path{T <: AbstractFloat, V <: AbstractVector}(
    x :: AbstractMatrix{T},
    y :: V;
    K :: Int = 100,
    lambdas :: Vector{T} = compute_lambdas(x,y,K),
    max_iter :: Int = 1000,
    tol :: T = convert(T, 1e-4),
    mu :: T = one(T),
    backtrack_size :: T = convert(T, 0.5),
    quiet :: Bool = true
) :: SparseMatrixCSC{T,Int}
    n,p = size(x)
    @assert n == length(y) "Arguments x and y must have same number of rows"

    # one column of coefficients per regularization parameter
    B = spzeros(T, p, K)

    # single workspace reused by every lasso_reg call; the trailing argument
    # is a dummy value standing in for the MCP gamma parameter
    v = FISTAVariables(x, y, lambdas, convert(T, 3))

    for i = 1:K
        λ = v.lambdas[i]
        quiet || print_with_color(:blue, "Computing iteration $i with λ = $(λ).\n\n")

        # warm-started FISTA solve at the current λ
        fit = lasso_reg(x, y, λ, v=v, max_iter=max_iter, tol=tol, mu=mu,
                        backtrack_size=backtrack_size, quiet=quiet)
        B[:,i] = sparsevec(fit.beta)

        # a saturated model means smaller λ values add nothing new: stop early
        if countnz(fit.beta) == p
            quiet || warn("Model at λ = ", λ, " is completely saturated.\nlasso_path will terminate early...\n")
            return B[:,1:i]
        end
    end
    return B
end
"""
    one_fold(x, y, K, folds, fold; kwargs...) -> Vector

Fit a full LASSO regularization path on the training portion of one
crossvalidation fold and return the mean out-of-sample squared error on the
held-out portion, one entry per regularization parameter.
"""
function one_fold{T <: Float}(
    x :: DenseMatrix{T},
    y :: DenseVector{T},
    K :: Int,
    folds :: DenseVector{Int},
    fold :: Int;
    tol :: T = convert(T, 1e-4),
    max_iter :: Int = 100,
    mu :: T = one(T),
    backtrack_size :: T = convert(T, 0.5),
    quiet :: Bool = true,
    lambdas :: Vector{T} = compute_lambdas(x,y,K)
) :: Vector{T}
    # samples assigned to this fold form the TEST set; the rest train
    in_test  = folds .== fold
    n_test   = sum(in_test)
    in_train = !in_test

    # compute the regularization path on the training samples only
    path = lasso_path(x[in_train,:], y[in_train], K=K, tol=tol,
                      max_iter=max_iter, quiet=quiet, mu=mu,
                      backtrack_size=backtrack_size, lambdas=lambdas)

    # out-of-sample predictions and residuals for every model on the path
    yhat  = view(x, in_test, :) * path
    resid = y[in_test] .- yhat

    # per-λ mean squared error, scaled consistently with the training loss
    return vec(sumabs2(resid, 1)) ./ (2*n_test)
end
"""
    pfold(x, y, K, folds, q; pids=procs(), kwargs...) -> Vector

Distribute the `q` crossvalidation folds across the worker processes in
`pids`, collect the per-fold, per-λ errors into a shared `K × q` matrix, and
return the row-wise average (one mean error per regularization parameter).
"""
function pfold{T <: Float}(
    x :: DenseMatrix{T},
    y :: DenseVector{T},
    K :: Int,
    folds :: DenseVector{Int},
    q :: Int;
    pids :: Vector{Int} = procs(),
    tol :: T = convert(T, 1e-4),
    max_iter :: Int = 100,
    mu :: T = one(T),
    backtrack_size :: T = convert(T, 0.5),
    quiet :: Bool = true,
    lambdas :: Vector{T} = compute_lambdas(x,y,K)
) :: Vector{T}
    # how many CPU processes can pfold use?
    np = length(pids)
    # report on CPU processes
    quiet || println("pfold: np = ", np)
    quiet || println("pids = ", pids)
    # shared fold counter closed over by every feeder task below; safe without
    # locking because all @async tasks run cooperatively on the master process
    i = 1
    nextidx() = (idx=i; i+=1; idx)
    # preallocate shared K × q matrix: one column of per-λ errors per fold
    results = SharedArray(T, (K,q), pids=pids) :: SharedMatrix{T}
    # master process will distribute tasks to workers;
    # @sync blocks until every feeder task (and thus every fold) has finished
    @sync begin
        # one feeder task per worker process
        for worker in pids
            # exclude process that launched pfold, unless only one process is available
            if worker != myid() || np == 1
                # asynchronously distribute tasks
                @async begin
                    while true
                        # grab next fold
                        current_fold = nextidx()
                        # if current fold exceeds total number of folds then exit loop
                        current_fold > q && break
                        # report distribution of fold to worker and device
                        quiet || print_with_color(:blue, "Computing fold $current_fold on worker $worker.\n\n")
                        # launch job on worker; blocks this task (not the master)
                        # until the worker returns the fold's error vector
                        r = remotecall_fetch(worker) do
                            one_fold(x, y, K, folds, current_fold, tol=tol, max_iter=max_iter, backtrack_size=backtrack_size, mu=mu, quiet=quiet, lambdas=lambdas)
                        end # end remotecall_fetch()
                        # store this fold's errors in its column of the shared matrix
                        setindex!(results, r, :, current_fold)
                    end # end while
                end # end @async
            end # end if
        end # end for
    end # end @sync
    # return reduction (row-wise mean over folds) of the per-fold errors
    return (vec(sum(results, 2) ./ q))
end
"""
    cv_lasso(x, y; q=..., K=20, kwargs...) -> FISTACrossvalidationResults

Run `q`-fold crossvalidation of the LASSO over a grid of `K` regularization
parameters, select the λ with the lowest mean out-of-sample error, and refit
the model at that λ on the full data.

# Keywords
- `q`: number of crossvalidation folds (must be at least 3).
- `K`: number of regularization parameters on the path.
- `lambdas`: the regularization grid itself.
- `folds`: fold assignment (an integer in `1:q`) for each sample.
- `pids`: process ids over which fold computations are distributed.
- `tol`, `max_iter`, `mu`, `backtrack_size`, `quiet`: passed through to the
  underlying FISTA solver.

Returns a `FISTACrossvalidationResults` with the per-λ MSEs, the chosen λ,
the nonzero refit coefficients, their indices, and the model size.
"""
function cv_lasso{T <: Float}(
    x :: DenseMatrix{T},
    y :: DenseVector{T};
    q :: Int = cv_get_num_folds(3, 5),
    K :: Int = 20,
    lambdas :: Vector{T} = compute_lambdas(x,y,K),
    folds :: DenseVector{Int} = cv_get_folds(sdata(y),q),
    pids :: Vector{Int} = procs(),
    tol :: T = convert(T, 1e-4),
    max_iter :: Int = 100,
    mu :: T = one(T),
    backtrack_size :: T = convert(T, 0.5),
    quiet :: Bool = true,
)
    # do not allow crossvalidation with fewer than 3 folds
    q > 2 || throw(ArgumentError("Number of folds q = $q must be at least 3."))

    # problem dimensions and conformability check
    n,p = size(x)
    n == length(y) || throw(DimensionMismatch("Row dimension of x ($n) must match number of rows in y ($(length(y)))"))

    # compute each fold's path asynchronously over the processes in pids;
    # the master process reduces the fold errors to one mean MSE per λ.
    # BUGFIX: was `backtrack_size=backtrack=backtrack_size`, a typo that
    # routed the value through a spurious assignment to `backtrack`
    mses = pfold(x, y, K, folds, q, pids=pids, tol=tol, max_iter=max_iter, backtrack_size=backtrack_size, mu=mu, quiet=quiet, lambdas=lambdas)

    # pick λ with the smallest MSE; on exact ties indmin returns the first index
    i = indmin(mses)
    λ = lambdas[i] :: T

    # refit the best model on the full data at the chosen λ
    output = lasso_reg(sdata(x), sdata(y), λ, K=K, max_iter=max_iter, backtrack_size=backtrack_size, quiet=quiet, tol=tol, mu=mu)

    # extract the support, its coefficients, and the model size
    bidx = find(output.beta)
    b    = output.beta[bidx]
    k    = countnz(output.beta)

    return FISTACrossvalidationResults(mses, λ, b, bidx, k)
end