Skip to content

Commit

Permalink
faster sprand, new randsubseq (fixes JuliaLang#6714 and JuliaLang#6708)
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed May 3, 2014
1 parent 27ab7af commit 4d6b47d
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 43 deletions.
40 changes: 40 additions & 0 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1314,3 +1314,43 @@ push!(A, a, b, c...) = push!(push!(A, a, b), c...)
unshift!(A) = A
unshift!(A, a, b) = unshift!(unshift!(A, b), a)
unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)

# Fill S (resized as needed) with a random subsequence of A, where
# each element of A is included in S with independent probability p.
# (Note that this is different from the problem of finding a random
# size-m subset of A where m is fixed!)
function randsubseq!(S::AbstractArray, A::AbstractArray, p::Real)
0 <= p <= 1 || throw(ArgumentError("probability $p not in [0,1]"))
n = length(A)
p == 1 && return copy!(resize!(S, n), A)
empty!(S)
p == 0 && return S
sizehint(S, int(p * length(A)))
if p > 0.15 # empirical threshold for trivial O(n) algorithm to be better
for i = 1:n
rand() <= p && push!(S, A[i])
end
else
# Skip through A, in order, from each element i to the next element i+s
# included in S. The probability that the next included element is
# s==k (k > 0) is (1-p)^(k-1) * p, and hence the probability (CDF) that
# s is in {1,...,k} is 1-(1-p)^k = F(k). Thus, we can draw the skip s
# from this probability distribution via the discrete inverse-transform
# method: s = iceil(F^{-1}(u)) where u = rand(), which is simply
# s = iceil(log(rand()) / log1p(-p)).
L = 1 / log1p(-p)
i = 0
while true
s = log(rand()) * L # note that rand() < 1, so s > 0
s >= n - i && return S # compare before iceil to avoid overflow
push!(S, A[i += iceil(s)])
end
# [This algorithm is similar in spirit to, but much simpler than,
# the one by Vitter for a related problem in "Faster methods for
# random sampling," Comm. ACM Magazine 7, 703-718 (1984).]
end
return S
end

randsubseq{T}(A::AbstractArray{T}, p::Real) = randsubseq!(T[], A, p)

2 changes: 2 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,8 @@ export
promote_shape,
randcycle,
randperm,
randsubseq!,
randsubseq,
range,
reducedim,
repmat,
Expand Down
76 changes: 34 additions & 42 deletions base/sparse/sparsematrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -335,56 +335,48 @@ function findnz{Tv,Ti}(S::SparseMatrixCSC{Tv,Ti})
return (I, J, V)
end

truebools(n::Integer) = ones(Bool, n)
function sprand{T}(m::Integer, n::Integer, density::FloatingPoint, rng::Function,::Type{T}=eltype(rng(1)))
0 <= density <= 1 || throw(ArgumentError("density must be between 0 and 1"))
function sprand{T}(m::Integer, n::Integer, density::FloatingPoint,
rng::Function,::Type{T}=eltype(rng(1)))
0 <= density <= 1 || throw(ArgumentError("$density not in [0,1]"))
N = n*m
N == 0 && return spzeros(T,m,n)
N == 1 && return rand() <= density ? sparse(rng(1)) : spzeros(T,1,1)
# if density < 0.5, we'll randomly generate the indices to set
# otherwise, we'll randomly generate the indices to skip
K = (density > 0.5) ? N*(1-density) : N*density
# Use Newton's method to invert the birthday problem
l = log(1.0-1.0/N)
k = K
k = k + ((1-K/N)*exp(-k*l) - 1)/l
k = k + ((1-K/N)*exp(-k*l) - 1)/l # for K<N/2, 2 iterations suffice
ik = int(k)
ind = sort(rand(1:N, ik))
uind = Array(Int, 0) # unique indices
sizehint(uind, int(N*density))
if density < 0.5
if ik == 0
return sparse(Int[],Int[],Array(T,0),m,n)
end
j = ind[1]
push!(uind, j)
uj = j
for i = 2:length(ind)
j = ind[i]
if j != uj
push!(uind, j)
uj = j
end
end
else
push!(ind, N+1) # sentinel
ii = 1
for i = 1:N
if i != ind[ii]
push!(uind, i)
else
while (i == ind[ii])
ii += 1
end
end

I, J = Array(Int, 0), Array(Int, 0) # indices of nonzero elements
sizehint(I, int(N*density))
sizehint(J, int(N*density))

# density of nonzero columns:
L = log1p(-density)
coldensity = -expm1(m*L) # = 1 - (1-density)^m
colsparsity = exp(m*L) # = 1 - coldensity
L = 1/L

rows = Array(Int, 0)
for j in randsubseq(1:n, coldensity)
# To get the right statistics, we *must* have a nonempty column j
# even if p*m << 1. To do this, we use an approach similar to
# the one in randsubseq to compute the expected first nonzero row k,
# except given that at least one is nonzero (via Bayes' rule);
# carefully rearranged to avoid excessive roundoff errors.
k = ceil(log(colsparsity + rand()*coldensity) * L)
ik = k < 1 ? 1 : k > m ? m : int(k) # roundoff-error/underflow paranoia
randsubseq!(rows, 1:m-ik, density)
push!(rows, m-ik+1)
append!(I, rows)
nrows = length(rows)
Jlen = length(J)
resize!(J, Jlen+nrows)
for i = Jlen+1:length(J)
J[i] = j
end
end
I, J = ind2sub((m,n), uind)
return sparse_IJ_sorted!(I, J, rng(length(uind)), m, n, +) # it will never need to combine
return sparse_IJ_sorted!(I, J, rng(length(I)), m, n, +) # it will never need to combine
end

sprand(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,rand,Float64)
sprandn(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,randn,Float64)
truebools(n::Integer) = ones(Bool, n)
sprandbool(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,truebools,Bool)

spones{T}(S::SparseMatrixCSC{T}) =
Expand Down
13 changes: 13 additions & 0 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3814,6 +3814,19 @@ Indexing, Assignment, and Concatenation

Throw an error if the specified indexes are not in bounds for the given array.

.. function:: randsubseq(A, p) -> Vector

Return a vector consisting of a random subsequence of the given array ``A``,
where each element of ``A`` is included (in order) with independent
probability ``p``. (Complexity is linear in ``p*length(A)``, so this
function is efficient even if ``p`` is small and ``A`` is large.)

.. function:: randsubseq!(S, A, p)

Like ``randsubseq``, but the results are stored in ``S`` (which is
resized as needed).


Array functions
~~~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion doc/stdlib/sparse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ Sparse matrices support much of the same set of operations as dense matrices. Th

.. function:: sprand(m,n,density[,rng])

Create a random sparse matrix with the specified density. Nonzeros are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified.
Create a random sparse matrix, in which the probability of any element being nonzero is given by ``density``. Nonzeros are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified.

.. function:: sprandn(m,n,density)

Expand Down

0 comments on commit 4d6b47d

Please sign in to comment.