JuliaLang · stevengj · May 6, 2014 · May 3, 2014 · May 5, 2014
diff --git a/NEWS.md b/NEWS.md
@@ -175,6 +175,8 @@ Library improvements
 
  * `eigs(A, sigma)` now uses shift-and-invert for nonzero shifts `sigma` and inverse iteration for `which="SM"`. If `sigma==nothing` (the new default), computes ordinary (forward) iterations. ([#5776])
 
+ * `sprand` is faster, and whether any entry is nonzero is now determined independently with the specified probability ([#6726]).
+
  * Dense linear algebra for special matrix types
 
  * Interconversions between the special matrix types `Diagonal`, `Bidiagonal`,
@@ -253,6 +255,8 @@ Library improvements
  * Extended API for ``cov`` and ``cor``, which accept keyword arguments ``vardim``, 
  ``corrected``, and ``mean`` ([#6273])
 
+ * New functions `randsubseq` and `randsubseq!` to create a random subsequence of an array ([#6726])
+
 
 Deprecated or removed
 ---------------------

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
@@ -1314,3 +1314,44 @@ push!(A, a, b, c...) = push!(push!(A, a, b), c...)
 unshift!(A) = A
 unshift!(A, a, b) = unshift!(unshift!(A, b), a)
 unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)
+
+# Fill S (resized as needed) with a random subsequence of A, where
+# each element of A is included in S with independent probability p.
+# (Note that this is different from the problem of finding a random
+# size-m subset of A where m is fixed!)
+function randsubseq!(S::AbstractArray, A::AbstractArray, p::Real)
+ 0 <= p <= 1 || throw(ArgumentError("probability $p not in [0,1]"))
+ n = length(A)
+ p == 1 && return copy!(resize!(S, n), A)
+ empty!(S)
+ p == 0 && return S
+ nexpected = p * length(A)
+ sizehint(S, iround(nexpected + 5*sqrt(nexpected)))
+ if p > 0.15 # empirical threshold for trivial O(n) algorithm to be better
+ for i = 1:n
+ rand() <= p && push!(S, A[i])
+ end
+ else
+ # Skip through A, in order, from each element i to the next element i+s
+ # included in S. The probability that the next included element is 
+ # s==k (k > 0) is (1-p)^(k-1) * p, and hence the probability (CDF) that
+ # s is in {1,...,k} is 1-(1-p)^k = F(k). Thus, we can draw the skip s
+ # from this probability distribution via the discrete inverse-transform
+ # method: s = iceil(F^{-1}(u)) where u = rand(), which is simply
+ # s = iceil(log(rand()) / log1p(-p)).
+ L = 1 / log1p(-p)
+ i = 0
+ while true
+ s = log(rand()) * L # note that rand() < 1, so s > 0
+ s >= n - i && return S # compare before iceil to avoid overflow
+ push!(S, A[i += iceil(s)])
+ end
+ # [This algorithm is similar in spirit to, but much simpler than,
+ # the one by Vitter for a related problem in "Faster methods for
+ # random sampling," Comm. ACM Magazine 7, 703-718 (1984).]
+ end
+ return S
+end
+
+randsubseq{T}(A::AbstractArray{T}, p::Real) = randsubseq!(T[], A, p)
+
diff --git a/base/exports.jl b/base/exports.jl
@@ -545,6 +545,8 @@ export
  promote_shape,
  randcycle,
  randperm,
+ randsubseq!,
+ randsubseq,
  range,
  reducedim,
  repmat,

diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl
@@ -335,56 +335,48 @@ function findnz{Tv,Ti}(S::SparseMatrixCSC{Tv,Ti})
  return (I, J, V)
 end
 
-truebools(n::Integer) = ones(Bool, n)
-function sprand{T}(m::Integer, n::Integer, density::FloatingPoint, rng::Function,::Type{T}=eltype(rng(1)))
- 0 <= density <= 1 || throw(ArgumentError("density must be between 0 and 1"))
+function sprand{T}(m::Integer, n::Integer, density::FloatingPoint,
+  rng::Function,::Type{T}=eltype(rng(1)))
+ 0 <= density <= 1 || throw(ArgumentError("$density not in [0,1]"))
  N = n*m
  N == 0 && return spzeros(T,m,n)
  N == 1 && return rand() <= density ? sparse(rng(1)) : spzeros(T,1,1)
- # if density < 0.5, we'll randomly generate the indices to set
- # otherwise, we'll randomly generate the indices to skip
- K = (density > 0.5) ? N*(1-density) : N*density
- # Use Newton's method to invert the birthday problem
- l = log(1.0-1.0/N)
- k = K
- k = k + ((1-K/N)*exp(-k*l) - 1)/l
- k = k + ((1-K/N)*exp(-k*l) - 1)/l # for K<N/2, 2 iterations suffice
- ik = int(k)
- ind = sort(rand(1:N, ik))
- uind = Array(Int, 0) # unique indices
- sizehint(uind, int(N*density))
- if density < 0.5
- if ik == 0
- return sparse(Int[],Int[],Array(T,0),m,n)
- end
- j = ind[1]
- push!(uind, j)
- uj = j
- for i = 2:length(ind)
- j = ind[i]
- if j != uj
- push!(uind, j)
- uj = j
- end
- end
- else
- push!(ind, N+1) # sentinel
- ii = 1
- for i = 1:N
- if i != ind[ii]
- push!(uind, i)
- else
- while (i == ind[ii])
- ii += 1
- end
- end
+
+ I, J = Array(Int, 0), Array(Int, 0) # indices of nonzero elements
+ sizehint(I, int(N*density))
+ sizehint(J, int(N*density))
+
+ # density of nonzero columns:
+ L = log1p(-density)
+ coldensity = -expm1(m*L) # = 1 - (1-density)^m 
+ colsparsity = exp(m*L) # = 1 - coldensity
+ L = 1/L
+
+ rows = Array(Int, 0)
+ for j in randsubseq(1:n, coldensity)
+ # To get the right statistics, we *must* have a nonempty column j
+ # even if p*m << 1. To do this, we use an approach similar to
+ # the one in randsubseq to compute the expected first nonzero row k,
+ # except given that at least one is nonzero (via Bayes' rule);
+ # carefully rearranged to avoid excessive roundoff errors.
+ k = ceil(log(colsparsity + rand()*coldensity) * L)
+ ik = k < 1 ? 1 : k > m ? m : int(k) # roundoff-error/underflow paranoia
+ randsubseq!(rows, 1:m-ik, density)
+ push!(rows, m-ik+1)
+ append!(I, rows)
+ nrows = length(rows)
+ Jlen = length(J)
+ resize!(J, Jlen+nrows)
+ for i = Jlen+1:length(J)
+ J[i] = j
  end
  end
- I, J = ind2sub((m,n), uind)
- return sparse_IJ_sorted!(I, J, rng(length(uind)), m, n, +) # it will never need to combine
+ return sparse_IJ_sorted!(I, J, rng(length(I)), m, n, +) # it will never need to combine
 end
+
 sprand(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,rand,Float64)
 sprandn(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,randn,Float64)
+truebools(n::Integer) = ones(Bool, n)
 sprandbool(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,truebools,Bool)
 
 spones{T}(S::SparseMatrixCSC{T}) =

diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -3814,6 +3814,20 @@ Indexing, Assignment, and Concatenation
 
  Throw an error if the specified indexes are not in bounds for the given array.
 
+.. function:: randsubseq(A, p) -> Vector
+
+ Return a vector consisting of a random subsequence of the given array ``A``,
+ where each element of ``A`` is included (in order) with independent
+ probability ``p``. (Complexity is linear in ``p*length(A)``, so this
+ function is efficient even if ``p`` is small and ``A`` is large.) Technically,
+ this process is known as "Bernoulli sampling" of ``A``.
+
+.. function:: randsubseq!(S, A, p)
+
+ Like ``randsubseq``, but the results are stored in ``S`` (which is
+ resized as needed).
+
+
 Array functions
 ~~~~~~~~~~~~~~~
 

diff --git a/doc/stdlib/sparse.rst b/doc/stdlib/sparse.rst
@@ -55,17 +55,17 @@ Sparse matrices support much of the same set of operations as dense matrices. Th
 
  Construct a sparse diagonal matrix. ``B`` is a tuple of vectors containing the diagonals and ``d`` is a tuple containing the positions of the diagonals. In the case the input contains only one diagonaly, ``B`` can be a vector (instead of a tuple) and ``d`` can be the diagonal position (instead of a tuple), defaulting to 0 (diagonal). Optionally, ``m`` and ``n`` specify the size of the resulting sparse matrix.
 
-.. function:: sprand(m,n,density[,rng])
+.. function:: sprand(m,n,p[,rng])
 
- Create a random sparse matrix with the specified density. Nonzeros are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified.
+ Create a random ``m`` by ``n`` sparse matrix, in which the probability of any element being nonzero is independently given by ``p`` (and hence the mean density of nonzeros is also exactly ``p``). Nonzero values are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified.
 
-.. function:: sprandn(m,n,density)
+.. function:: sprandn(m,n,p)
 
- Create a random sparse matrix of specified density with nonzeros sampled from the normal distribution.
+ Create a random ``m`` by ``n`` sparse matrix with the specified (independent) probability ``p`` of any entry being nonzero, where nonzero values are sampled from the normal distribution.
 
-.. function:: sprandbool(m,n,density)
+.. function:: sprandbool(m,n,p)
 
- Create a random sparse boolean matrix with the specified density.
+ Create a random ``m`` by ``n`` sparse boolean matrix with the specified (independent) probability ``p`` of any entry being ``true``.
 
 .. function:: etree(A[, post])