From db3b28d37cbb99a0f40a730b005e74d92fd1f1a1 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Thu, 13 Feb 2014 20:32:25 -0500 Subject: [PATCH 1/2] Add unique(AbstractArray, dim) Efficiently finds the unique columns, rows, etc. of an array. The algorithm first hashes each row, then finds the unique hashes, and finally checks that the hashes don't collide. It is roughly O(n) in the number of elements in the matrix. This is my first time using Cartesian. Without it, this code is presently about 10% faster for finding unique rows of a matrix, but the overhead is probably worth it for the generality. --- base/multidimensional.jl | 67 ++++++++++++++++++++++++++++++++++++++++ test/arrayops.jl | 26 ++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 883476c965d17..228da75ea7728 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -410,3 +410,70 @@ for (V, PT, BT) in [((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)] return P end end + +## unique across dim + +immutable Prehashed + hash::Uint +end +hash(x::Prehashed) = x.hash + +@ngenerate N typeof(A) function unique{T,N}(A::AbstractArray{T,N}, dim::Int) + 1 <= dim <= N || return copy(A) + hashes = zeros(Uint, size(A, dim)) + + # Compute hash for each row + j = 0 + @nloops N i A d->(if d == dim; j = i_d; end) begin + @inbounds hashes[j] = bitmix(hashes[j], hash((@nref N A i))) + end + + # Collect index of first row for each hash + uniquerow = Array(Int, size(A, dim)) + firstrow = Dict{Prehashed,Int}() + for j = 1:size(A, dim) + uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j) + end + uniquerows = collect(values(firstrow)) + + # Check for collisions + collided = falses(size(A, dim)) + @inbounds begin + @nloops N i A d->(if d == dim; j = i_d; end) begin + if (@nref N A d->ifelse(d == dim, uniquerow[j], i_d)) != (@nref N A i) + collided[j] = true + end + end + end + + if any(collided) + nowcollided = BitArray(size(A, dim)) + while any(collided) + # Collect index of first row for each collided hash + empty!(firstrow) + for j = 1:size(A, dim) + collided[j] || continue + uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j) + end + for v in values(firstrow) + push!(uniquerows, v) + end + + # Check for collisions + fill!(nowcollided, false) + @nloops N i A d->begin + if d == dim + j = i_d + (!collided[j] || uniquerow[j] == j) && continue + end + end begin + if (@nref N A d->ifelse(d == dim, uniquerow[j], i_d)) != (@nref N A i) + nowcollided[j] = true + end + end + (collided, nowcollided) = (nowcollided, collided) + end + end + + @nref N A d->d == dim ? sort!(uniquerows) : (1:size(A, d)) +end diff --git a/test/arrayops.jl b/test/arrayops.jl index 0917e69bb73ee..dc9ae60d6d3a3 100644 --- a/test/arrayops.jl +++ b/test/arrayops.jl @@ -331,6 +331,32 @@ for i = tensors @test isequal(i,permutedims(ipermutedims(i,perm),perm)) end +## unique across dim ## + +# All rows and columns unique +A = ones(10, 10) +A[diagind(A)] = shuffle!([1:10]) +@test unique(A, 1) == A +@test unique(A, 2) == A + +# 10 repeats of each row +B = A[shuffle!(repmat(1:10, 10)), :] +C = unique(B, 1) +@test sortrows(C) == sortrows(A) +@test unique(B, 2) == B +@test unique(B.', 2).' == C + +# Along third dimension +D = cat(3, B, B) +@test unique(D, 1) == cat(3, C, C) +@test unique(D, 3) == cat(3, B) + +# With hash collisions +immutable HashCollision + x::Float64 +end +Base.hash(::HashCollision) = uint(0) +@test map(x->x.x, unique(map(HashCollision, B), 1)) == C ## reduce ## From c402aeae250e9c977a09c77dbfa7645a599035f7 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Fri, 14 Feb 2014 11:57:05 -0500 Subject: [PATCH 2/2] unique(AbstractArray, dim): incorporate @timholy's suggestions --- base/multidimensional.jl | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 228da75ea7728..bfa0ce5851e09 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -423,25 +423,30 @@ hash(x::Prehashed) = x.hash hashes = zeros(Uint, size(A, dim)) # Compute hash for each row - j = 0 - @nloops N i A d->(if d == dim; j = i_d; end) begin - @inbounds hashes[j] = bitmix(hashes[j], hash((@nref N A i))) + k = 0 + @nloops N i A d->(if d == dim; k = i_d; end) begin + @inbounds hashes[k] = bitmix(hashes[k], hash((@nref N A i))) end # Collect index of first row for each hash uniquerow = Array(Int, size(A, dim)) firstrow = Dict{Prehashed,Int}() - for j = 1:size(A, dim) - uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j) + for k = 1:size(A, dim) + uniquerow[k] = get!(firstrow, Prehashed(hashes[k]), k) end uniquerows = collect(values(firstrow)) # Check for collisions collided = falses(size(A, dim)) @inbounds begin - @nloops N i A d->(if d == dim; j = i_d; end) begin - if (@nref N A d->ifelse(d == dim, uniquerow[j], i_d)) != (@nref N A i) - collided[j] = true + @nloops N i A d->(if d == dim + k = i_d + j_d = uniquerow[k] + else + j_d = i_d + end) begin + if (@nref N A j) != (@nref N A i) + collided[k] = true end end end @@ -463,12 +468,15 @@ hash(x::Prehashed) = x.hash fill!(nowcollided, false) @nloops N i A d->begin if d == dim - j = i_d - (!collided[j] || uniquerow[j] == j) && continue + k = i_d + j_d = uniquerow[k] + (!collided[k] || j_d == k) && continue + else + j_d = i_d end end begin - if (@nref N A d->ifelse(d == dim, uniquerow[j], i_d)) != (@nref N A i) - nowcollided[j] = true + if (@nref N A j) != (@nref N A i) + nowcollided[k] = true end end (collided, nowcollided) = (nowcollided, collided)