From c8c5b6f39cb40c66834035216ddad1af3884f4e6 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 20 Nov 2014 15:38:44 -0600
Subject: [PATCH 1/9] Use stagedfunctions instead of @ngenerate and @nsplat

This commit tackles all files except multidimensional.jl
---
 base/broadcast.jl  | 64 +++++++++++++++++++++++++------------------
 base/reducedim.jl  | 68 +++++++++++++++++++++++++---------------------
 base/statistics.jl | 62 ++++++++++++++++++++++--------------------
 3 files changed, 106 insertions(+), 88 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 8e29f437c4c28..0be1863bfd15f 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -239,38 +239,48 @@ broadcast!_function(f::Function) = (B, As...) -> broadcast!(f, B, As...)
 broadcast_function(f::Function) = (As...) -> broadcast(f, As...)
 
 broadcast_getindex(src::AbstractArray, I::AbstractArray...) = broadcast_getindex!(Array(eltype(src), broadcast_shape(I...)), src, I...)
-@ngenerate N typeof(dest) function broadcast_getindex!(dest::AbstractArray, src::AbstractArray, I::NTuple{N, AbstractArray}...)
-    check_broadcast_shape(size(dest), I...)  # unnecessary if this function is never called directly
-    checkbounds(src, I...)
-    @nloops N i dest d->(@nexprs N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
-        @nexprs N k->(@inbounds J_k = @nref N I_k d->j_d_k)
-        @inbounds (@nref N dest i) = (@nref N src J)
+stagedfunction broadcast_getindex!(dest::AbstractArray, src::AbstractArray, I::AbstractArray...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        @nexprs $N d->(I_d = I[d])
+        check_broadcast_shape(size(dest), $(Isplat...))  # unnecessary if this function is never called directly
+        checkbounds(src, $(Isplat...))
+        @nloops $N i dest d->(@nexprs $N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
+            @nexprs $N k->(@inbounds J_k = @nref $N I_k d->j_d_k)
+            @inbounds (@nref $N dest i) = (@nref $N src J)
+        end
+        dest
     end
-    dest
 end
 
-@ngenerate N typeof(A) function broadcast_setindex!(A::AbstractArray, x, I::NTuple{N, AbstractArray}...)
-    checkbounds(A, I...)
-    shape = broadcast_shape(I...)
-    @nextract N shape d->(length(shape) < d ? 1 : shape[d])
-    if !isa(x, AbstractArray)
-        @nloops N i d->(1:shape_d) d->(@nexprs N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
-            @nexprs N k->(@inbounds J_k = @nref N I_k d->j_d_k)
-            @inbounds (@nref N A J) = x
-        end
-    else
-        X = x
-        # To call setindex_shape_check, we need to create fake 1-d indexes of the proper size
-        @nexprs N d->(fakeI_d = 1:shape_d)
-        Base.setindex_shape_check(X, (@ntuple N fakeI)...)
-        k = 1
-        @nloops N i d->(1:shape_d) d->(@nexprs N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
-            @nexprs N k->(@inbounds J_k = @nref N I_k d->j_d_k)
-            @inbounds (@nref N A J) = X[k]
-            k += 1
+stagedfunction broadcast_setindex!(A::AbstractArray, x, I::AbstractArray...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        @nexprs $N d->(I_d = I[d])
+        checkbounds(A, $(Isplat...))
+        shape = broadcast_shape($(Isplat...))
+        @nextract $N shape d->(length(shape) < d ? 1 : shape[d])
+        if !isa(x, AbstractArray)
+            @nloops $N i d->(1:shape_d) d->(@nexprs $N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
+                @nexprs $N k->(@inbounds J_k = @nref $N I_k d->j_d_k)
+                @inbounds (@nref $N A J) = x
+            end
+        else
+            X = x
+            # To call setindex_shape_check, we need to create fake 1-d indexes of the proper size
+            @nexprs $N d->(fakeI_d = 1:shape_d)
+            Base.setindex_shape_check(X, (@ntuple $N fakeI)...)
+            k = 1
+            @nloops $N i d->(1:shape_d) d->(@nexprs $N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
+                @nexprs $N k->(@inbounds J_k = @nref $N I_k d->j_d_k)
+                @inbounds (@nref $N A J) = X[k]
+                k += 1
+            end
         end
+        A
     end
-    A
 end
 
 ## elementwise operators ##
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 351e574458d61..efabd282d8329 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -154,38 +154,40 @@ function check_reducdims(R, A)
     return lsiz
 end
 
-@ngenerate N typeof(R) function _mapreducedim!{T,N}(f, op, R::AbstractArray, A::AbstractArray{T,N})
-    lsiz = check_reducdims(R, A)
-    isempty(A) && return R
-    @nextract N sizeR d->size(R,d)
-    sizA1 = size(A, 1)
-
-    if has_fast_linear_indexing(A) && lsiz > 16
-        # use mapreduce_impl, which is probably better tuned to achieve higher performance
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        for i = 1:nslices
-            @inbounds R[i] = op(R[i], mapreduce_impl(f, op, A, ibase+1, ibase+lsiz))
-            ibase += lsiz
-        end
-    elseif size(R, 1) == 1 && sizA1 > 1
-        # keep the accumulator as a local variable when reducing along the first dimension
-        @nloops N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-            @inbounds r = (@nref N R j)
-            for i_1 = 1:sizA1
-                @inbounds v = f(@nref N A i)
-                r = op(r, v)
+stagedfunction _mapreducedim!{T,N}(f, op, R::AbstractArray, A::AbstractArray{T,N})
+    quote
+        lsiz = check_reducdims(R, A)
+        isempty(A) && return R
+        @nextract $N sizeR d->size(R,d)
+        sizA1 = size(A, 1)
+
+        if has_fast_linear_indexing(A) && lsiz > 16
+            # use mapreduce_impl, which is probably better tuned to achieve higher performance
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            for i = 1:nslices
+                @inbounds R[i] = op(R[i], mapreduce_impl(f, op, A, ibase+1, ibase+lsiz))
+                ibase += lsiz
+            end
+        elseif size(R, 1) == 1 && sizA1 > 1
+            # keep the accumulator as a local variable when reducing along the first dimension
+            @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds r = (@nref $N R j)
+                for i_1 = 1:sizA1
+                    @inbounds v = f(@nref $N A i)
+                    r = op(r, v)
+                end
+                @inbounds (@nref $N R j) = r
+            end
+        else
+            # general implementation
+            @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds v = f(@nref $N A i)
+                @inbounds (@nref $N R j) = op((@nref $N R j), v)
             end
-            @inbounds (@nref N R j) = r
-        end
-    else
-        # general implementation
-        @nloops N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-            @inbounds v = f(@nref N A i)
-            @inbounds (@nref N R j) = op((@nref N R j), v)
         end
+        return R
     end
-    return R
 end
 
 mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) = _mapreducedim!(f, op, R, A)
@@ -290,13 +292,17 @@ function gen_findreduction_body(N, f::Function)
     end
 end
 
-eval(ngenerate(:N, :(typeof((Rval,Rind))), :(_findmin!{T,N}(Rval::AbstractArray, Rind::AbstractArray, A::AbstractArray{T,N})), N->gen_findreduction_body(N, <)))
+stagedfunction _findmin!{T,N}(Rval::AbstractArray, Rind::AbstractArray, A::AbstractArray{T,N})
+    gen_findreduction_body(N, <)
+end
 findmin!{R}(rval::AbstractArray{R}, rind::AbstractArray, A::AbstractArray; init::Bool=true) = _findmin!(initarray!(rval, typemax(R), init), rind, A)
 findmin{T}(A::AbstractArray{T}, region) =
     isempty(A) ? (similar(A,reduced_dims0(A,region)), zeros(Int,reduced_dims0(A,region))) :
                   _findmin!(reducedim_initarray0(A, region, typemax(T)), zeros(Int,reduced_dims0(A,region)), A)
 
-eval(ngenerate(:N, :(typeof((Rval,Rind))), :(_findmax!{T,N}(Rval::AbstractArray, Rind::AbstractArray, A::AbstractArray{T,N})), N->gen_findreduction_body(N, >)))
+stagedfunction _findmax!{T,N}(Rval::AbstractArray, Rind::AbstractArray, A::AbstractArray{T,N})
+    gen_findreduction_body(N, >)
+end
 findmax!{R}(rval::AbstractArray{R}, rind::AbstractArray, A::AbstractArray; init::Bool=true) = _findmax!(initarray!(rval, typemin(R), init), rind, A)
 findmax{T}(A::AbstractArray{T}, region) =
     isempty(A) ? (similar(A,reduced_dims0(A,region)), zeros(Int,reduced_dims0(A,region))) :
diff --git a/base/statistics.jl b/base/statistics.jl
index 8a3019b40337a..d7d3ee4730e49 100644
--- a/base/statistics.jl
+++ b/base/statistics.jl
@@ -102,39 +102,41 @@ call(f::CentralizedAbs2Fun, x) = abs2(x - f.m)
 centralize_sumabs2(A::AbstractArray, m::Number, ifirst::Int, ilast::Int) =
     mapreduce_impl(CentralizedAbs2Fun(m), AddFun(), A, ifirst, ilast)
 
-@ngenerate N typeof(R) function centralize_sumabs2!{S,T,N}(R::AbstractArray{S}, A::AbstractArray{T,N}, means::AbstractArray)
-    # following the implementation of _mapreducedim! at base/reducedim.jl
-    lsiz = check_reducdims(R, A)
-    isempty(R) || fill!(R, zero(S))
-    isempty(A) && return R
-    @nextract N sizeR d->size(R,d)
-    sizA1 = size(A, 1)
-
-    if has_fast_linear_indexing(A) && lsiz > 16
-        # use centralize_sumabs2, which is probably better tuned to achieve higher performance
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        for i = 1:nslices
-            @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz)
-            ibase += lsiz
-        end
-    elseif size(R, 1) == 1 && sizA1 > 1
-        # keep the accumulator as a local variable when reducing along the first dimension
-        @nloops N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-            @inbounds r = (@nref N R j)
-            @inbounds m = (@nref N means j)
-            for i_1 = 1:sizA1
-                @inbounds r += abs2((@nref N A i) - m)
+stagedfunction centralize_sumabs2!{S,T,N}(R::AbstractArray{S}, A::AbstractArray{T,N}, means::AbstractArray)
+    quote
+        # following the implementation of _mapreducedim! at base/reducedim.jl
+        lsiz = check_reducdims(R, A)
+        isempty(R) || fill!(R, zero(S))
+        isempty(A) && return R
+        @nextract $N sizeR d->size(R,d)
+        sizA1 = size(A, 1)
+
+        if has_fast_linear_indexing(A) && lsiz > 16
+            # use centralize_sumabs2, which is probably better tuned to achieve higher performance
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            for i = 1:nslices
+                @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz)
+                ibase += lsiz
+            end
+        elseif size(R, 1) == 1 && sizA1 > 1
+            # keep the accumulator as a local variable when reducing along the first dimension
+            @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds r = (@nref $N R j)
+                @inbounds m = (@nref $N means j)
+                for i_1 = 1:sizA1
+                    @inbounds r += abs2((@nref $N A i) - m)
+                end
+                @inbounds (@nref $N R j) = r
+            end
+        else
+            # general implementation
+            @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds (@nref $N R j) += abs2((@nref $N A i) - (@nref $N means j))
             end
-            @inbounds (@nref N R j) = r
-        end
-    else
-        # general implementation
-        @nloops N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-            @inbounds (@nref N R j) += abs2((@nref N A i) - (@nref N means j))
         end
+        return R
     end
-    return R
 end
 
 function varm{T}(A::AbstractArray{T}, m::Number; corrected::Bool=true)

From 373e61b79ed53ae50c0b65e47818664d10884830 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 20 Nov 2014 15:38:57 -0600
Subject: [PATCH 2/9] ngenerate/nsplat: multidimensional algorithms on
 AbstractArrays

---
 base/multidimensional.jl | 230 +++++++++++++++++++++------------------
 1 file changed, 126 insertions(+), 104 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 77a99f8ad33e1..e8d4dba1b3276 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -181,9 +181,12 @@ using .IteratorsMD
 
 ### From array.jl
 
-@ngenerate N Void function checksize(A::AbstractArray, I::NTuple{N, Any}...)
-    @nexprs N d->(size(A, d) == length(I_d) || throw(DimensionMismatch("index $d has length $(length(I_d)), but size(A, $d) = $(size(A,d))")))
-    nothing
+stagedfunction checksize(A::AbstractArray, I...)
+    N = length(I)
+    quote
+        @nexprs $N d->(size(A, d) == length(I[d]) || throw(DimensionMismatch("index $d has length $(length(I[d])), but size(A, $d) = $(size(A,d))")))
+        nothing
+    end
 end
 
 @inline unsafe_getindex(v::BitArray, ind::Int) = Base.unsafe_bitgetindex(v.chunks, ind)
@@ -259,17 +262,19 @@ end
 end
 
 
-@ngenerate N NTuple{N,Vector{Int}} function findn{T,N}(A::AbstractArray{T,N})
-    nnzA = countnz(A)
-    @nexprs N d->(I_d = Array(Int, nnzA))
-    k = 1
-    @nloops N i A begin
-        @inbounds if (@nref N A i) != zero(T)
-            @nexprs N d->(I_d[k] = i_d)
-            k += 1
+stagedfunction findn{T,N}(A::AbstractArray{T,N})
+    quote
+        nnzA = countnz(A)
+        @nexprs $N d->(I_d = Array(Int, nnzA))
+        k = 1
+        @nloops $N i A begin
+            @inbounds if (@nref $N A i) != zero(T)
+                @nexprs $N d->(I_d[k] = i_d)
+                k += 1
+            end
         end
+        @ntuple $N I
     end
-    @ntuple N I
 end
 
 
@@ -386,57 +391,70 @@ end
 
 
  cumsum(A::AbstractArray, axis::Integer=1) =  cumsum!(similar(A, Base._cumsum_type(A)), A, axis)
+cumsum!(B, A::AbstractArray) = cumsum!(B, A, 1)
 cumprod(A::AbstractArray, axis::Integer=1) = cumprod!(similar(A), A, axis)
+cumprod!(B, A) = cumprod!(B, A, 1)
 
 for (f, op) in ((:cumsum!, :+),
                 (:cumprod!, :*))
     @eval begin
-        @ngenerate N typeof(B) function ($f){T,N}(B, A::AbstractArray{T,N}, axis::Integer=1)
-            if size(B, axis) < 1
-                return B
-            end
-            size(B) == size(A) || throw(DimensionMismatch("size of B must match A"))
-            if axis == 1
-                # We can accumulate to a temporary variable, which allows register usage and will be slightly faster
-                @inbounds @nloops N i d->(d > 1 ? (1:size(A,d)) : (1:1)) begin
-                    tmp = convert(eltype(B), @nref(N, A, i))
-                    @nref(N, B, i) = tmp
-                    for i_1 = 2:size(A,1)
-                        tmp = ($op)(tmp, @nref(N, A, i))
-                        @nref(N, B, i) = tmp
-                    end
+        stagedfunction ($f){T,N}(B, A::AbstractArray{T,N}, axis::Integer)
+            quote
+                if size(B, axis) < 1
+                    return B
                 end
-            else
-                @nexprs N d->(isaxis_d = axis == d)
-                # Copy the initial element in each 1d vector along dimension `axis`
-                @inbounds @nloops N i d->(d == axis ? (1:1) : (1:size(A,d))) @nref(N, B, i) = @nref(N, A, i)
-                # Accumulate
-                @inbounds @nloops N i d->((1+isaxis_d):size(A, d)) d->(j_d = i_d - isaxis_d) begin
-                    @nref(N, B, i) = ($op)(@nref(N, B, j), @nref(N, A, i))
+                size(B) == size(A) || throw(DimensionMismatch("Size of B must match A"))
+                if axis == 1
+                    # We can accumulate to a temporary variable, which allows register usage and will be slightly faster
+                    @inbounds @nloops $N i d->(d > 1 ? (1:size(A,d)) : (1:1)) begin
+                        tmp = convert(eltype(B), @nref($N, A, i))
+                        @nref($N, B, i) = tmp
+                        for i_1 = 2:size(A,1)
+                            tmp = ($($op))(tmp, @nref($N, A, i))
+                            @nref($N, B, i) = tmp
+                        end
+                    end
+                else
+                    @nexprs $N d->(isaxis_d = axis == d)
+                    # Copy the initial element in each 1d vector along dimension `axis`
+                    @inbounds @nloops $N i d->(d == axis ? (1:1) : (1:size(A,d))) @nref($N, B, i) = @nref($N, A, i)
+                    # Accumulate
+                    @inbounds @nloops $N i d->((1+isaxis_d):size(A, d)) d->(j_d = i_d - isaxis_d) begin
+                        @nref($N, B, i) = ($($op))(@nref($N, B, j), @nref($N, A, i))
+                    end
                 end
+                B
             end
-            B
         end
     end
 end
 
 ### from abstractarray.jl
 
-@ngenerate N typeof(A) function fill!{T,N}(A::AbstractArray{T,N}, x)
-    xT = convert(T, x)
-    @nloops N i A begin
-        @inbounds (@nref N A i) = xT
+function fill!(A::AbstractArray, x)
+    for I in eachindex(A)
+        @inbounds A[I] = x
     end
     A
 end
 
-@ngenerate N typeof(dest) function copy!{T,N}(dest::AbstractArray{T,N}, src::AbstractArray{T,N})
-    if @nall N d->(size(dest,d) == size(src,d))
-        @nloops N i dest begin
-            @inbounds (@nref N dest i) = (@nref N src i)
+function copy!{T,N}(dest::AbstractArray{T,N}, src::AbstractArray{T,N})
+    samesize = true
+    for d = 1:N
+        if size(dest,d) != size(src,d)
+            samesize = false
+            break
+        end
+    end
+    if samesize
+        for I in eachindex(dest)
+            @inbounds dest[I] = src[I]
         end
     else
-        invoke(copy!, (typeof(dest), Any), dest, src)
+        length(dest) == length(src) || throw(DimensionMismatch("Inconsistent lengths"))
+        for (Idest, Isrc) in zip(eachindex(dest),eachindex(src))
+            @inbounds dest[Idest] = src[Isrc]
+        end
     end
     dest
 end
@@ -697,19 +715,21 @@ end
 
 ## findn
 
-@ngenerate N NTuple{N,Vector{Int}} function findn{N}(B::BitArray{N})
-    nnzB = countnz(B)
-    I = ntuple(N, x->Array(Int, nnzB))
-    if nnzB > 0
-        count = 1
-        @nloops N i B begin
-            if (@nref N B i) # TODO: should avoid bounds checking
-                @nexprs N d->(I[d][count] = i_d)
-                count += 1
+stagedfunction findn{N}(B::BitArray{N})
+    quote
+        nnzB = countnz(B)
+        I = ntuple($N, x->Array(Int, nnzB))
+        if nnzB > 0
+            count = 1
+            @nloops $N i B begin
+                if (@nref $N B i) # TODO: should avoid bounds checking
+                    @nexprs $N d->(I[d][count] = i_d)
+                    count += 1
+                end
             end
         end
+        return I
     end
-    return I
 end
 
 ## isassigned
@@ -774,70 +794,72 @@ immutable Prehashed
 end
 hash(x::Prehashed) = x.hash
 
-@ngenerate N typeof(A) function unique{T,N}(A::AbstractArray{T,N}, dim::Int)
-    1 <= dim <= N || return copy(A)
-    hashes = zeros(UInt, size(A, dim))
+stagedfunction unique{T,N}(A::AbstractArray{T,N}, dim::Int)
+    quote
+        1 <= dim <= $N || return copy(A)
+        hashes = zeros(UInt, size(A, dim))
 
-    # Compute hash for each row
-    k = 0
-    @nloops N i A d->(if d == dim; k = i_d; end) begin
-       @inbounds hashes[k] = hash(hashes[k], hash((@nref N A i)))
-    end
+        # Compute hash for each row
+        k = 0
+        @nloops $N i A d->(if d == dim; k = i_d; end) begin
+            @inbounds hashes[k] = hash(hashes[k], hash((@nref $N A i)))
+        end
 
-    # Collect index of first row for each hash
-    uniquerow = Array(Int, size(A, dim))
-    firstrow = Dict{Prehashed,Int}()
-    for k = 1:size(A, dim)
-        uniquerow[k] = get!(firstrow, Prehashed(hashes[k]), k)
-    end
-    uniquerows = collect(values(firstrow))
+        # Collect index of first row for each hash
+        uniquerow = Array(Int, size(A, dim))
+        firstrow = Dict{Prehashed,Int}()
+        for k = 1:size(A, dim)
+            uniquerow[k] = get!(firstrow, Prehashed(hashes[k]), k)
+        end
+        uniquerows = collect(values(firstrow))
 
-    # Check for collisions
-    collided = falses(size(A, dim))
-    @inbounds begin
-        @nloops N i A d->(if d == dim
+        # Check for collisions
+        collided = falses(size(A, dim))
+        @inbounds begin
+                          @nloops $N i A d->(if d == dim
                               k = i_d
                               j_d = uniquerow[k]
                           else
                               j_d = i_d
                           end) begin
-            if (@nref N A j) != (@nref N A i)
-                collided[k] = true
-            end
+                              if (@nref $N A j) != (@nref $N A i)
+                                  collided[k] = true
+                              end
+                          end
         end
-    end
 
-    if any(collided)
-        nowcollided = BitArray(size(A, dim))
-        while any(collided)
-            # Collect index of first row for each collided hash
-            empty!(firstrow)
-            for j = 1:size(A, dim)
-                collided[j] || continue
-                uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j)
-            end
-            for v in values(firstrow)
-                push!(uniquerows, v)
-            end
+        if any(collided)
+            nowcollided = BitArray(size(A, dim))
+            while any(collided)
+                # Collect index of first row for each collided hash
+                empty!(firstrow)
+                for j = 1:size(A, dim)
+                    collided[j] || continue
+                    uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j)
+                end
+                for v in values(firstrow)
+                    push!(uniquerows, v)
+                end
 
-            # Check for collisions
-            fill!(nowcollided, false)
-            @nloops N i A d->begin
-                                 if d == dim
-                                     k = i_d
-                                     j_d = uniquerow[k]
-                                     (!collided[k] || j_d == k) && continue
-                                 else
-                                     j_d = i_d
-                                 end
-                             end begin
-                if (@nref N A j) != (@nref N A i)
-                    nowcollided[k] = true
+                # Check for collisions
+                fill!(nowcollided, false)
+                @nloops $N i A d->begin
+                    if d == dim
+                        k = i_d
+                        j_d = uniquerow[k]
+                        (!collided[k] || j_d == k) && continue
+                    else
+                        j_d = i_d
+                    end
+                end begin
+                    if (@nref $N A j) != (@nref $N A i)
+                        nowcollided[k] = true
+                    end
                 end
+                (collided, nowcollided) = (nowcollided, collided)
             end
-            (collided, nowcollided) = (nowcollided, collided)
         end
-    end
 
-    @nref N A d->d == dim ? sort!(uniquerows) : (1:size(A, d))
+        @nref $N A d->d == dim ? sort!(uniquerows) : (1:size(A, d))
+    end
 end

From d487943c0bd6d7d3b031573221e2337199e7a9e9 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 20 Nov 2014 16:26:43 -0600
Subject: [PATCH 3/9] ngenerate/nsplat: getindex/setindex for AbstractArrays

---
 base/multidimensional.jl | 105 ++++++++++++++++++++++++---------------
 1 file changed, 64 insertions(+), 41 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index e8d4dba1b3276..9251a79c9741d 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -197,28 +197,36 @@ end
 @inline unsafe_setindex!{T}(v::AbstractArray{T}, x::T, ind::Real) = unsafe_setindex!(v, x, to_index(ind))
 
 # Version that uses cartesian indexing for src
-@ngenerate N typeof(dest) function _getindex!(dest::Array, src::AbstractArray, I::NTuple{N,Union(Int,AbstractVector)}...)
-    checksize(dest, I...)
-    k = 1
-    @nloops N i dest d->(@inbounds j_d = unsafe_getindex(I_d, i_d)) begin
-        @inbounds dest[k] = (@nref N src j)
-        k += 1
+stagedfunction _getindex!(dest::Array, src::AbstractArray, I::Union(Int,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        checksize(dest, $(Isplat...))
+        k = 1
+        @nloops $N i dest d->(@inbounds j_d = unsafe_getindex(I[d], i_d)) begin
+            @inbounds dest[k] = (@nref $N src j)
+            k += 1
+        end
+        dest
     end
-    dest
 end
 
 # Version that uses linear indexing for src
-@ngenerate N typeof(dest) function _getindex!(dest::Array, src::Array, I::NTuple{N,Union(Int,AbstractVector)}...)
-    checksize(dest, I...)
-    stride_1 = 1
-    @nexprs N d->(stride_{d+1} = stride_d*size(src,d))
-    @nexprs N d->(offset_d = 1)  # only really need offset_$N = 1
-    k = 1
-    @nloops N i dest d->(@inbounds offset_{d-1} = offset_d + (unsafe_getindex(I_d, i_d)-1)*stride_d) begin
-        @inbounds dest[k] = src[offset_0]
-        k += 1
+stagedfunction _getindex!(dest::Array, src::Array, I::Union(Int,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        checksize(dest, $(Isplat...))
+        stride_1 = 1
+        @nexprs $N d->(stride_{d+1} = stride_d*size(src,d))
+        @nexprs $N d->(offset_d = 1)  # only really need offset_$N = 1
+        k = 1
+        @nloops $N i dest d->(@inbounds offset_{d-1} = offset_d + (unsafe_getindex(I[d], i_d)-1)*stride_d) begin
+            @inbounds dest[k] = src[offset_0]
+            k += 1
+        end
+        dest
     end
-    dest
 end
 
 # It's most efficient to call checkbounds first, then to_index, and finally
@@ -226,39 +234,54 @@ end
 _getindex(A, I::(Union(Int,AbstractVector)...)) =
     _getindex!(similar(A, index_shape(I...)), A, I...)
 
-@nsplat N function getindex(A::Array, I::NTuple{N,Union(Real,AbstractVector)}...)
-    checkbounds(A, I...)
-    _getindex(A, to_index(I...))
+# The stagedfunction here is just to work around the performance hit
+# of splatting
+stagedfunction getindex(A::Array, I::Union(Real,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        checkbounds(A, $(Isplat...))
+        _getindex(A, to_index($(Isplat...)))
+    end
 end
 
 # Also a safe version of getindex!
-@nsplat N function getindex!(dest, src, I::NTuple{N,Union(Real,AbstractVector)}...)
-    checkbounds(src, I...)
-    _getindex!(dest, src, to_index(I...)...)
+stagedfunction getindex!(dest, src, I::Union(Real,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    Jsplat = Expr[:(to_index(I[$d])) for d = 1:N]
+    quote
+        checkbounds(src, $(Isplat...))
+        _getindex!(dest, src, $(Jsplat...))
+    end
 end
 
 
-@ngenerate N typeof(A) function setindex!(A::Array, x, J::NTuple{N,Union(Real,AbstractArray)}...)
-    @ncall N checkbounds A J
-    @nexprs N d->(I_d = to_index(J_d))
-    stride_1 = 1
-    @nexprs N d->(stride_{d+1} = stride_d*size(A,d))
-    @nexprs N d->(offset_d = 1)  # really only need offset_$N = 1
-    if !isa(x, AbstractArray)
-        @nloops N i d->(1:length(I_d)) d->(@inbounds offset_{d-1} = offset_d + (unsafe_getindex(I_d, i_d)-1)*stride_d) begin
-            @inbounds A[offset_0] = x
-        end
-    else
-        X = x
-        @ncall N setindex_shape_check X I
-        # TODO? A variant that can use cartesian indexing for RHS
-        k = 1
-        @nloops N i d->(1:length(I_d)) d->(@inbounds offset_{d-1} = offset_d + (unsafe_getindex(I_d, i_d)-1)*stride_d) begin
-            @inbounds A[offset_0] = X[k]
+stagedfunction setindex!(A::Array, x, J::Union(Real,AbstractArray)...)
+    N = length(J)
+    quote
+        @nexprs $N d->(J_d = J[d])
+        @ncall $N checkbounds A J
+        @nexprs $N d->(I_d = to_index(J_d))
+        stride_1 = 1
+        @nexprs $N d->(stride_{d+1} = stride_d*size(A,d))
+        @nexprs $N d->(offset_d = 1)  # really only need offset_$N = 1
+        if !isa(x, AbstractArray)
+            @nloops $N i d->(1:length(I_d)) d->(@inbounds offset_{d-1} = offset_d + (unsafe_getindex(I_d, i_d)-1)*stride_d) begin
+                @inbounds A[offset_0] = x
+            end
+        else
+            X = x
+            @ncall $N setindex_shape_check X I
+            # TODO? A variant that can use cartesian indexing for RHS
+            k = 1
+            @nloops $N i d->(1:length(I_d)) d->(@inbounds offset_{d-1} = offset_d + (unsafe_getindex(I_d, i_d)-1)*stride_d) begin
+                @inbounds A[offset_0] = X[k]
             k += 1
+            end
         end
+        A
     end
-    A
 end
 
 

From 0440ae2397b669931edfb1fa8173d99ba9625b8f Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Fri, 21 Nov 2014 05:29:52 -0600
Subject: [PATCH 4/9] ngenerate/nsplat: getindex methods for BitArrays

---
 base/multidimensional.jl | 126 +++++++++++++++++++++++----------------
 1 file changed, 76 insertions(+), 50 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 9251a79c9741d..dca286ef70b34 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -490,27 +490,33 @@ end
 # (uses linear indexing, which is defined in bitarray.jl)
 # (code is duplicated for safe and unsafe versions for performance reasons)
 
-@ngenerate N Bool function unsafe_getindex(B::BitArray, I_0::Int, I::NTuple{N,Int}...)
-    stride = 1
-    index = I_0
-    @nexprs N d->begin
-        stride *= size(B,d)
-        index += (I_d - 1) * stride
+stagedfunction unsafe_getindex(B::BitArray, I_0::Int, I::Int...)
+    N = length(I)
+    quote
+        stride = 1
+        index = I_0
+        @nexprs $N d->begin
+            stride *= size(B,d)
+            index += (I[d] - 1) * stride
+        end
+        return unsafe_getindex(B, index)
     end
-    return unsafe_getindex(B, index)
 end
 
-@ngenerate N Bool function getindex(B::BitArray, I_0::Int, I::NTuple{N,Int}...)
-    stride = 1
-    index = I_0
-    @nexprs N d->begin
-        l = size(B,d)
-        stride *= l
-        1 <= I_{d-1} <= l || throw(BoundsError(B, tuple(I_0, @ntuple(N,I)...)))
-        index += (I_d - 1) * stride
+stagedfunction getindex(B::BitArray, I_0::Int, I::Int...)
+    N = length(I)
+    quote
+        stride = 1
+        index = I_0
+        @nexprs $N d->(I_d = I[d])
+        @nexprs $N d->begin
+            l = size(B,d)
+            stride *= l
+            1 <= I_{d-1} <= l || throw(BoundsError())
+            index += (I_d - 1) * stride
+        end
+        return B[index]
     end
-    1 <= index <= length(B) || throw(BoundsError(B, tuple(I_0, @ntuple(N,I)...)))
-    return unsafe_getindex(B, index)
 end
 
 # contiguous multidimensional indexing: if the first dimension is a range,
@@ -529,53 +535,73 @@ end
 
 getindex{T<:Real}(B::BitArray, I0::UnitRange{T}) = getindex(B, to_index(I0))
 
-@ngenerate N BitArray{length(index_shape(I0, I...))} function unsafe_getindex(B::BitArray, I0::UnitRange{Int}, I::NTuple{N,Union(Int,UnitRange{Int})}...)
-    X = BitArray(index_shape(I0, I...))
-
-    f0 = first(I0)
-    l0 = length(I0)
-
-    gap_lst_1 = 0
-    @nexprs N d->(gap_lst_{d+1} = length(I_d))
-    stride = 1
-    ind = f0
-    @nexprs N d->begin
-        stride *= size(B, d)
-        stride_lst_d = stride
-        ind += stride * (first(I_d) - 1)
-        gap_lst_{d+1} *= stride
-    end
+stagedfunction unsafe_getindex(B::BitArray, I0::UnitRange{Int}, I::Union(Int,UnitRange{Int})...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        @nexprs $N d->(I_d = I[d])
+        X = BitArray(index_shape(I0, $(Isplat...)))
+
+        f0 = first(I0)
+        l0 = length(I0)
+
+        gap_lst_1 = 0
+        @nexprs $N d->(gap_lst_{d+1} = length(I_d))
+        stride = 1
+        ind = f0
+        @nexprs $N d->begin
+            stride *= size(B, d)
+            stride_lst_d = stride
+            ind += stride * (first(I_d) - 1)
+            gap_lst_{d+1} *= stride
+        end
 
-    storeind = 1
-    @nloops(N, i, d->I_d,
-        d->nothing, # PRE
-        d->(ind += stride_lst_d - gap_lst_d), # POST
+        storeind = 1
+        @nloops($N, i, d->I_d,
+                d->nothing, # PRE
+                d->(ind += stride_lst_d - gap_lst_d), # POST
         begin # BODY
             copy_chunks!(X.chunks, storeind, B.chunks, ind, l0)
             storeind += l0
-        end)
-    return X
+                end)
+        return X
+    end
 end
 
 # general multidimensional non-scalar indexing
 
-@ngenerate N BitArray{length(index_shape(I...))} function unsafe_getindex(B::BitArray, I::NTuple{N,Union(Int,AbstractVector{Int})}...)
-    X = BitArray(index_shape(I...))
-    Xc = X.chunks
+stagedfunction unsafe_getindex(B::BitArray, I::Union(Int,AbstractVector{Int})...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        @nexprs $N d->(I_d = I[d])
+        X = BitArray(index_shape($(Isplat...)))
+        Xc = X.chunks
 
-    ind = 1
-    @nloops N i d->I_d begin
-        unsafe_bitsetindex!(Xc, (@ncall N unsafe_getindex B i), ind)
-        ind += 1
+        stride_1 = 1
+        @nexprs $N d->(stride_{d+1} = stride_d * size(B, d))
+        @nexprs 1 d->(offset_{$N} = 1)
+        ind = 1
+        @nloops($N, i, d->I_d,
+                d->(offset_{d-1} = offset_d + (i_d-1)*stride_d), # PRE
+                begin
+            unsafe_bitsetindex!(Xc, B[offset_0], ind)
+            ind += 1
+                end)
+        return X
     end
-    return X
 end
 
 # general version with Real (or logical) indexing which dispatches on the appropriate method
 
-@ngenerate N BitArray{length(index_shape(I...))} function getindex(B::BitArray, I::NTuple{N,Union(Real,AbstractVector)}...)
-    checkbounds(B, I...)
-    return unsafe_getindex(B, to_index(I...)...)
+stagedfunction getindex(B::BitArray, I::Union(Real,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    Jsplat = Expr[:(to_index(I[$d])) for d = 1:N]
+    quote
+        checkbounds(B, $(Isplat...))
+        return unsafe_getindex(B, $(Jsplat...))
+    end
 end
 
 ## setindex!

From cfdbae468453c3749df3ef3ac1de35bba659df83 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Fri, 21 Nov 2014 06:25:57 -0600
Subject: [PATCH 5/9] ngenerate/nsplat: BitArray setindex! part 1

---
 base/multidimensional.jl | 130 +++++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index dca286ef70b34..4dcc3e56e01f1 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -611,29 +611,35 @@ end
 # bounds check and is defined in bitarray.jl)
 # (code is duplicated for safe and unsafe versions for performance reasons)
 
-@ngenerate N typeof(B) function unsafe_setindex!(B::BitArray, x::Bool, I_0::Int, I::NTuple{N,Int}...)
-    stride = 1
-    index = I_0
-    @nexprs N d->begin
-        stride *= size(B,d)
-        index += (I_d - 1) * stride
+stagedfunction unsafe_setindex!(B::BitArray, x::Bool, I_0::Int, I::Int...)
+    N = length(I)
+    quote
+        stride = 1
+        index = I_0
+        @nexprs $N d->begin
+            stride *= size(B,d)
+            index += (I[d] - 1) * stride
+        end
+        unsafe_setindex!(B, x, index)
+        return B
     end
-    unsafe_setindex!(B, x, index)
-    return B
 end
 
-@ngenerate N typeof(B) function setindex!(B::BitArray, x::Bool, I_0::Int, I::NTuple{N,Int}...)
-    stride = 1
-    index = I_0
-    @nexprs N d->begin
-        l = size(B,d)
-        stride *= l
-        1 <= I_{d-1} <= l || throw(BoundsError(B, tuple(I_0, @ntuple(N,I)...)))
-        index += (I_d - 1) * stride
+stagedfunction setindex!(B::BitArray, x::Bool, I_0::Int, I::Int...)
+    N = length(I)
+    quote
+        stride = 1
+        index = I_0
+        @nexprs $N d->(I_d = I[d])
+        @nexprs $N d->begin
+            l = size(B,d)
+            stride *= l
+            1 <= I_{d-1} <= l || throw(BoundsError())
+            index += (I_d - 1) * stride
+        end
+        B[index] = x
+        return B
     end
-    1 <= index <= length(B) || throw(BoundsError(B, tuple(I_0, @ntuple(N,I)...)))
-    unsafe_setindex!(B, x, index)
-    return B
 end
 
 # contiguous multidimensional indexing: if the first dimension is a range,
@@ -655,59 +661,65 @@ function unsafe_setindex!(B::BitArray, x::Bool, I0::UnitRange{Int})
     return B
 end
 
-@ngenerate N typeof(B) function unsafe_setindex!(B::BitArray, X::BitArray, I0::UnitRange{Int}, I::NTuple{N,Union(Int,UnitRange{Int})}...)
-    length(X) == 0 && return B
-    f0 = first(I0)
-    l0 = length(I0)
+stagedfunction unsafe_setindex!(B::BitArray, X::BitArray, I0::UnitRange{Int}, I::Union(Int,UnitRange{Int})...)
+    N = length(I)
+    quote
+        length(X) == 0 && return B
+        f0 = first(I0)
+        l0 = length(I0)
 
-    gap_lst_1 = 0
-    @nexprs N d->(gap_lst_{d+1} = length(I_d))
-    stride = 1
-    ind = f0
-    @nexprs N d->begin
-        stride *= size(B, d)
-        stride_lst_d = stride
-        ind += stride * (first(I_d) - 1)
-        gap_lst_{d+1} *= stride
-    end
+        gap_lst_1 = 0
+        @nexprs $N d->(gap_lst_{d+1} = length(I[d]))
+        stride = 1
+        ind = f0
+        @nexprs $N d->begin
+            stride *= size(B, d)
+            stride_lst_d = stride
+            ind += stride * (first(I[d]) - 1)
+            gap_lst_{d+1} *= stride
+        end
 
-    refind = 1
-    @nloops(N, i, d->I_d,
-        d->nothing, # PRE
-        d->(ind += stride_lst_d - gap_lst_d), # POST
+        refind = 1
+        @nloops($N, i, d->I[d],
+                d->nothing, # PRE
+                d->(ind += stride_lst_d - gap_lst_d), # POST
         begin # BODY
             copy_chunks!(B.chunks, ind, X.chunks, refind, l0)
             refind += l0
-        end)
+                end)
 
-    return B
+        return B
+    end
 end
 
-@ngenerate N typeof(B) function unsafe_setindex!(B::BitArray, x::Bool, I0::UnitRange{Int}, I::NTuple{N,Union(Int,UnitRange{Int})}...)
-    f0 = first(I0)
-    l0 = length(I0)
-    l0 == 0 && return B
-    @nexprs N d->(length(I_d) == 0 && return B)
+stagedfunction unsafe_setindex!(B::BitArray, x::Bool, I0::UnitRange{Int}, I::Union(Int,UnitRange{Int})...)
+    N = length(I)
+    quote
+        f0 = first(I0)
+        l0 = length(I0)
+        l0 == 0 && return B
+        @nexprs $N d->(length(I[d]) == 0 && return B)
 
-    gap_lst_1 = 0
-    @nexprs N d->(gap_lst_{d+1} = length(I_d))
-    stride = 1
-    ind = f0
-    @nexprs N d->begin
-        stride *= size(B, d)
-        stride_lst_d = stride
-        ind += stride * (first(I_d) - 1)
-        gap_lst_{d+1} *= stride
-    end
+        gap_lst_1 = 0
+        @nexprs $N d->(gap_lst_{d+1} = length(I[d]))
+        stride = 1
+        ind = f0
+        @nexprs $N d->begin
+            stride *= size(B, d)
+            stride_lst_d = stride
+            ind += stride * (first(I[d]) - 1)
+            gap_lst_{d+1} *= stride
+        end
 
-    @nloops(N, i, d->I_d,
-        d->nothing, # PRE
-        d->(ind += stride_lst_d - gap_lst_d), # POST
+        @nloops($N, i, d->I[d],
+                d->nothing, # PRE
+                d->(ind += stride_lst_d - gap_lst_d), # POST
         begin # BODY
             fill_chunks!(B.chunks, x, ind, l0)
-        end)
+                end)
 
-    return B
+        return B
+    end
 end
 
 

From c092c0ba230807fcded9ef1c49899dd13a5a6a11 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Tue, 3 Feb 2015 10:18:02 -0600
Subject: [PATCH 6/9] Fix failures on new tests

---
 base/multidimensional.jl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 4dcc3e56e01f1..01b01ba436f0e 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -454,9 +454,10 @@ end
 
 ### from abstractarray.jl
 
-function fill!(A::AbstractArray, x)
+function fill!{T}(A::AbstractArray{T}, x)
+    xT = convert(T, x)
     for I in eachindex(A)
-        @inbounds A[I] = x
+        @inbounds A[I] = xT
     end
     A
 end
@@ -474,8 +475,11 @@ function copy!{T,N}(dest::AbstractArray{T,N}, src::AbstractArray{T,N})
             @inbounds dest[I] = src[I]
         end
     else
-        length(dest) == length(src) || throw(DimensionMismatch("Inconsistent lengths"))
-        for (Idest, Isrc) in zip(eachindex(dest),eachindex(src))
+        length(dest) >= length(src) || throw(BoundsError())
+        iterdest = eachindex(dest)
+        sdest = start(iterdest)
+        for Isrc in eachindex(src)
+            Idest, sdest = next(iterdest, sdest)
             @inbounds dest[Idest] = src[Isrc]
         end
     end

From 7e2624a580d2a25ec7f0a6a8ecc57ea354c86ed9 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Tue, 3 Feb 2015 10:59:37 -0600
Subject: [PATCH 7/9] Remove final usages of atsign-ngenerate from base

---
 base/multidimensional.jl | 136 ++++++++++++++++++++++-----------------
 1 file changed, 78 insertions(+), 58 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 01b01ba436f0e..4cf3872b5996d 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -729,20 +729,28 @@ end
 
 # general multidimensional non-scalar indexing
 
-@ngenerate N typeof(B) function unsafe_setindex!(B::BitArray, X::AbstractArray, I::NTuple{N,Union(Int,AbstractArray{Int})}...)
-    refind = 1
-    @nloops N i d->I_d @inbounds begin
-        @ncall N unsafe_setindex! B convert(Bool,X[refind]) i
-        refind += 1
+stagedfunction unsafe_setindex!(B::BitArray, X::AbstractArray, I::Union(Int,AbstractArray{Int})...)
+    N = length(I)
+    quote
+        refind = 1
+        @nexprs $N d->(I_d = I[d])
+        @nloops $N i d->I_d @inbounds begin
+            @ncall $N unsafe_setindex! B convert(Bool,X[refind]) i
+            refind += 1
+        end
+        return B
     end
-    return B
 end
 
-@ngenerate N typeof(B) function unsafe_setindex!(B::BitArray, x::Bool, I::NTuple{N,Union(Int,AbstractArray{Int})}...)
-    @nloops N i d->I_d begin
-        @ncall N unsafe_setindex! B x i
+stagedfunction unsafe_setindex!(B::BitArray, x::Bool, I::Union(Int,AbstractArray{Int})...)
+    N = length(I)
+    quote
+        @nexprs $N d->(I_d = I[d])
+        @nloops $N i d->I_d begin
+            @ncall $N unsafe_setindex! B x i
+        end
+        return B
     end
-    return B
 end
 
 # general versions with Real (or logical) indexing which dispatch on the appropriate method
@@ -753,11 +761,14 @@ function setindex!(B::BitArray, x, i::Real)
     return unsafe_setindex!(B, convert(Bool,x), to_index(i))
 end
 
-@ngenerate N typeof(B) function setindex!(B::BitArray, x, I::NTuple{N,Union(Real,AbstractArray)}...)
-    checkbounds(B, I...)
-    #return unsafe_setindex!(B, convert(Bool,x), to_index(I...)...) # segfaults! (???)
-    @nexprs N d->(J_d = to_index(I_d))
-    return @ncall N unsafe_setindex! B convert(Bool,x) J
+stagedfunction setindex!(B::BitArray, x, I::Union(Real,AbstractArray)...)
+    N = length(I)
+    quote
+        checkbounds(B, I...)
+        #return unsafe_setindex!(B, convert(Bool,x), to_index(I...)...) # segfaults! (???)
+        @nexprs $N d->(J_d = to_index(I[d]))
+        return @ncall $N unsafe_setindex! B convert(Bool,x) J
+    end
 end
 
 
@@ -769,11 +780,14 @@ function setindex!(B::BitArray, X::AbstractArray, i::Real)
     return unsafe_setindex!(B, X, j)
 end
 
-@ngenerate N typeof(B) function setindex!(B::BitArray, X::AbstractArray, I::NTuple{N,Union(Real,AbstractArray)}...)
-    checkbounds(B, I...)
-    @nexprs N d->(J_d = to_index(I_d))
-    @ncall N setindex_shape_check X J
-    return @ncall N unsafe_setindex! B X J
+stagedfunction setindex!(B::BitArray, X::AbstractArray, I::Union(Real,AbstractArray)...)
+    N = length(I)
+    quote
+        checkbounds(B, I...)
+        @nexprs $N d->(J_d = to_index(I[d]))
+        @ncall $N setindex_shape_check X J
+        return @ncall $N unsafe_setindex! B X J
+    end
 end
 
 
@@ -799,54 +813,60 @@ end
 
 ## isassigned
 
-@ngenerate N Bool function isassigned(B::BitArray, I_0::Int, I::NTuple{N,Int}...)
-    stride = 1
-    index = I_0
-    @nexprs N d->begin
-        l = size(B,d)
-        stride *= l
-        1 <= I_{d-1} <= l || return false
-        index += (I_d - 1) * stride
+stagedfunction isassigned(B::BitArray, I_0::Int, I::Int...)
+    N = length(I)
+    quote
+        @nexprs $N d->(I_d = I[d])
+        stride = 1
+        index = I_0
+        @nexprs N d->begin
+            l = size(B,d)
+            stride *= l
+            1 <= I_{d-1} <= l || return false
+            index += (I_d - 1) * stride
+        end
+        return isassigned(B, index)
     end
-    return isassigned(B, index)
 end
 
 ## permutedims
 
 for (V, PT, BT) in [((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)]
-    @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
-        dimsB = size(B)
-        length(perm) == N || throw(ArgumentError("expected permutation of size $N, but length(perm)=$(length(perm))"))
-        isperm(perm) || throw(ArgumentError("input is not a permutation"))
-        dimsP = size(P)
-        for i = 1:length(perm)
-            dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
-        end
+    @eval stagedfunction permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
+        quote
+            dimsB = size(B)
+            length(perm) == N || throw(ArgumentError("expected permutation of size $N, but length(perm)=$(length(perm))"))
+            isperm(perm) || throw(ArgumentError("input is not a permutation"))
+            dimsP = size(P)
+            for i = 1:length(perm)
+                dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
+            end
 
-        #calculates all the strides
-        strides_1 = 0
-        @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
+            #calculates all the strides
+            strides_1 = 0
+            @nexprs $N d->(strides_{d+1} = stride(B, perm[d]))
 
-        #Creates offset, because indexing starts at 1
-        offset = 1 - sum(@ntuple N d->strides_{d+1})
+            #Creates offset, because indexing starts at 1
+            offset = 1 - sum(@ntuple $N d->strides_{d+1})
 
-        if isa(B, SubArray)
-            offset += first_index(B::SubArray) - 1
-            B = B.parent
-        end
+            if isa(B, SubArray)
+                offset += first_index(B::SubArray) - 1
+                B = B.parent
+            end
 
-        ind = 1
-        @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
-        @nloops(N, i, P,
-            d->(counts_d = strides_d), # PRE
-            d->(counts_{d+1} += strides_{d+1}), # POST
-            begin # BODY
-                sumc = sum(@ntuple N d->counts_{d+1})
-                @inbounds P[ind] = B[sumc+offset]
-                ind += 1
-            end)
-
-        return P
+            ind = 1
+            @nexprs 1 d->(counts_{$N+1} = strides_{$N+1}) # a trick to set counts_($N+1)
+            @nloops($N, i, P,
+                    d->(counts_d = strides_d), # PRE
+                    d->(counts_{d+1} += strides_{d+1}), # POST
+                    begin # BODY
+                    sumc = sum(@ntuple $N d->counts_{d+1})
+                    @inbounds P[ind] = B[sumc+offset]
+                    ind += 1
+                    end)
+
+            return P
+        end
     end
 end
 

From 5e562cfebb0cd4d19f6a997a5fb874bb1f8c64d8 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Tue, 3 Feb 2015 13:08:17 -0600
Subject: [PATCH 8/9] Remove atsign-nsplat from sharedarray.jl

---
 base/sharedarray.jl | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/base/sharedarray.jl b/base/sharedarray.jl
index 5d8f52ac05539..2125ec3ae4aa3 100644
--- a/base/sharedarray.jl
+++ b/base/sharedarray.jl
@@ -210,12 +210,24 @@ convert(::Type{Array}, S::SharedArray) = S.s
 getindex(S::SharedArray) = getindex(S.s)
 getindex(S::SharedArray, I::Real) = getindex(S.s, I)
 getindex(S::SharedArray, I::AbstractArray) = getindex(S.s, I)
-@nsplat N 1:5 getindex(S::SharedArray, I::NTuple{N,Union(Real,AbstractVector)}...) = getindex(S.s, I...)
+stagedfunction getindex(S::SharedArray, I::Union(Real,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        getindex(S.s, $(Isplat...))
+    end
+end
 
 setindex!(S::SharedArray, x) = setindex!(S.s, x)
 setindex!(S::SharedArray, x, I::Real) = setindex!(S.s, x, I)
 setindex!(S::SharedArray, x, I::AbstractArray) = setindex!(S.s, x, I)
-@nsplat N 1:5 setindex!(S::SharedArray, x, I::NTuple{N,Union(Real,AbstractVector)}...) = setindex!(S.s, x, I...)
+stagedfunction setindex!(S::SharedArray, x, I::Union(Real,AbstractVector)...)
+    N = length(I)
+    Isplat = Expr[:(I[$d]) for d = 1:N]
+    quote
+        setindex!(S.s, x, $(Isplat...))
+    end
+end
 
 function fill!(S::SharedArray, v)
     vT = convert(eltype(S), v)

From def80f73cea829391c74d34173e4bfa07fc7b81a Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Tue, 3 Feb 2015 13:39:19 -0600
Subject: [PATCH 9/9] Remove ngenerate and nsplat from base

---
 base/cartesian.jl | 271 +---------------------------------------------
 1 file changed, 1 insertion(+), 270 deletions(-)

diff --git a/base/cartesian.jl b/base/cartesian.jl
index 1517ed5f8c368..892f41d8bef73 100644
--- a/base/cartesian.jl
+++ b/base/cartesian.jl
@@ -1,275 +1,6 @@
 module Cartesian
 
-export @ngenerate, @nsplat, @nloops, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, @nif, ngenerate
-
-const CARTESIAN_DIMS = 4
-
-### @ngenerate, for auto-generation of separate versions of functions for different dimensionalities
-# Examples (deliberately trivial):
-#     @ngenerate N returntype myndims{T,N}(A::Array{T,N}) = N
-# or alternatively
-#     function gen_body(N::Int)
-#         quote
-#             return $N
-#         end
-#     end
-#     eval(ngenerate(:N, returntypeexpr, :(myndims{T,N}(A::Array{T,N})), gen_body))
-# The latter allows you to use a single gen_body function for both ngenerate and
-# when your function maintains its own method cache (e.g., reduction or broadcasting).
-#
-# Special syntax for function prototypes:
-#   @ngenerate N returntype function myfunction(A::AbstractArray, I::NTuple{N, Int}...)
-# for N = 3 translates to
-#   function myfunction(A::AbstractArray, I_1::Int, I_2::Int, I_3::Int)
-# and for the generic (cached) case as
-#   function myfunction(A::AbstractArray, I::Int...)
-#     @nextract N I I
-# with N = length(I). N should _not_ be listed as a parameter of the function unless
-# earlier arguments use it that way.
-# To avoid ambiguity, it would be preferable to have some specific syntax for this, such as
-#   myfunction(A::AbstractArray, I::Int...N)
-# where N can be an integer or symbol. Currently T...N generates a parser error.
-macro ngenerate(itersym, returntypeexpr, funcexpr)
-    if isa(funcexpr, Expr) && funcexpr.head == :macrocall && funcexpr.args[1] == symbol("@inline")
-        funcexpr = Base._inline(funcexpr.args[2])
-    end
-    isfuncexpr(funcexpr) || throw(ArgumentError("requires a function expression"))
-    esc(ngenerate(itersym, returntypeexpr, funcexpr.args[1], N->sreplace!(copy(funcexpr.args[2]), itersym, N)))
-end
-
-# @nsplat takes an expression like
-#    @nsplat N 2:3 myfunction(A, I::NTuple{N,Real}...) = getindex(A, I...)
-# and generates
-#    myfunction(A, I_1::Real, I_2::Real) = getindex(A, I_1, I_2)
-#    myfunction(A, I_1::Real, I_2::Real, I_3::Real) = getindex(A, I_1, I_2, I_3)
-#    myfunction(A, I::Real...) = getindex(A, I...)
-# An @nsplat function _cannot_ have any other Cartesian macros in it.
-# If you omit the range, it uses 1:CARTESIAN_DIMS.
-macro nsplat(itersym, args...)
-    local rng
-    if length(args) == 1
-        rng = 1:CARTESIAN_DIMS
-        funcexpr = args[1]
-    elseif length(args) == 2
-        rangeexpr = args[1]
-        funcexpr = args[2]
-        if !isa(rangeexpr, Expr) || rangeexpr.head != :(:) || length(rangeexpr.args) != 2
-            throw(ArgumentError("first argument must be a from:to expression"))
-        end
-        rng = rangeexpr.args[1]:rangeexpr.args[2]
-    else
-        throw(ArgumentError("wrong number of arguments"))
-    end
-    if isa(funcexpr, Expr) && funcexpr.head == :macrocall && funcexpr.args[1] == symbol("@inline")
-        funcexpr = Base._inline(funcexpr.args[2])
-    end
-    isfuncexpr(funcexpr) || throw(ArgumentError("second argument must be a function expression"))
-    prototype = funcexpr.args[1]
-    body = funcexpr.args[2]
-    varname, T = get_splatinfo(prototype, itersym)
-    isempty(varname) && throw(ArgumentError("last argument must be a splat"))
-    explicit = [Expr(:function, resolvesplat!(copy(prototype), varname, T, N),
-                     resolvesplats!(copy(body), varname, N)) for N in rng]
-    protosplat = resolvesplat!(copy(prototype), varname, T, 0)
-    protosplat.args[end] = Expr(:..., protosplat.args[end])
-    splat = Expr(:function, protosplat, body)
-    esc(Expr(:block, explicit..., splat))
-end
-
-generate1(itersym, prototype, bodyfunc, N::Int, varname, T) =
-    Expr(:function, spliceint!(sreplace!(resolvesplat!(copy(prototype), varname, T, N), itersym, N)),
-         resolvesplats!(bodyfunc(N), varname, N))
-
-function ngenerate(itersym, returntypeexpr, prototype, bodyfunc, dims=1:CARTESIAN_DIMS, makecached::Bool = true)
-    varname, T = get_splatinfo(prototype, itersym)
-    # Generate versions for specific dimensions
-    fdim = [generate1(itersym, prototype, bodyfunc, N, varname, T) for N in dims]
-    if !makecached
-        return Expr(:block, fdim...)
-    end
-    # Generate the generic cache-based version
-    if isempty(varname)
-        setitersym, extractvarargs = :(), N -> nothing
-    else
-        s = symbol(varname)
-        setitersym = hasparameter(prototype, itersym) ? (:(@assert $itersym == length($s))) : (:($itersym = length($s)))
-        extractvarargs = N -> Expr(:block, map(popescape, _nextract(N, s, s).args)...)
-    end
-    fsym = funcsym(prototype)
-    dictname = symbol(fsym,"_cache")
-    fargs = funcargs(prototype)
-    if !isempty(varname)
-        fargs[end] = Expr(:..., fargs[end].args[1])
-    end
-    flocal = funcrename(copy(prototype), :_F_)
-    F = Expr(:function, resolvesplat!(prototype, varname, T), quote
-             $setitersym
-             if !haskey($dictname, $itersym)
-                 gen1 = Base.Cartesian.generate1($(symbol(itersym)), $(Expr(:quote, flocal)), $bodyfunc, $itersym, $varname, $T)
-                 $(dictname)[$itersym] = eval(quote
-                     local _F_
-                     $gen1
-                     _F_
-                 end)
-             end
-             ($(dictname)[$itersym]($(fargs...)))::$returntypeexpr
-         end)
-    Expr(:block, fdim..., quote
-            let $dictname = Dict{Int,Function}()
-            $F
-            end
-        end)
-end
-
-isfuncexpr(ex::Expr) =
-    ex.head == :function || (ex.head == :(=) && typeof(ex.args[1]) == Expr && ex.args[1].head == :call)
-isfuncexpr(arg) = false
-
-sreplace!(arg, sym, val) = arg
-function sreplace!(ex::Expr, sym, val)
-    for i = 1:length(ex.args)
-        ex.args[i] = sreplace!(ex.args[i], sym, val)
-    end
-    ex
-end
-sreplace!(s::Symbol, sym, val) = s == sym ? val : s
-
-# If using the syntax that will need "desplatting",
-#     myfunction(A::AbstractArray, I::NTuple{N, Int}...)
-# return the variable name (as a string) and type
-function get_splatinfo(ex::Expr, itersym::Symbol)
-    if ex.head == :call
-        a = ex.args[end]
-        if  isa(a, Expr) && a.head == :... && length(a.args) == 1
-            b = a.args[1]
-            if isa(b, Expr) && b.head == :(::)
-                varname = string(b.args[1])
-                c = b.args[2]
-                if isa(c, Expr) && c.head == :curly && c.args[1] == :NTuple && c.args[2] == itersym
-                    T = c.args[3]
-                    return varname, T
-                end
-            end
-        end
-    end
-    "", Void
-end
-
-# Replace splatted with desplatted for a specific number of arguments
-function resolvesplat!(prototype, varname, T::Union(Type,Symbol,Expr), N::Int)
-    if !isempty(varname)
-        prototype.args[end] = N > 0 ? Expr(:(::), symbol(varname, "_1"), T) :
-                                      Expr(:(::), symbol(varname), T)
-        for i = 2:N
-            push!(prototype.args, Expr(:(::), symbol(varname, "_", i), T))
-        end
-    end
-    prototype
-end
-
-# Return the generic splatting form, e.g.,
-#     myfunction(A::AbstractArray, I::Int...)
-function resolvesplat!(prototype, varname, T::Union(Type,Symbol,Expr))
-    if !isempty(varname)
-        svarname = symbol(varname)
-        prototype.args[end] = Expr(:..., :($svarname::$T))
-    end
-    prototype
-end
-
-# Desplatting function calls: replace func(a, b, I...) with func(a, b, I_1, I_2, I_3)
-resolvesplats!(arg, varname, N) = arg
-function resolvesplats!(ex::Expr, varname, N::Int)
-    if ex.head == :call
-        for i = 2:length(ex.args)-1
-            resolvesplats!(ex.args[i], varname, N)
-        end
-        a = ex.args[end]
-        if isa(a, Expr) && a.head == :... && a.args[1] == symbol(varname)
-            ex.args[end] = symbol(varname, "_1")
-            for i = 2:N
-                push!(ex.args, symbol(varname, "_", i))
-            end
-        else
-            resolvesplats!(a, varname, N)
-        end
-    else
-        for i = 1:length(ex.args)
-            resolvesplats!(ex.args[i], varname, N)
-        end
-    end
-    ex
-end
-
-# Remove any function parameters that are integers
-function spliceint!(ex::Expr)
-    if ex.head == :escape
-        return esc(spliceint!(ex.args[1]))
-    end
-    ex.head == :call || throw(ArgumentError("$ex must be a call"))
-    if isa(ex.args[1], Expr) && ex.args[1].head == :curly
-        args = ex.args[1].args
-        for i = length(args):-1:1
-            if isa(args[i], Int)
-                deleteat!(args, i)
-            end
-        end
-    end
-    ex
-end
-
-function popescape(ex::Expr)
-    while ex.head == :escape
-        ex = ex.args[1]
-    end
-    ex
-end
-
-# Extract the "function name"
-function funcsym(prototype::Expr)
-    prototype = popescape(prototype)
-    prototype.head == :call || throw(ArgumentError("$prototype must be a call"))
-    tmp = prototype.args[1]
-    if isa(tmp, Expr) && tmp.head == :curly
-        tmp = tmp.args[1]
-    end
-    return tmp
-end
-
-function funcrename(prototype::Expr, name::Symbol)
-    prototype = popescape(prototype)
-    prototype.head == :call || throw(ArgumentError("$prototype must be a call"))
-    tmp = prototype.args[1]
-    if isa(tmp, Expr) && tmp.head == :curly
-        tmp.args[1] = name
-    else
-        prototype.args[1] = name
-    end
-    return prototype
-end
-
-function hasparameter(prototype::Expr, sym::Symbol)
-    prototype = popescape(prototype)
-    prototype.head == :call || throw(ArgumentError("$prototype must be a call"))
-    tmp = prototype.args[1]
-    if isa(tmp, Expr) && tmp.head == :curly
-        for i = 2:length(tmp.args)
-            if tmp.args[i] == sym
-                return true
-            end
-        end
-    end
-    false
-end
-
-# Extract the symbols of the function arguments
-funcarg(s::Symbol) = s
-funcarg(ex::Expr) = ex.args[1]
-function funcargs(prototype::Expr)
-    prototype = popescape(prototype)
-    prototype.head == :call || throw(ArgumentError("$prototype must be a call"))
-    map(a->funcarg(a), prototype.args[2:end])
-end
+export @nloops, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, @nif
 
 ### Cartesian-specific macros