From e96ed37d787550cf53aedda7da9b94ff28588c9d Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 9 Aug 2024 09:49:30 -0500 Subject: [PATCH 1/9] datadeps: Add at-stencil helper --- src/Dagger.jl | 2 + src/stencil.jl | 203 +++++++++++++++++++++++++++++++++++++++++ src/utils/haloarray.jl | 112 +++++++++++++++++++++++ 3 files changed, 317 insertions(+) create mode 100644 src/stencil.jl create mode 100644 src/utils/haloarray.jl diff --git a/src/Dagger.jl b/src/Dagger.jl index cb763a66b..848b54b72 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -65,6 +65,8 @@ include("sch/Sch.jl"); using .Sch # Data dependency task queue include("datadeps.jl") +include("utils/haloarray.jl") +include("stencil.jl") # Array computations include("array/darray.jl") diff --git a/src/stencil.jl b/src/stencil.jl new file mode 100644 index 000000000..808e01459 --- /dev/null +++ b/src/stencil.jl @@ -0,0 +1,203 @@ +function get_neighbor_edge(arr, dim, dir, dist) + if dir == -1 + start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - dist + 1) : firstindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr))) + elseif dir == 1 + start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + dist - 1) : lastindex(arr, i), ndims(arr))) + end + return collect(@view arr[start_idx:stop_idx]) +end +function get_neighbor_corner(chunk, corner_side, neigh_dist) + start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(chunk, i) - neigh_dist + 1) : firstindex(chunk, i), ndims(chunk))) + stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(chunk, i) : (firstindex(chunk, i) + neigh_dist - 1), ndims(chunk))) + return collect(@view chunk[start_idx:stop_idx]) +end +function get_neighborhood_chunks(chunks, idx, neigh_dist) + @assert neigh_dist == 1 + chunk_dist = 1 + # Get the center + accesses = Any[chunks[idx]] + # Get the edges + for dim in 1:ndims(chunks) + for dir in (-1, +1) + if dir == -1 && idx[dim] == firstindex(chunks, dim) + new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks))) + elseif dir == +1 && idx[dim] == lastindex(chunks, dim) + new_idx = idx - CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks))) + else + new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks))) + end + chunk = chunks[new_idx] + push!(accesses, Dagger.@spawn get_neighbor_edge(chunk, dim, dir, neigh_dist)) + end + end + # Get the corners + for corner_num in 1:(2^ndims(chunks)) + corner_side = CartesianIndex(reverse(ntuple(ndims(chunks)) do i + ((corner_num-1) >> (((ndims(chunks) - i) + 1) - 1)) & 1 + end)) + corner_idx = CartesianIndex(ntuple(ndims(chunks)) do i + corner_shift = iszero(corner_side[i]) ? -1 : 1 + return mod1(idx[i] + corner_shift, size(chunks, i)) + end) + chunk = chunks[corner_idx] + push!(accesses, Dagger.@spawn get_neighbor_corner(chunk, corner_side, neigh_dist)) + end + @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))" + return accesses +end +function build_halo(neigh_dist, center::Array{T,N}, all_neighbors...) where {T,N} + # FIXME: Don't collect views + edges = collect.(all_neighbors[1:(2*N)]) + corners = collect.(all_neighbors[((2^N)+1):end]) + @assert length(edges) == 2*N && length(corners) == 2^N "Halo mismatch: edges=$(length(edges)) corners=$(length(corners))" + arr = HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N)) + return arr +end +function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N} + start_idx = idx - CartesianIndex(ntuple(_->neigh_dist, ndims(arr))) + stop_idx = idx + CartesianIndex(ntuple(_->neigh_dist, ndims(arr))) + # FIXME: Don't collect HaloArray view + return collect(@view arr[start_idx:stop_idx]) +end + +""" + @stencil idx in range begin body end + +Allows the specification of stencil operations within a `spawn_datadeps` +region. The `idx` variable is used to iterate over `range`, which must be a +`DArray`. An example usage may look like: + +```julia +A = zeros(Blocks(3, 3), Int, 9, 9) +A[5, 5] = 1 +B = zeros(Blocks(3, 3), Int, 9, 9) +Dagger.@spawn_datadeps() do + @stencil idx in A begin + # Sum values of all neighbors with self + A[idx] = sum(@neighbors(A[idx], 1, :wrap)) + # Decrement all values by 1 + A[idx] -= 1 + # Copy A to B + B[idx] = A[idx] + end +end +``` + +Each expression within an `@stencil` region that accesses `A[idx]` is +transformed into a set of tasks that operate on each chunk of `A`, and within +each task, elements of that chunk of `A` can be accessed. Elements of other +`DArray`s can also be accessed, such as `B[idx]`, so long as `B` has the same +size, shape, and chunk layout as `A`. + +Additionally, the `@neighbors` macro can be used to access a neighborhood of +values around `A[idx]`, at a configurable distance (in this case, 1 element +distance) and with or without wrapping (in this case, `:wrap` enables +wrapping). Neighborhoods are computed with respect to neighboring chunks as +well - if a neighborhood would overflow from the current chunk into one or more +neighboring chunks, values from those neighboring chunks will be included in +the neighborhood. + +Note that, while `@stencil` may look like a `for` loop, it does not follow the +same semantics; in particular, an expression within `@stencil` occurs "all at +once" (across all indices) before the next expression occurs. This means that +`A[idx] = sum(@neighbors(A[idx], 1, :wrap))` will write the sum of +neighbors for all `idx` values into `A[idx]` before `A[idx] -= 1` decrements +the values `A` by 1, and that occurs before any of the values are copied to `B` +in `B[idx] = A[idx]`. Of course, pipelining and other optimizations may still +occur, so long as they respect the sequential nature of `@stencil` (just like +with other operations in `spawn_datadeps`). +""" +macro stencil(index_ex, orig_ex) + @assert @capture(index_ex, index_var_ = index_range_) || @capture(index_ex, index_var_ in index_range_) "Invalid indexing expression: $index_ex" + @assert Meta.isexpr(orig_ex, :block) "Invalid stencil block: $orig_ex" + + # Collect access pattern information + inners = [] + all_accessed_vars = Set{Symbol}() + for inner_ex in orig_ex.args + inner_ex isa LineNumberNode && continue + @assert @capture(inner_ex, write_ex_ = read_ex_) "Invalid update expression: $inner_ex" + @assert @capture(write_ex, write_var_[write_idx_]) "Update expression requires a write: $write_ex" + @assert write_idx == index_var "Can only write to $index_var: $write_ex" + accessed_vars = Set{Symbol}() + read_vars = Set{Symbol}() + neighborhoods = Dict{Symbol, Tuple{Int, Bool}}() + push!(accessed_vars, write_var) + MacroTools.prewalk(read_ex) do read_inner_ex + if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == index_var + push!(accessed_vars, read_var) + push!(read_vars, read_var) + elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, wrapping_)) && read_idx == index_var + @assert neigh_dist == 1 "Neighborhoods greater than 1 not yet supported" + @assert wrapping == :wrap "Unknown wrapping pattern `$(repr(wrapping))`" + push!(accessed_vars, read_var) + push!(read_vars, read_var) + neighborhoods[read_var] = (neigh_dist, true) + end + return read_inner_ex + end + union!(all_accessed_vars, accessed_vars) + push!(inners, (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods)) + end + + # Codegen update functions + final_ex = Expr(:block) + @gensym chunk_idx + for (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods) in inners + # Generate a variable for chunk access + @gensym chunk_idx + + # Generate function with transformed body + @gensym inner_index_var + new_inner_ex_body = MacroTools.prewalk(inner_ex) do old_inner_ex + if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == index_var + # Direct access + return :($read_var[$inner_index_var]) + elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, wrapping_)) && read_idx == index_var + # Neighborhood access + return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist)) + end + return old_inner_ex + end + new_inner_ex = quote + for $inner_index_var in CartesianIndices($write_var) + $new_inner_ex_body + end + end + inner_fn = Expr(:->, Expr(:tuple, Expr(:parameters, write_var, read_vars...)), new_inner_ex) + + # Generate @spawn call with appropriate vars and deps + deps_ex = Any[] + if write_var in read_vars + push!(deps_ex, Expr(:kw, write_var, :($ReadWrite($chunks($write_var)[$chunk_idx])))) + else + push!(deps_ex, Expr(:kw, write_var, :($Write($chunks($write_var)[$chunk_idx])))) + end + neighbor_copy_all_ex = Expr(:block) + for read_var in read_vars + if read_var in keys(neighborhoods) + # Generate a neighborhood copy operation + neigh_dist, dowrap = neighborhoods[read_var] + deps_inner_ex = Expr(:block) + @gensym neighbor_copy_var + push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, map($Read, $get_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist))...))) + push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var)))) + else + push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx])))) + end + end + spawn_ex = :(Dagger.@spawn $inner_fn(;$(deps_ex...))) + + # Generate loop + push!(final_ex.args, quote + for $chunk_idx in $index_range + $neighbor_copy_all_ex + $spawn_ex + end + end) + end + + return esc(final_ex) +end diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl new file mode 100644 index 000000000..83b9e0d19 --- /dev/null +++ b/src/utils/haloarray.jl @@ -0,0 +1,112 @@ +# Define the HaloArray type with minimized halo storage +struct HaloArray{T,N,E,C} <: AbstractArray{T,N} + center::Array{T,N} + edges::NTuple{E, Array{T,N}} + corners::NTuple{C, Array{T,N}} + halo_width::NTuple{N,Int} +end + +# Helper function to create an empty HaloArray with minimized halo storage +function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) where {T,N} + center = Array{T,N}(undef, center_size...) + edges = ntuple(2N) do i + prev_dims = center_size[1:(cld(i,2)-1)] + next_dims = center_size[(cld(i,2)+1):end] + return Array{T,N}(undef, prev_dims..., halo_width[cld(i,2)], next_dims...) + end + corners = ntuple(2^N) do i + return Array{T,N}(undef, halo_width) + end + return HaloArray{T,N,2N,2^N}(center, edges, corners, halo_width) +end + +Base.size(tile::HaloArray) = size(tile.center) .+ 2 .* tile.halo_width +function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H} + ntuple(N) do i + first_ind = 1 - tile.halo_width[i] + last_ind = size(tile.center, i) + tile.halo_width[i] + return first_ind:last_ind + end +end +function Base.similar(tile::HaloArray{T,N,H}, ::Type{T}, dims::NTuple{N,Int}) where {T,N,H} + center_size = dims + halo_width = tile.halo_width + return HaloArray{T,N,H}(center_size, halo_width) +end +function Base.copy(tile::HaloArray{T,N,H}) where {T,N,H} + center = copy(tile.center) + halo = ntuple(i->copy(tile.edges[i]), H) + halo_width = tile.halo_width + return HaloArray{T,N,H}(center, halo, halo_width) +end + +# Define getindex for HaloArray +function Base.getindex(tile::HaloArray{T,N}, I::Vararg{Int,N}) where {T,N} + checkbounds(tile, I...) + if all(1 .<= I .<= size(tile.center)) + return tile.center[I...] + elseif !any(1 .<= I .<= size(tile.center)) + # Corner + # N.B. Corner indexes are in binary, e.g. 0b01, 0b10, 0b11 + corner_idx = sum(ntuple(i->(I[i] < 1 ? 0 : 1) * (2^(i-1)), N)) + 1 + corner_offset = CartesianIndex(I) + CartesianIndex(ntuple(i->(I[i] < 1 ? tile.halo_width[i] : -size(tile.center, i)), N)) + return tile.corners[corner_idx][corner_offset] + else + for d in 1:N + if I[d] < 1 + halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...) + return tile.edges[(2*(d-1))+1][halo_idx...] + elseif I[d] > size(tile.center, d) + halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...) + return tile.edges[(2*(d-1))+2][halo_idx...] + end + end + end + error("Index out of bounds") +end + +# Define setindex! for HaloArray +function Base.setindex!(tile::HaloArray{T,N}, value, I::Vararg{Int,N}) where {T,N} + checkbounds(tile, I...) + if all(1 .<= I .<= size(tile.center)) + # Center + return tile.center[I...] = value + elseif !any(1 .<= I .<= size(tile.center)) + # Corner + # N.B. Corner indexes are in binary, e.g. 0b01, 0b10, 0b11 + corner_idx = sum(ntuple(i->(I[i] < 1 ? 0 : 1) * (2^(i-1)), N)) + 1 + corner_offset = CartesianIndex(I) + CartesianIndex(ntuple(i->(I[i] < 1 ? tile.halo_width[i] : -size(tile.center, i)), N)) + return tile.corners[corner_idx][corner_offset] = value + else + # Edge + for d in 1:N + if I[d] < 1 + halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...) + return tile.edges[(2*(d-1))+1][halo_idx...] = value + elseif I[d] > size(tile.center, d) + halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...) + return tile.edges[(2*(d-1))+2][halo_idx...] = value + end + end + end + error("Index out of bounds") +end + +#= +# Example usage +center_size = (3, 5) +halo_width = (1, 1) +tile = HaloArray{Float64, 2}(center_size, halo_width) + +# Set values in the center and halo +tile[2, 2] = 1.0 +tile[0, 2] = 2.0 # This should be in an edge +tile[0, 0] = 3.0 # This should be in a corner +tile[4, 6] = 4.0 # This should be in a corner + +# Get values from the center and halo +println(tile[2, 2]) # 1.0 +println(tile[0, 2]) # 2.0 +println(tile[0, 0]) # 3.0 +println(tile[4, 6]) # 4.0 +=# From c9b310395b86797d93506e645043481c5176a2f0 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 12 Aug 2024 12:00:10 -0500 Subject: [PATCH 2/9] fixup! datadeps: Add at-stencil helper --- src/stencil.jl | 59 +++++++++++++++++++++++++++--------------- src/utils/haloarray.jl | 11 +++++--- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/stencil.jl b/src/stencil.jl index 808e01459..37e085dd0 100644 --- a/src/stencil.jl +++ b/src/stencil.jl @@ -1,3 +1,8 @@ +# FIXME: Remove me +const Read = In +const Write = Out +const ReadWrite = InOut + function get_neighbor_edge(arr, dim, dir, dist) if dir == -1 start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - dist + 1) : firstindex(arr, i), ndims(arr))) @@ -13,8 +18,7 @@ function get_neighbor_corner(chunk, corner_side, neigh_dist) stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(chunk, i) : (firstindex(chunk, i) + neigh_dist - 1), ndims(chunk))) return collect(@view chunk[start_idx:stop_idx]) end -function get_neighborhood_chunks(chunks, idx, neigh_dist) - @assert neigh_dist == 1 +function get_neighborhood_chunks(chunks, idx, neigh_dist, boundary) chunk_dist = 1 # Get the center accesses = Any[chunks[idx]] @@ -47,7 +51,7 @@ function get_neighborhood_chunks(chunks, idx, neigh_dist) @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))" return accesses end -function build_halo(neigh_dist, center::Array{T,N}, all_neighbors...) where {T,N} +function build_halo(neigh_dist, boundary, center::Array{T,N}, all_neighbors...) where {T,N} # FIXME: Don't collect views edges = collect.(all_neighbors[1:(2*N)]) corners = collect.(all_neighbors[((2^N)+1):end]) @@ -62,6 +66,17 @@ function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N} return collect(@view arr[start_idx:stop_idx]) end +struct Wrap end +boundary_init(::Wrap, arr, size) = similar(arr, eltype(arr), size) +boundary_has_transition(::Wrap) = true +boundary_transition(::Wrap, idx, size) = mod1(idx, size) + +struct Pad{T} + padval::T +end +boundary_init(::Pad{T}, arr, size) where T = Fill(padval, size) +boundary_has_transition(::Pad) = false + """ @stencil idx in range begin body end @@ -70,13 +85,15 @@ region. The `idx` variable is used to iterate over `range`, which must be a `DArray`. An example usage may look like: ```julia +import Dagger: @stencil, Wrap + A = zeros(Blocks(3, 3), Int, 9, 9) A[5, 5] = 1 B = zeros(Blocks(3, 3), Int, 9, 9) Dagger.@spawn_datadeps() do @stencil idx in A begin # Sum values of all neighbors with self - A[idx] = sum(@neighbors(A[idx], 1, :wrap)) + A[idx] = sum(@neighbors(A[idx], 1, Wrap())) # Decrement all values by 1 A[idx] -= 1 # Copy A to B @@ -93,16 +110,16 @@ size, shape, and chunk layout as `A`. Additionally, the `@neighbors` macro can be used to access a neighborhood of values around `A[idx]`, at a configurable distance (in this case, 1 element -distance) and with or without wrapping (in this case, `:wrap` enables -wrapping). Neighborhoods are computed with respect to neighboring chunks as -well - if a neighborhood would overflow from the current chunk into one or more -neighboring chunks, values from those neighboring chunks will be included in -the neighborhood. +distance) and with various kinds of boundary conditions (in this case, `Wrap()` +specifies wrapping behavior on the boundaries). Neighborhoods are computed with +respect to neighboring chunks as well - if a neighborhood would overflow from +the current chunk into one or more neighboring chunks, values from those +neighboring chunks will be included in the neighborhood. Note that, while `@stencil` may look like a `for` loop, it does not follow the same semantics; in particular, an expression within `@stencil` occurs "all at once" (across all indices) before the next expression occurs. This means that -`A[idx] = sum(@neighbors(A[idx], 1, :wrap))` will write the sum of +`A[idx] = sum(@neighbors(A[idx], 1, Wrap()))` will write the sum of neighbors for all `idx` values into `A[idx]` before `A[idx] -= 1` decrements the values `A` by 1, and that occurs before any of the values are copied to `B` in `B[idx] = A[idx]`. Of course, pipelining and other optimizations may still @@ -123,18 +140,16 @@ macro stencil(index_ex, orig_ex) @assert write_idx == index_var "Can only write to $index_var: $write_ex" accessed_vars = Set{Symbol}() read_vars = Set{Symbol}() - neighborhoods = Dict{Symbol, Tuple{Int, Bool}}() + neighborhoods = Dict{Symbol, Tuple{Any, Any}}() push!(accessed_vars, write_var) - MacroTools.prewalk(read_ex) do read_inner_ex + prewalk(read_ex) do read_inner_ex if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == index_var push!(accessed_vars, read_var) push!(read_vars, read_var) - elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, wrapping_)) && read_idx == index_var - @assert neigh_dist == 1 "Neighborhoods greater than 1 not yet supported" - @assert wrapping == :wrap "Unknown wrapping pattern `$(repr(wrapping))`" + elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var push!(accessed_vars, read_var) push!(read_vars, read_var) - neighborhoods[read_var] = (neigh_dist, true) + neighborhoods[read_var] = (neigh_dist, boundary) end return read_inner_ex end @@ -151,11 +166,11 @@ macro stencil(index_ex, orig_ex) # Generate function with transformed body @gensym inner_index_var - new_inner_ex_body = MacroTools.prewalk(inner_ex) do old_inner_ex + new_inner_ex_body = prewalk(inner_ex) do old_inner_ex if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == index_var # Direct access return :($read_var[$inner_index_var]) - elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, wrapping_)) && read_idx == index_var + elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var # Neighborhood access return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist)) end @@ -179,10 +194,10 @@ macro stencil(index_ex, orig_ex) for read_var in read_vars if read_var in keys(neighborhoods) # Generate a neighborhood copy operation - neigh_dist, dowrap = neighborhoods[read_var] + neigh_dist, boundary = neighborhoods[read_var] deps_inner_ex = Expr(:block) @gensym neighbor_copy_var - push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, map($Read, $get_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist))...))) + push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $get_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...))) push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var)))) else push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx])))) @@ -192,12 +207,14 @@ macro stencil(index_ex, orig_ex) # Generate loop push!(final_ex.args, quote - for $chunk_idx in $index_range + for $chunk_idx in $CartesianIndices($chunks($index_range)) $neighbor_copy_all_ex $spawn_ex end end) end + @show final_ex + return esc(final_ex) end diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl index 83b9e0d19..835131d7b 100644 --- a/src/utils/haloarray.jl +++ b/src/utils/haloarray.jl @@ -1,8 +1,8 @@ # Define the HaloArray type with minimized halo storage -struct HaloArray{T,N,E,C} <: AbstractArray{T,N} - center::Array{T,N} - edges::NTuple{E, Array{T,N}} - corners::NTuple{C, Array{T,N}} +struct HaloArray{T,N,E,C,A,EA,CA} <: AbstractArray{T,N} + center::A + edges::NTuple{E, EA} + corners::NTuple{C, CA} halo_width::NTuple{N,Int} end @@ -20,6 +20,9 @@ function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) w return HaloArray{T,N,2N,2^N}(center, edges, corners, halo_width) end +HaloArray(center::AT, edges::NTuple{E, EA}, corners::NTuple{C, CA}, halo_width::NTuple{N, Int}) where {T,N,AT<:AbstractArray{T,N},C,E,CA,EA} = + HaloArray{T,N,E,C,AT,EA,CA}(center, edges, corners, halo_width) + Base.size(tile::HaloArray) = size(tile.center) .+ 2 .* tile.halo_width function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H} ntuple(N) do i From 5f0bff1ec37769a40145051464b8d02407385ef3 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 12 Aug 2024 12:01:14 -0500 Subject: [PATCH 3/9] fixup! fixup! datadeps: Add at-stencil helper --- docs/src/stencils.jl | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 docs/src/stencils.jl diff --git a/docs/src/stencils.jl b/docs/src/stencils.jl new file mode 100644 index 000000000..0f388d8a7 --- /dev/null +++ b/docs/src/stencils.jl @@ -0,0 +1,43 @@ +# Stencil Operations + + + +```julia +N = 27 +nt = 3 +tiles = zeros(Blocks(N, N), Bool, N*nt, N*nt) +outputs = zeros(Blocks(N, N), Bool, N*nt, N*nt) + +# Create fun initial state +tiles[13, 14] = 1 +tiles[14, 14] = 1 +tiles[15, 14] = 1 +tiles[15, 15] = 1 +tiles[14, 16] = 1 +@view(tiles[(2N+1):3N, (2N+1):3N]) .= rand(Bool, N, N) + +import Dagger: @stencil, Wrap + +anim = @animate for _ in 1:niters + Dagger.spawn_datadeps() do + @stencil idx = tiles begin + outputs[idx] = begin + nhood = @neighbors(tiles[idx], 1, Wrap()) + neighs = sum(nhood) - tiles[idx] + if tiles[idx] && neighs < 2 + 0 + elseif tiles[idx] && neighs > 3 + 0 + elseif !tiles[idx] && neighs == 3 + 1 + else + tiles[idx] + end + end + tiles[idx] = outputs[idx] + end + end + heatmap(Int.(collect(outputs))) +end +path = mp4(anim; fps=5, show_msg=true).filename +``` From e4e5ca887514917e4dc0ad4a6dc0d2f2cc57b35d Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 12 Aug 2024 13:48:41 -0500 Subject: [PATCH 4/9] stencils: Add custom boundary conditions --- Project.toml | 2 ++ src/Dagger.jl | 5 ++- src/stencil.jl | 86 ++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 69 insertions(+), 24 deletions(-) diff --git a/Project.toml b/Project.toml index 03cf2953e..b9fe18a7a 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.18.12" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" @@ -46,6 +47,7 @@ Colors = "0.12" DataFrames = "1" DataStructures = "0.18" Distributions = "0.25" +FillArrays = "1.11.0" GraphViz = "0.2" Graphs = "1" MacroTools = "0.5" diff --git a/src/Dagger.jl b/src/Dagger.jl index 848b54b72..e5ad12bf2 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -33,6 +33,8 @@ import TimespanLogging: timespan_start, timespan_finish import Adapt +import FillArrays: Fill + include("lib/util.jl") include("utils/dagdebug.jl") @@ -40,7 +42,8 @@ include("utils/dagdebug.jl") include("utils/locked-object.jl") include("utils/tasks.jl") -import MacroTools: @capture +import MacroTools: @capture, prewalk + include("options.jl") include("processor.jl") include("threadproc.jl") diff --git a/src/stencil.jl b/src/stencil.jl index 37e085dd0..4e85af451 100644 --- a/src/stencil.jl +++ b/src/stencil.jl @@ -3,51 +3,71 @@ const Read = In const Write = Out const ReadWrite = InOut -function get_neighbor_edge(arr, dim, dir, dist) +function load_neighbor_edge(arr, dim, dir, neigh_dist) if dir == -1 - start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - dist + 1) : firstindex(arr, i), ndims(arr))) + start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr))) stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr))) elseif dir == 1 start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr))) - stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + dist - 1) : lastindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr))) end return collect(@view arr[start_idx:stop_idx]) end -function get_neighbor_corner(chunk, corner_side, neigh_dist) - start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(chunk, i) - neigh_dist + 1) : firstindex(chunk, i), ndims(chunk))) - stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(chunk, i) : (firstindex(chunk, i) + neigh_dist - 1), ndims(chunk))) - return collect(@view chunk[start_idx:stop_idx]) +function load_neighbor_corner(arr, corner_side, neigh_dist) + start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr))) + return collect(@view arr[start_idx:stop_idx]) end -function get_neighborhood_chunks(chunks, idx, neigh_dist, boundary) +function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary) + @assert neigh_dist isa Integer && neigh_dist > 0 "Neighborhood distance must be an Integer greater than 0" + + # FIXME: Depends on neigh_dist and chunk size chunk_dist = 1 # Get the center accesses = Any[chunks[idx]] + # Get the edges for dim in 1:ndims(chunks) for dir in (-1, +1) - if dir == -1 && idx[dim] == firstindex(chunks, dim) - new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks))) - elseif dir == +1 && idx[dim] == lastindex(chunks, dim) - new_idx = idx - CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks))) + new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks))) + if is_past_boundary(size(chunks), new_idx) + if boundary_has_transition(boundary) + new_idx = boundary_transition(boundary, new_idx, size(chunks)) + else + new_idx = idx + end + chunk = chunks[new_idx] + push!(accesses, Dagger.@spawn load_boundary_edge(boundary, chunk, dim, dir, neigh_dist)) else - new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks))) + chunk = chunks[new_idx] + push!(accesses, Dagger.@spawn load_neighbor_edge(chunk, dim, dir, neigh_dist)) end - chunk = chunks[new_idx] - push!(accesses, Dagger.@spawn get_neighbor_edge(chunk, dim, dir, neigh_dist)) end end + # Get the corners for corner_num in 1:(2^ndims(chunks)) corner_side = CartesianIndex(reverse(ntuple(ndims(chunks)) do i ((corner_num-1) >> (((ndims(chunks) - i) + 1) - 1)) & 1 end)) - corner_idx = CartesianIndex(ntuple(ndims(chunks)) do i + corner_new_idx = CartesianIndex(ntuple(ndims(chunks)) do i corner_shift = iszero(corner_side[i]) ? -1 : 1 - return mod1(idx[i] + corner_shift, size(chunks, i)) + return idx[i] + corner_shift end) - chunk = chunks[corner_idx] - push!(accesses, Dagger.@spawn get_neighbor_corner(chunk, corner_side, neigh_dist)) + if is_past_boundary(size(chunks), corner_new_idx) + if boundary_has_transition(boundary) + corner_new_idx = boundary_transition(boundary, corner_new_idx, size(chunks)) + else + corner_new_idx = idx + end + chunk = chunks[corner_new_idx] + push!(accesses, Dagger.@spawn load_boundary_corner(boundary, chunk, corner_side, neigh_dist)) + else + chunk = chunks[corner_new_idx] + push!(accesses, Dagger.@spawn load_neighbor_corner(chunk, corner_side, neigh_dist)) + end end + @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))" return accesses end @@ -66,16 +86,36 @@ function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N} return collect(@view arr[start_idx:stop_idx]) end +is_past_boundary(size, idx) = any(ntuple(i -> idx[i] < 1 || idx[i] > size[i], length(size))) + struct Wrap end -boundary_init(::Wrap, arr, size) = similar(arr, eltype(arr), size) boundary_has_transition(::Wrap) = true -boundary_transition(::Wrap, idx, size) = mod1(idx, size) +boundary_transition(::Wrap, idx, size) = + CartesianIndex(ntuple(i -> mod1(idx[i], size[i]), length(size))) +load_boundary_edge(::Wrap, arr, dim, dir, neigh_dist) = load_neighbor_edge(arr, dim, dir, neigh_dist) +load_boundary_corner(::Wrap, arr, corner_side, neigh_dist) = load_neighbor_corner(arr, corner_side, neigh_dist) struct Pad{T} padval::T end -boundary_init(::Pad{T}, arr, size) where T = Fill(padval, size) boundary_has_transition(::Pad) = false +function load_boundary_edge(pad::Pad, arr, dim, dir, neigh_dist) + if dir == -1 + start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr))) + elseif dir == 1 + start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr))) + end + edge_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr)) + return Fill(pad.padval, edge_size) +end +function load_boundary_corner(pad::Pad, arr, corner_side, neigh_dist) + start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr))) + stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr))) + corner_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr)) + return Fill(pad.padval, corner_size) +end """ @stencil idx in range begin body end @@ -197,7 +237,7 @@ macro stencil(index_ex, orig_ex) neigh_dist, boundary = neighborhoods[read_var] deps_inner_ex = Expr(:block) @gensym neighbor_copy_var - push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $get_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...))) + push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $select_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...))) push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var)))) else push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx])))) From bbf9dcece1c2836cadaf3dd2122a96b71463dc26 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 12 Aug 2024 14:13:11 -0500 Subject: [PATCH 5/9] fixup! fixup! fixup! datadeps: Add at-stencil helper --- docs/make.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 828594967..84e7dcd31 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -25,7 +25,10 @@ makedocs(; "Scopes" => "scopes.md", "Processors" => "processors.md", "Task Queues" => "task-queues.md", - "Datadeps" => "datadeps.md", + "Datadeps" => [ + "Basics" => "datadeps.md", + "Stencils" => "stencils.md", + ], "Option Propagation" => "propagation.md", "Logging and Visualization" => [ "Logging: Basics" => "logging.md", From c9600d287a7fc683671b714d80721881090de129 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 12 Aug 2024 14:13:58 -0500 Subject: [PATCH 6/9] stencils: Remove unnecessary index var and range --- docs/src/stencils.jl | 2 +- src/stencil.jl | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/src/stencils.jl b/docs/src/stencils.jl index 0f388d8a7..3cf552dc1 100644 --- a/docs/src/stencils.jl +++ b/docs/src/stencils.jl @@ -20,7 +20,7 @@ import Dagger: @stencil, Wrap anim = @animate for _ in 1:niters Dagger.spawn_datadeps() do - @stencil idx = tiles begin + @stencil begin outputs[idx] = begin nhood = @neighbors(tiles[idx], 1, Wrap()) neighs = sum(nhood) - tiles[idx] diff --git a/src/stencil.jl b/src/stencil.jl index 4e85af451..c292ace3f 100644 --- a/src/stencil.jl +++ b/src/stencil.jl @@ -131,7 +131,7 @@ A = zeros(Blocks(3, 3), Int, 9, 9) A[5, 5] = 1 B = zeros(Blocks(3, 3), Int, 9, 9) Dagger.@spawn_datadeps() do - @stencil idx in A begin + @stencil begin # Sum values of all neighbors with self A[idx] = sum(@neighbors(A[idx], 1, Wrap())) # Decrement all values by 1 @@ -142,11 +142,12 @@ Dagger.@spawn_datadeps() do end ``` -Each expression within an `@stencil` region that accesses `A[idx]` is -transformed into a set of tasks that operate on each chunk of `A`, and within -each task, elements of that chunk of `A` can be accessed. Elements of other -`DArray`s can also be accessed, such as `B[idx]`, so long as `B` has the same -size, shape, and chunk layout as `A`. +Each expression within an `@stencil` region that performs an in-place indexing +expression like `A[idx] = ...` is transformed into a set of tasks that operate +on each chunk of `A` or any other arrays specified as `A[idx]`, and within each +task, elements of that chunk of `A` can be accessed. Elements of multiple +`DArray`s can be accessed, such as `B[idx]`, so long as `B` has the same size, +shape, and chunk layout as `A`. Additionally, the `@neighbors` macro can be used to access a neighborhood of values around `A[idx]`, at a configurable distance (in this case, 1 element @@ -166,8 +167,7 @@ in `B[idx] = A[idx]`. Of course, pipelining and other optimizations may still occur, so long as they respect the sequential nature of `@stencil` (just like with other operations in `spawn_datadeps`). """ -macro stencil(index_ex, orig_ex) - @assert @capture(index_ex, index_var_ = index_range_) || @capture(index_ex, index_var_ in index_range_) "Invalid indexing expression: $index_ex" +macro stencil(orig_ex) @assert Meta.isexpr(orig_ex, :block) "Invalid stencil block: $orig_ex" # Collect access pattern information @@ -177,16 +177,16 @@ macro stencil(index_ex, orig_ex) inner_ex isa LineNumberNode && continue @assert @capture(inner_ex, write_ex_ = read_ex_) "Invalid update expression: $inner_ex" @assert @capture(write_ex, write_var_[write_idx_]) "Update expression requires a write: $write_ex" - @assert write_idx == index_var "Can only write to $index_var: $write_ex" accessed_vars = Set{Symbol}() read_vars = Set{Symbol}() neighborhoods = Dict{Symbol, Tuple{Any, Any}}() push!(accessed_vars, write_var) prewalk(read_ex) do read_inner_ex - if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == index_var + if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == write_idx push!(accessed_vars, read_var) push!(read_vars, read_var) - elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var + elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) + @assert read_idx == write_idx "Neighborhood access must be at the same index as the write: $read_inner_ex" push!(accessed_vars, read_var) push!(read_vars, read_var) neighborhoods[read_var] = (neigh_dist, boundary) @@ -207,10 +207,10 @@ macro stencil(index_ex, orig_ex) # Generate function with transformed body @gensym inner_index_var new_inner_ex_body = prewalk(inner_ex) do old_inner_ex - if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == index_var + if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == write_idx # Direct access return :($read_var[$inner_index_var]) - elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var + elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) # Neighborhood access return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist)) end @@ -247,7 +247,7 @@ macro stencil(index_ex, orig_ex) # Generate loop push!(final_ex.args, quote - for $chunk_idx in $CartesianIndices($chunks($index_range)) + for $chunk_idx in $CartesianIndices($chunks($write_var)) $neighbor_copy_all_ex $spawn_ex end From 253013526f4ffd1cb431c8a6d163c876618e66d5 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 15 Aug 2024 17:34:58 -0400 Subject: [PATCH 7/9] stencils: Add GPU support --- Project.toml | 2 -- src/Dagger.jl | 2 -- src/stencil.jl | 59 +++++++++++++++++++++++++++--------------- src/utils/haloarray.jl | 39 +++++++--------------------- 4 files changed, 48 insertions(+), 54 deletions(-) diff --git a/Project.toml b/Project.toml index b9fe18a7a..03cf2953e 100644 --- a/Project.toml +++ b/Project.toml @@ -6,7 +6,6 @@ version = "0.18.12" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" -FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" @@ -47,7 +46,6 @@ Colors = "0.12" DataFrames = "1" DataStructures = "0.18" Distributions = "0.25" -FillArrays = "1.11.0" GraphViz = "0.2" Graphs = "1" MacroTools = "0.5" diff --git a/src/Dagger.jl b/src/Dagger.jl index e5ad12bf2..f4fc7bd6e 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -33,8 +33,6 @@ import TimespanLogging: timespan_start, timespan_finish import Adapt -import FillArrays: Fill - include("lib/util.jl") include("utils/dagdebug.jl") diff --git a/src/stencil.jl b/src/stencil.jl index c292ace3f..7c4ab99d2 100644 --- a/src/stencil.jl +++ b/src/stencil.jl @@ -11,12 +11,13 @@ function load_neighbor_edge(arr, dim, dir, neigh_dist) start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr))) stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr))) end - return collect(@view arr[start_idx:stop_idx]) + # FIXME: Don't collect + return move(thunk_processor(), collect(@view arr[start_idx:stop_idx])) end function load_neighbor_corner(arr, corner_side, neigh_dist) start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr))) stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr))) - return collect(@view arr[start_idx:stop_idx]) + return move(thunk_processor(), collect(@view arr[start_idx:stop_idx])) end function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary) @assert neigh_dist isa Integer && neigh_dist > 0 "Neighborhood distance must be an Integer greater than 0" @@ -71,19 +72,30 @@ function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary) @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))" return accesses end -function build_halo(neigh_dist, boundary, center::Array{T,N}, all_neighbors...) where {T,N} - # FIXME: Don't collect views - edges = collect.(all_neighbors[1:(2*N)]) - corners = collect.(all_neighbors[((2^N)+1):end]) +function build_halo(neigh_dist, boundary, center, all_neighbors...) + N = ndims(center) + edges = all_neighbors[1:(2*N)] + corners = all_neighbors[((2^N)+1):end] @assert length(edges) == 2*N && length(corners) == 2^N "Halo mismatch: edges=$(length(edges)) corners=$(length(corners))" - arr = HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N)) - return arr + return HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N)) end -function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N} +function load_neighborhood(arr::HaloArray{T,N}, idx) where {T,N} + @assert all(arr.halo_width .== arr.halo_width[1]) + neigh_dist = arr.halo_width[1] start_idx = idx - CartesianIndex(ntuple(_->neigh_dist, ndims(arr))) stop_idx = idx + CartesianIndex(ntuple(_->neigh_dist, ndims(arr))) - # FIXME: Don't collect HaloArray view - return collect(@view arr[start_idx:stop_idx]) + return @view arr[start_idx:stop_idx] +end +function inner_stencil!(f, output, read_vars) + processor = thunk_processor() + inner_stencil_proc!(processor, f, output, read_vars) +end +# Non-KA (for CPUs) +function inner_stencil_proc!(::ThreadProc, f, output, read_vars) + for idx in CartesianIndices(output) + f(idx, output, read_vars) + end + return end is_past_boundary(size, idx) = any(ntuple(i -> idx[i] < 1 || idx[i] > size[i], length(size))) @@ -108,17 +120,19 @@ function load_boundary_edge(pad::Pad, arr, dim, dir, neigh_dist) stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr))) end edge_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr)) - return Fill(pad.padval, edge_size) + # FIXME: return Fill(pad.padval, edge_size) + return move(thunk_processor(), fill(pad.padval, edge_size)) end function load_boundary_corner(pad::Pad, arr, corner_side, neigh_dist) start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr))) stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr))) corner_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr)) - return Fill(pad.padval, corner_size) + # FIXME: return Fill(pad.padval, corner_size) + return move(thunk_processor(), fill(pad.padval, corner_size)) end """ - @stencil idx in range begin body end + @stencil begin body end Allows the specification of stencil operations within a `spawn_datadeps` region. The `idx` variable is used to iterate over `range`, which must be a @@ -205,21 +219,25 @@ macro stencil(orig_ex) @gensym chunk_idx # Generate function with transformed body - @gensym inner_index_var + @gensym inner_vars inner_index_var new_inner_ex_body = prewalk(inner_ex) do old_inner_ex if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == write_idx # Direct access - return :($read_var[$inner_index_var]) + if read_var == write_var + return :($write_var[$inner_index_var]) + else + return :($inner_vars.$read_var[$inner_index_var]) + end elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) # Neighborhood access - return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist)) + return :($load_neighborhood($inner_vars.$read_var, $inner_index_var)) end return old_inner_ex end + new_inner_f = :(($inner_index_var, $write_var, $inner_vars)->$new_inner_ex_body) new_inner_ex = quote - for $inner_index_var in CartesianIndices($write_var) - $new_inner_ex_body - end + $inner_vars = (;$(read_vars...)) + $inner_stencil!($new_inner_f, $write_var, $inner_vars) end inner_fn = Expr(:->, Expr(:tuple, Expr(:parameters, write_var, read_vars...)), new_inner_ex) @@ -254,7 +272,6 @@ macro stencil(orig_ex) end) end - @show final_ex return esc(final_ex) end diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl index 835131d7b..2e26ed1cf 100644 --- a/src/utils/haloarray.jl +++ b/src/utils/haloarray.jl @@ -1,8 +1,8 @@ # Define the HaloArray type with minimized halo storage -struct HaloArray{T,N,E,C,A,EA,CA} <: AbstractArray{T,N} +struct HaloArray{T,N,E,C,A,EAT<:Tuple,CAT<:Tuple} <: AbstractArray{T,N} center::A - edges::NTuple{E, EA} - corners::NTuple{C, CA} + edges::EAT + corners::CAT halo_width::NTuple{N,Int} end @@ -17,11 +17,11 @@ function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) w corners = ntuple(2^N) do i return Array{T,N}(undef, halo_width) end - return HaloArray{T,N,2N,2^N}(center, edges, corners, halo_width) + return HaloArray(center, edges, corners, halo_width) end -HaloArray(center::AT, edges::NTuple{E, EA}, corners::NTuple{C, CA}, halo_width::NTuple{N, Int}) where {T,N,AT<:AbstractArray{T,N},C,E,CA,EA} = - HaloArray{T,N,E,C,AT,EA,CA}(center, edges, corners, halo_width) +HaloArray(center::AT, edges::EAT, corners::CAT, halo_width::NTuple{N, Int}) where {T,N,AT<:AbstractArray{T,N},CAT<:Tuple,EAT<:Tuple} = + HaloArray{T,N,length(edges),length(corners),AT,EAT,CAT}(center, edges, corners, halo_width) Base.size(tile::HaloArray) = size(tile.center) .+ 2 .* tile.halo_width function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H} @@ -57,10 +57,10 @@ function Base.getindex(tile::HaloArray{T,N}, I::Vararg{Int,N}) where {T,N} else for d in 1:N if I[d] < 1 - halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...) + halo_idx = ntuple(i->i == d ? I[i] + tile.halo_width[i] : I[i], N) return tile.edges[(2*(d-1))+1][halo_idx...] elseif I[d] > size(tile.center, d) - halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...) + halo_idx = ntuple(i->i == d ? I[i] - size(tile.center, d) : I[i], N) return tile.edges[(2*(d-1))+2][halo_idx...] end end @@ -84,32 +84,13 @@ function Base.setindex!(tile::HaloArray{T,N}, value, I::Vararg{Int,N}) where {T, # Edge for d in 1:N if I[d] < 1 - halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...) + halo_idx = ntuple(i->i == d ? I[i] + tile.halo_width[i] : I[i], N) return tile.edges[(2*(d-1))+1][halo_idx...] = value elseif I[d] > size(tile.center, d) - halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...) + halo_idx = ntuple(i->i == d ? I[i] - size(tile.center, d) : I[i], N) return tile.edges[(2*(d-1))+2][halo_idx...] = value end end end error("Index out of bounds") end - -#= -# Example usage -center_size = (3, 5) -halo_width = (1, 1) -tile = HaloArray{Float64, 2}(center_size, halo_width) - -# Set values in the center and halo -tile[2, 2] = 1.0 -tile[0, 2] = 2.0 # This should be in an edge -tile[0, 0] = 3.0 # This should be in a corner -tile[4, 6] = 4.0 # This should be in a corner - -# Get values from the center and halo -println(tile[2, 2]) # 1.0 -println(tile[0, 2]) # 2.0 -println(tile[0, 0]) # 3.0 -println(tile[4, 6]) # 4.0 -=# From 91e954e59c2c82438ba509e5214783c754edc50b Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 19 Aug 2024 10:50:47 -0400 Subject: [PATCH 8/9] scopes/DArray: Prevent GPU running setindex! --- src/array/indexing.jl | 3 ++- src/scopes.jl | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/array/indexing.jl b/src/array/indexing.jl index 82f44fbff..1f622fa57 100644 --- a/src/array/indexing.jl +++ b/src/array/indexing.jl @@ -129,7 +129,8 @@ function Base.setindex!(A::DArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} # Set the value part = A.chunks[part_idx...] space = memory_space(part) - scope = Dagger.scope(worker=root_worker_id(space)) + # FIXME: Do this correctly w.r.t memory space of part + scope = Dagger.scope(worker=root_worker_id(space), threads=:) return fetch(Dagger.@spawn scope=scope setindex!(part, value, offset_idx...)) end Base.setindex!(A::DArray, value, idx::Integer...) = diff --git a/src/scopes.jl b/src/scopes.jl index 834993c9f..29badbe70 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -325,13 +325,20 @@ function to_scope(sc::NamedTuple) else nothing end + all_threads = false threads = if haskey(sc, :thread) Int[sc.thread] elseif haskey(sc, :threads) - Int[sc.threads...] + if sc.threads == Colon() + all_threads = true + nothing + else + Int[sc.threads...] + end else nothing end + want_threads = all_threads || threads !== nothing # Simple cases if workers !== nothing && threads !== nothing @@ -341,18 +348,22 @@ function to_scope(sc::NamedTuple) end return simplified_union_scope(subscopes) elseif workers !== nothing && threads === nothing - subscopes = AbstractScope[ProcessScope(w) for w in workers] - return simplified_union_scope(subscopes) + subscopes = simplified_union_scope(AbstractScope[ProcessScope(w) for w in workers]) + if all_threads + return constrain(subscopes, ProcessorTypeScope(ThreadProc)) + else + return subscopes + end end # More complex cases that require querying the cluster # FIXME: Use per-field scope taint if workers === nothing - workers = procs() + workers = map(p->p.pid, filter(p->p isa OSProc, procs(Dagger.Sch.eager_context()))) end subscopes = AbstractScope[] for w in workers - if threads === nothing + if threads === nothing && want_threads threads = map(c->c.tid, filter(c->c isa ThreadProc, collect(children(OSProc(w))))) From e27c11406e7bf14237e0e59e6cdb60a00a0aa807 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 19 Aug 2024 10:52:20 -0400 Subject: [PATCH 9/9] datadeps: Fix unwritten != correct space --- src/datadeps.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datadeps.jl b/src/datadeps.jl index 3c1110820..9bbc4b325 100644 --- a/src/datadeps.jl +++ b/src/datadeps.jl @@ -677,8 +677,8 @@ function distribute_tasks!(queue::DataDepsTaskQueue) # Is the data written previously or now? arg, deps = unwrap_inout(arg) arg = arg isa DTask ? fetch(arg; raw=true) : arg - if !type_may_alias(typeof(arg)) || !has_writedep(state, arg, deps, task) - @dagdebug nothing :spawn_datadeps "($(repr(spec.f)))[$idx] Skipped copy-to (unwritten)" + if !type_may_alias(typeof(arg)) + @dagdebug nothing :spawn_datadeps "($(repr(spec.f)))[$idx] Skipped copy-to (immutable)" spec.args[idx] = pos => arg continue end