Skip to content

Commit

Permalink
Apply some Base.propagate_inbounds
Browse files Browse the repository at this point in the history
wip

Try simplifying multiply_matrix_at_index

try simplifying multiply_matrix_at_index
  • Loading branch information
charleskawczynski committed Jun 28, 2024
1 parent ad45727 commit 5c0e842
Show file tree
Hide file tree
Showing 7 changed files with 274 additions and 202 deletions.
42 changes: 28 additions & 14 deletions ext/cuda/operators_finite_difference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,35 @@ function copyto_stencil_kernel!(
(v, i, j, h) = cart_ind((Nv, Nq, Nq, Nh), gid).I
hidx = (i, j, h)
idx = v - 1 + li
window =
idx < lw ?
LeftBoundaryWindow{Spaces.left_boundary_name(space)}() :
(
idx > rw ?
RightBoundaryWindow{Spaces.right_boundary_name(space)}() :
Interior()
if idx < lw
lwindow = LeftBoundaryWindow{Spaces.left_boundary_name(space)}()
setidx!(
space,
out,
idx,
hidx,
Operators.getidx(space, bc, lwindow, idx, hidx),
)
setidx!(
space,
out,
idx,
hidx,
Operators.getidx(space, bc, window, idx, hidx),
)
elseif idx > rw
rwindow =
RightBoundaryWindow{Spaces.right_boundary_name(space)}()
setidx!(
space,
out,
idx,
hidx,
Operators.getidx(space, bc, rwindow, idx, hidx),
)
else
iwindow = Interior()
setidx!(
space,
out,
idx,
hidx,
Operators.getidx(space, bc, iwindow, idx, hidx),
)
end
end
end
return nothing
Expand Down
4 changes: 2 additions & 2 deletions src/DataLayouts/DataLayouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ include("struct.jl")

abstract type AbstractData{S} end

Base.size(data::AbstractData, i::Integer) = size(data)[i]
@inline Base.size(data::AbstractData, i::Integer) = size(data)[i]

"""
(Ni, Nj, Nf, Nv, Nh) = universal_size(data::AbstractData)
Expand All @@ -61,7 +61,7 @@ Note: this is similar to `Base.size`, except
that `universal_size` does not return 1
for the number of field components.
"""
function universal_size(data::AbstractData)
@inline function universal_size(data::AbstractData)
s = size(data)
return (s[1], s[2], ncomponents(data), s[4], s[5])
end
Expand Down
2 changes: 1 addition & 1 deletion src/MatrixFields/band_matrix_row.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ const PentadiagonalMatrixRow{T} = BandMatrixRow{-2, 5, T}
Gets the indices of the lower and upper diagonals, `ld` and `ud`, of the given
subtype of `BandMatrixRow`.
"""
outer_diagonals(::Type{<:BandMatrixRow{ld, bw}}) where {ld, bw} =
@inline outer_diagonals(::Type{<:BandMatrixRow{ld, bw}}) where {ld, bw} =
(ld, ld + bw - 1)

"""
Expand Down
227 changes: 147 additions & 80 deletions src/MatrixFields/matrix_multiplication.jl
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,47 @@ boundary_modified_ud(_, ud, column_space, i) = ud
boundary_modified_ud(::BottomRightMatrixCorner, ud, column_space, i) =
min(Operators.right_idx(column_space) - i, ud)

# TODO: Use @propagate_inbounds here, and remove @inbounds from this function.
# As of Julia 1.8, doing this increases compilation time by more than an order
# of magnitude, and it also makes type inference fail for some complicated
# matrix field broadcast expressions (in particular, those that involve products
# of linear combinations of matrix fields). Not using @propagate_inbounds causes
# matrix field broadcast expressions to take roughly 3 or 4 times longer to
# evaluate, but this is less significant than the decrease in compilation time.
function multiply_matrix_at_index(loc, space, idx, hidx, matrix1, arg, bc)
Base.@propagate_inbounds function multiply_matrix_at_index(
loc,
space,
idx,
hidx,
matrix1,
arg,
bc,
)
if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication
multiply_matrix_at_index_mat_mat(
loc,
space,
idx,
hidx,
matrix1,
arg,
bc,
)
else # matrix-vector multiplication
multiply_matrix_at_index_mat_vec(
loc,
space,
idx,
hidx,
matrix1,
arg,
bc,
)
end
end

Base.@propagate_inbounds function multiply_matrix_at_index_mat_mat(
loc,
space,
idx,
hidx,
matrix1,
arg,
bc,
)
lg = Geometry.LocalGeometry(space, idx, hidx)
prod_type = Operators.return_eltype(, matrix1, arg, typeof(lg))

Expand All @@ -359,81 +392,115 @@ function multiply_matrix_at_index(loc, space, idx, hidx, matrix1, arg, bc)
# recomputed multiple times.
matrix1_row = @inbounds Operators.getidx(space, matrix1, loc, idx, hidx)

if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication
matrix2 = arg
column_space2 = column_axes(matrix2, column_space1)
ld2, ud2 = outer_diagonals(eltype(matrix2))
prod_ld, prod_ud = outer_diagonals(prod_type)
boundary_modified_prod_ld =
boundary_modified_ld(bc, prod_ld, column_space2, idx)
boundary_modified_prod_ud =
boundary_modified_ud(bc, prod_ud, column_space2, idx)

# Precompute the rows that are needed from matrix2 so that they do not
# get recomputed multiple times. To avoid inference issues at boundary
# points, this is implemented as a padded map from ld1 to ud1, instead
# of as a map from boundary_modified_ld1 to boundary_modified_ud1. For
# simplicity, use zero padding for rows that are outside the matrix.
# Wrap the rows in a BandMatrixRow so that they can be easily indexed.
matrix2_rows = unrolled_map((ld1:ud1...,)) do d
# TODO: Use @propagate_inbounds_meta instead of @inline_meta.
Base.@_inline_meta
if isnothing(bc) ||
boundary_modified_ld1 <= d <= boundary_modified_ud1
@inbounds Operators.getidx(space, matrix2, loc, idx + d, hidx)
else
zero(eltype(matrix2)) # This row is outside the matrix.
end
matrix2 = arg
column_space2 = column_axes(matrix2, column_space1)
ld2, ud2 = outer_diagonals(eltype(matrix2))
prod_ld, prod_ud = outer_diagonals(prod_type)
boundary_modified_prod_ld =
boundary_modified_ld(bc, prod_ld, column_space2, idx)
boundary_modified_prod_ud =
boundary_modified_ud(bc, prod_ud, column_space2, idx)

# Precompute the rows that are needed from matrix2 so that they do not
# get recomputed multiple times. To avoid inference issues at boundary
# points, this is implemented as a padded map from ld1 to ud1, instead
# of as a map from boundary_modified_ld1 to boundary_modified_ud1. For
# simplicity, use zero padding for rows that are outside the matrix.
# Wrap the rows in a BandMatrixRow so that they can be easily indexed.
nt_mr = ntuple-> ld1 + ζ - 1, Val(ud1 - ld1 + 1))
matrix2_rows = unrolled_map(nt_mr) do d
# TODO: Use @propagate_inbounds_meta instead of @inline_meta.
Base.@_inline_meta
if isnothing(bc) || boundary_modified_ld1 <= d <= boundary_modified_ud1
@inbounds Operators.getidx(space, matrix2, loc, idx + d, hidx)
else
zero(eltype(matrix2)) # This row is outside the matrix.
end
matrix2_rows_wrapper = BandMatrixRow{ld1}(matrix2_rows...)

# Precompute the zero value to avoid inference issues caused by passing
# prod_type into the function closure below.
zero_value = rzero(eltype(prod_type))

# Compute the entries of the product matrix row. To avoid inference
# issues at boundary points, this is implemented as a padded map from
# prod_ld to prod_ud, instead of as a map from boundary_modified_prod_ld
# to boundary_modified_prod_ud. For simplicity, use zero padding for
# entries that are outside the matrix. Wrap the entries in a
# BandMatrixRow before returning them.
prod_entries = map((prod_ld:prod_ud...,)) do prod_d
# TODO: Use @propagate_inbounds_meta instead of @inline_meta.
Base.@_inline_meta
if isnothing(bc) ||
boundary_modified_prod_ld <= prod_d <= boundary_modified_prod_ud
prod_entry = zero_value
min_d = max(boundary_modified_ld1, prod_d - ud2)
max_d = min(boundary_modified_ud1, prod_d - ld2)
@inbounds for d in min_d:max_d
value1 = matrix1_row[d]
value2 = matrix2_rows_wrapper[d][prod_d - d]
value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
prod_entry = radd(
prod_entry,
rmul_with_projection(value1, value2, value2_lg),
end
matrix2_rows_wrapper = BandMatrixRow{ld1}(matrix2_rows...)

# Precompute the zero value to avoid inference issues caused by passing
# prod_type into the function closure below.
zero_value = rzero(eltype(prod_type))

# Compute the entries of the product matrix row. To avoid inference
# issues at boundary points, this is implemented as a padded map from
# prod_ld to prod_ud, instead of as a map from boundary_modified_prod_ld
# to boundary_modified_prod_ud. For simplicity, use zero padding for
# entries that are outside the matrix. Wrap the entries in a
# BandMatrixRow before returning them.
N = prod_ud - prod_ld + 1
nt_pe = ntuple-> prod_ld + ξ - 1, Val(N))
prod_entries = unrolled_map(nt_pe) do prod_d
# TODO: Use @propagate_inbounds_meta instead of @inline_meta.
Base.@_inline_meta
if isnothing(bc) ||
boundary_modified_prod_ld <=
prod_d <=
boundary_modified_prod_ud
prod_entry = zero_value
min_d = max(
boundary_modified_ld1,
prod_d - ud2,
)
max_d = min(
boundary_modified_ud1,
prod_d - ld2,
)
@inbounds for d in min_d:max_d
value1 = matrix1_row[d]
value2 =
matrix2_rows_wrapper[d][prod_d - d]
value2_lg =
Geometry.LocalGeometry(
space,
idx + d,
hidx,
)
end # Using a for-loop is currently faster than using mapreduce.
prod_entry
else
zero_value # This entry is outside the matrix.
end
prod_entry = radd(
prod_entry,
rmul_with_projection(value1, value2, value2_lg),
)
end # Using a for-loop is currently faster than using mapreduce.
prod_entry
else
zero_value # This entry is outside the matrix.
end
return BandMatrixRow{prod_ld}(prod_entries...)
else # matrix-vector multiplication
vector = arg
prod_value = rzero(prod_type)
@inbounds for d in boundary_modified_ld1:boundary_modified_ud1
value1 = matrix1_row[d]
value2 = Operators.getidx(space, vector, loc, idx + d, hidx)
value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
prod_value = radd(
prod_value,
rmul_with_projection(value1, value2, value2_lg),
)
end # Using a for-loop is currently faster than using mapreduce.
return prod_value
end
end::NTuple{N, eltype(prod_type)}
return BandMatrixRow{prod_ld}(prod_entries...)
end

Base.@propagate_inbounds function multiply_matrix_at_index_mat_vec(
loc,
space,
idx,
hidx,
matrix1,
arg,
bc,
)
lg = Geometry.LocalGeometry(space, idx, hidx)
prod_type = Operators.return_eltype(, matrix1, arg, typeof(lg))

column_space1 = column_axes(matrix1, space)
ld1, ud1 = outer_diagonals(eltype(matrix1))
boundary_modified_ld1 = boundary_modified_ld(bc, ld1, column_space1, idx)
boundary_modified_ud1 = boundary_modified_ud(bc, ud1, column_space1, idx)

# Precompute the row that is needed from matrix1 so that it does not get
# recomputed multiple times.
matrix1_row = @inbounds Operators.getidx(space, matrix1, loc, idx, hidx)

vector = arg
prod_value = rzero(prod_type)::prod_type
@inbounds for d in boundary_modified_ld1:boundary_modified_ud1
value1 = matrix1_row[d]
value2 = Operators.getidx(space, vector, loc, idx + d, hidx)
value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
prod_value =
radd(prod_value, rmul_with_projection(value1, value2, value2_lg))
end # Using a for-loop is currently faster than using mapreduce.
return prod_value
end

Base.@propagate_inbounds Operators.stencil_interior(
Expand Down
Loading

0 comments on commit 5c0e842

Please sign in to comment.