Apply some Base.propagate_inbounds

wip Try simplifying multiply_matrix_at_index try simplifying multiply_matrix_at_index
CliMA · Jun 28, 2024 · 5c0e842 · 5c0e842
1 parent ad45727
commit 5c0e842
Show file tree

Hide file tree

Showing 7 changed files with 274 additions and 202 deletions.
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
@@ -66,21 +66,35 @@ function copyto_stencil_kernel!(
             (v, i, j, h) = cart_ind((Nv, Nq, Nq, Nh), gid).I
             hidx = (i, j, h)
             idx = v - 1 + li
-            window =
-                idx < lw ?
-                LeftBoundaryWindow{Spaces.left_boundary_name(space)}() :
-                (
-                    idx > rw ?
-                    RightBoundaryWindow{Spaces.right_boundary_name(space)}() :
-                    Interior()
+            if idx < lw
+                lwindow = LeftBoundaryWindow{Spaces.left_boundary_name(space)}()
+                setidx!(
+                    space,
+                    out,
+                    idx,
+                    hidx,
+                    Operators.getidx(space, bc, lwindow, idx, hidx),
                 )
-            setidx!(
-                space,
-                out,
-                idx,
-                hidx,
-                Operators.getidx(space, bc, window, idx, hidx),
-            )
+            elseif idx > rw
+                rwindow =
+                    RightBoundaryWindow{Spaces.right_boundary_name(space)}()
+                setidx!(
+                    space,
+                    out,
+                    idx,
+                    hidx,
+                    Operators.getidx(space, bc, rwindow, idx, hidx),
+                )
+            else
+                iwindow = Interior()
+                setidx!(
+                    space,
+                    out,
+                    idx,
+                    hidx,
+                    Operators.getidx(space, bc, iwindow, idx, hidx),
+                )
+            end
         end
     end
     return nothing

diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl
@@ -44,7 +44,7 @@ include("struct.jl")
 
 abstract type AbstractData{S} end
 
-Base.size(data::AbstractData, i::Integer) = size(data)[i]
+@inline Base.size(data::AbstractData, i::Integer) = size(data)[i]
 
 """
     (Ni, Nj, Nf, Nv, Nh) = universal_size(data::AbstractData)
@@ -61,7 +61,7 @@ Note: this is similar to `Base.size`, except
       that `universal_size` does not return 1
       for the number of field components.
 """
-function universal_size(data::AbstractData)
+@inline function universal_size(data::AbstractData)
     s = size(data)
     return (s[1], s[2], ncomponents(data), s[4], s[5])
 end

diff --git a/src/MatrixFields/band_matrix_row.jl b/src/MatrixFields/band_matrix_row.jl
@@ -37,7 +37,7 @@ const PentadiagonalMatrixRow{T} = BandMatrixRow{-2, 5, T}
 Gets the indices of the lower and upper diagonals, `ld` and `ud`, of the given
 subtype of `BandMatrixRow`.
 """
-outer_diagonals(::Type{<:BandMatrixRow{ld, bw}}) where {ld, bw} =
+@inline outer_diagonals(::Type{<:BandMatrixRow{ld, bw}}) where {ld, bw} =
     (ld, ld + bw - 1)
 
 """

diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl
@@ -339,14 +339,47 @@ boundary_modified_ud(_, ud, column_space, i) = ud
 boundary_modified_ud(::BottomRightMatrixCorner, ud, column_space, i) =
     min(Operators.right_idx(column_space) - i, ud)
 
-# TODO: Use @propagate_inbounds here, and remove @inbounds from this function.
-# As of Julia 1.8, doing this increases compilation time by more than an order
-# of magnitude, and it also makes type inference fail for some complicated
-# matrix field broadcast expressions (in particular, those that involve products
-# of linear combinations of matrix fields). Not using @propagate_inbounds causes
-# matrix field broadcast expressions to take roughly 3 or 4 times longer to
-# evaluate, but this is less significant than the decrease in compilation time.
-function multiply_matrix_at_index(loc, space, idx, hidx, matrix1, arg, bc)
+Base.@propagate_inbounds function multiply_matrix_at_index(
+    loc,
+    space,
+    idx,
+    hidx,
+    matrix1,
+    arg,
+    bc,
+)
+    if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication
+        multiply_matrix_at_index_mat_mat(
+            loc,
+            space,
+            idx,
+            hidx,
+            matrix1,
+            arg,
+            bc,
+        )
+    else # matrix-vector multiplication
+        multiply_matrix_at_index_mat_vec(
+            loc,
+            space,
+            idx,
+            hidx,
+            matrix1,
+            arg,
+            bc,
+        )
+    end
+end
+
+Base.@propagate_inbounds function multiply_matrix_at_index_mat_mat(
+    loc,
+    space,
+    idx,
+    hidx,
+    matrix1,
+    arg,
+    bc,
+)
     lg = Geometry.LocalGeometry(space, idx, hidx)
     prod_type = Operators.return_eltype(⋅, matrix1, arg, typeof(lg))
 
@@ -359,81 +392,115 @@ function multiply_matrix_at_index(loc, space, idx, hidx, matrix1, arg, bc)
     # recomputed multiple times.
     matrix1_row = @inbounds Operators.getidx(space, matrix1, loc, idx, hidx)
 
-    if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication
-        matrix2 = arg
-        column_space2 = column_axes(matrix2, column_space1)
-        ld2, ud2 = outer_diagonals(eltype(matrix2))
-        prod_ld, prod_ud = outer_diagonals(prod_type)
-        boundary_modified_prod_ld =
-            boundary_modified_ld(bc, prod_ld, column_space2, idx)
-        boundary_modified_prod_ud =
-            boundary_modified_ud(bc, prod_ud, column_space2, idx)
-
-        # Precompute the rows that are needed from matrix2 so that they do not
-        # get recomputed multiple times. To avoid inference issues at boundary
-        # points, this is implemented as a padded map from ld1 to ud1, instead
-        # of as a map from boundary_modified_ld1 to boundary_modified_ud1. For
-        # simplicity, use zero padding for rows that are outside the matrix.
-        # Wrap the rows in a BandMatrixRow so that they can be easily indexed.
-        matrix2_rows = unrolled_map((ld1:ud1...,)) do d
-            # TODO: Use @propagate_inbounds_meta instead of @inline_meta.
-            Base.@_inline_meta
-            if isnothing(bc) ||
-               boundary_modified_ld1 <= d <= boundary_modified_ud1
-                @inbounds Operators.getidx(space, matrix2, loc, idx + d, hidx)
-            else
-                zero(eltype(matrix2)) # This row is outside the matrix.
-            end
+    matrix2 = arg
+    column_space2 = column_axes(matrix2, column_space1)
+    ld2, ud2 = outer_diagonals(eltype(matrix2))
+    prod_ld, prod_ud = outer_diagonals(prod_type)
+    boundary_modified_prod_ld =
+        boundary_modified_ld(bc, prod_ld, column_space2, idx)
+    boundary_modified_prod_ud =
+        boundary_modified_ud(bc, prod_ud, column_space2, idx)
+
+    # Precompute the rows that are needed from matrix2 so that they do not
+    # get recomputed multiple times. To avoid inference issues at boundary
+    # points, this is implemented as a padded map from ld1 to ud1, instead
+    # of as a map from boundary_modified_ld1 to boundary_modified_ud1. For
+    # simplicity, use zero padding for rows that are outside the matrix.
+    # Wrap the rows in a BandMatrixRow so that they can be easily indexed.
+    nt_mr = ntuple(ζ -> ld1 + ζ - 1, Val(ud1 - ld1 + 1))
+    matrix2_rows = unrolled_map(nt_mr) do d
+        # TODO: Use @propagate_inbounds_meta instead of @inline_meta.
+        Base.@_inline_meta
+        if isnothing(bc) || boundary_modified_ld1 <= d <= boundary_modified_ud1
+            @inbounds Operators.getidx(space, matrix2, loc, idx + d, hidx)
+        else
+            zero(eltype(matrix2)) # This row is outside the matrix.
         end
-        matrix2_rows_wrapper = BandMatrixRow{ld1}(matrix2_rows...)
-
-        # Precompute the zero value to avoid inference issues caused by passing
-        # prod_type into the function closure below.
-        zero_value = rzero(eltype(prod_type))
-
-        # Compute the entries of the product matrix row. To avoid inference
-        # issues at boundary points, this is implemented as a padded map from
-        # prod_ld to prod_ud, instead of as a map from boundary_modified_prod_ld
-        # to boundary_modified_prod_ud. For simplicity, use zero padding for
-        # entries that are outside the matrix. Wrap the entries in a
-        # BandMatrixRow before returning them.
-        prod_entries = map((prod_ld:prod_ud...,)) do prod_d
-            # TODO: Use @propagate_inbounds_meta instead of @inline_meta.
-            Base.@_inline_meta
-            if isnothing(bc) ||
-               boundary_modified_prod_ld <= prod_d <= boundary_modified_prod_ud
-                prod_entry = zero_value
-                min_d = max(boundary_modified_ld1, prod_d - ud2)
-                max_d = min(boundary_modified_ud1, prod_d - ld2)
-                @inbounds for d in min_d:max_d
-                    value1 = matrix1_row[d]
-                    value2 = matrix2_rows_wrapper[d][prod_d - d]
-                    value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
-                    prod_entry = radd(
-                        prod_entry,
-                        rmul_with_projection(value1, value2, value2_lg),
+    end
+    matrix2_rows_wrapper = BandMatrixRow{ld1}(matrix2_rows...)
+
+    # Precompute the zero value to avoid inference issues caused by passing
+    # prod_type into the function closure below.
+    zero_value = rzero(eltype(prod_type))
+
+    # Compute the entries of the product matrix row. To avoid inference
+    # issues at boundary points, this is implemented as a padded map from
+    # prod_ld to prod_ud, instead of as a map from boundary_modified_prod_ld
+    # to boundary_modified_prod_ud. For simplicity, use zero padding for
+    # entries that are outside the matrix. Wrap the entries in a
+    # BandMatrixRow before returning them.
+    N = prod_ud - prod_ld + 1
+    nt_pe = ntuple(ξ -> prod_ld + ξ - 1, Val(N))
+    prod_entries = unrolled_map(nt_pe) do prod_d
+        # TODO: Use @propagate_inbounds_meta instead of @inline_meta.
+        Base.@_inline_meta
+        if isnothing(bc) ||
+           boundary_modified_prod_ld <=
+           prod_d <=
+           boundary_modified_prod_ud
+            prod_entry = zero_value
+            min_d = max(
+                boundary_modified_ld1,
+                prod_d - ud2,
+            )
+            max_d = min(
+                boundary_modified_ud1,
+                prod_d - ld2,
+            )
+            @inbounds for d in min_d:max_d
+                value1 = matrix1_row[d]
+                value2 =
+                    matrix2_rows_wrapper[d][prod_d - d]
+                value2_lg =
+                    Geometry.LocalGeometry(
+                        space,
+                        idx + d,
+                        hidx,
                     )
-                end # Using a for-loop is currently faster than using mapreduce.
-                prod_entry
-            else
-                zero_value # This entry is outside the matrix.
-            end
+                prod_entry = radd(
+                    prod_entry,
+                    rmul_with_projection(value1, value2, value2_lg),
+                )
+            end # Using a for-loop is currently faster than using mapreduce.
+            prod_entry
+        else
+            zero_value # This entry is outside the matrix.
         end
-        return BandMatrixRow{prod_ld}(prod_entries...)
-    else # matrix-vector multiplication
-        vector = arg
-        prod_value = rzero(prod_type)
-        @inbounds for d in boundary_modified_ld1:boundary_modified_ud1
-            value1 = matrix1_row[d]
-            value2 = Operators.getidx(space, vector, loc, idx + d, hidx)
-            value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
-            prod_value = radd(
-                prod_value,
-                rmul_with_projection(value1, value2, value2_lg),
-            )
-        end # Using a for-loop is currently faster than using mapreduce.
-        return prod_value
-    end
+    end::NTuple{N, eltype(prod_type)}
+    return BandMatrixRow{prod_ld}(prod_entries...)
+end
+
+Base.@propagate_inbounds function multiply_matrix_at_index_mat_vec(
+    loc,
+    space,
+    idx,
+    hidx,
+    matrix1,
+    arg,
+    bc,
+)
+    lg = Geometry.LocalGeometry(space, idx, hidx)
+    prod_type = Operators.return_eltype(⋅, matrix1, arg, typeof(lg))
+
+    column_space1 = column_axes(matrix1, space)
+    ld1, ud1 = outer_diagonals(eltype(matrix1))
+    boundary_modified_ld1 = boundary_modified_ld(bc, ld1, column_space1, idx)
+    boundary_modified_ud1 = boundary_modified_ud(bc, ud1, column_space1, idx)
+
+    # Precompute the row that is needed from matrix1 so that it does not get
+    # recomputed multiple times.
+    matrix1_row = @inbounds Operators.getidx(space, matrix1, loc, idx, hidx)
+
+    vector = arg
+    prod_value = rzero(prod_type)::prod_type
+    @inbounds for d in boundary_modified_ld1:boundary_modified_ud1
+        value1 = matrix1_row[d]
+        value2 = Operators.getidx(space, vector, loc, idx + d, hidx)
+        value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
+        prod_value =
+            radd(prod_value, rmul_with_projection(value1, value2, value2_lg))
+    end # Using a for-loop is currently faster than using mapreduce.
+    return prod_value
 end
 
 Base.@propagate_inbounds Operators.stencil_interior(