From f002a5078cbaeb4b35692350b9c51fc9be330c55 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 22 Jun 2022 19:32:17 -0700
Subject: [PATCH 01/16] apply operators byslab

---
 src/DataLayouts/DataLayouts.jl   |  92 +++-
 src/DataLayouts/struct.jl        |  65 ++-
 src/Fields/Fields.jl             |   2 +-
 src/Fields/indices.jl            |  51 ++
 src/Operators/spectralelement.jl | 778 +++++++++++++++----------------
 test/DataLayouts/data2d.jl       |   6 +-
 6 files changed, 563 insertions(+), 431 deletions(-)

diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl
index 58e2f2ec84..dbec7a5a16 100644
--- a/src/DataLayouts/DataLayouts.jl
+++ b/src/DataLayouts/DataLayouts.jl
@@ -296,6 +296,19 @@ function replace_basetype(data::IJFH{S, Nij}, ::Type{T}) where {S, Nij, T}
     return IJFH{S′, Nij}(similar(array, T))
 end
 
+@inline function Base.getindex(data::IJFH{S}, i, j, _, _, h) where {S}
+    @inbounds get_struct(parent(data), S, Val(3), CartesianIndex(i, j, 1, h))
+end
+@inline function Base.setindex!(data::IJFH{S}, val, i, j, _, _, h) where {S}
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(3),
+        CartesianIndex(i, j, 1, h),
+    )
+end
+
+
 Base.length(data::IJFH) = size(parent(data), 4)
 
 @generated function _property_view(
@@ -455,6 +468,18 @@ end
     IFH{SS, Ni}(dataview)
 end
 
+@inline function Base.getindex(data::IFH{S}, i, _, _, _, h) where {S}
+    @inbounds get_struct(parent(data), S, Val(2), CartesianIndex(i, 1, h))
+end
+@inline function Base.setindex!(data::IFH{S}, val, i, _, _, _, h) where {S}
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(3),
+        CartesianIndex(i, 1, h),
+    )
+end
+
 # ======================
 # Data0D DataLayout
 # ======================
@@ -523,7 +548,7 @@ end
 end
 
 Base.@propagate_inbounds function Base.getindex(data::DataF{S}) where {S}
-    @inbounds get_struct(parent(data), S)
+    @inbounds get_struct(parent(data), S, Val(1), CartesianIndex(1))
 end
 
 @propagate_inbounds function Base.getindex(col::Data0D, I::CartesianIndex{5})
@@ -531,7 +556,12 @@ end
 end
 
 Base.@propagate_inbounds function Base.setindex!(data::DataF{S}, val) where {S}
-    @inbounds set_struct!(parent(data), convert(S, val))
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(1),
+        CartesianIndex(1),
+    )
 end
 
 @propagate_inbounds function Base.setindex!(
@@ -646,8 +676,7 @@ end
 ) where {S, Nij}
     @boundscheck (1 <= i <= Nij && 1 <= j <= Nij) ||
                  throw(BoundsError(data, (i, j)))
-    dataview = @inbounds view(parent(data), i, j, :)
-    @inbounds get_struct(dataview, S)
+    @inbounds get_struct(parent(data), S, Val(3), CartesianIndex(i, j, 1))
 end
 
 @inline function Base.setindex!(
@@ -658,8 +687,12 @@ end
 ) where {S, Nij}
     @boundscheck (1 <= i <= Nij && 1 <= j <= Nij) ||
                  throw(BoundsError(data, (i, j)))
-    dataview = @inbounds view(parent(data), i, j, :)
-    set_struct!(dataview, convert(S, val))
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(3),
+        CartesianIndex(i, j, 1),
+    )
 end
 
 @inline function column(data::IJF{S, Nij}, i, j) where {S, Nij}
@@ -758,14 +791,17 @@ end
 
 @inline function Base.getindex(data::IF{S, Ni}, i::Integer) where {S, Ni}
     @boundscheck (1 <= i <= Ni) || throw(BoundsError(data, (i,)))
-    dataview = @inbounds view(parent(data), i, :)
-    @inbounds get_struct(dataview, S)
+    @inbounds get_struct(parent(data), S, Val(2), CartesianIndex(i, 1))
 end
 
 @inline function Base.setindex!(data::IF{S, Ni}, val, i::Integer) where {S, Ni}
     @boundscheck (1 <= i <= Ni) || throw(BoundsError(data, (i,)))
-    dataview = @inbounds view(parent(data), i, :)
-    set_struct!(dataview, convert(S, val))
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(2),
+        CartesianIndex(i, 1),
+    )
 end
 
 @inline function column(data::IF{S, Ni}, i) where {S, Ni}
@@ -851,8 +887,7 @@ end
 @inline function Base.getindex(data::VF{S}, v::Integer) where {S}
     @boundscheck 1 <= v <= size(parent(data), 1) ||
                  throw(BoundsError(data, (v,)))
-    dataview = @inbounds view(parent(data), v, :)
-    @inbounds get_struct(dataview, S)
+    @inbounds get_struct(parent(data), S, Val(2), CartesianIndex(v, 1))
 end
 
 @propagate_inbounds function Base.getindex(
@@ -873,8 +908,12 @@ end
 @inline function Base.setindex!(data::VF{S}, val, v::Integer) where {S}
     @boundscheck (1 <= v <= length(parent(data))) ||
                  throw(BoundsError(data, (v,)))
-    dataview = @inbounds view(parent(data), v, :)
-    @inbounds set_struct!(dataview, convert(S, val))
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(2),
+        CartesianIndex(v, 1),
+    )
 end
 
 @inline function column(data::VF, i, h)
@@ -1039,6 +1078,19 @@ function gather(
         nothing
     end
 end
+
+@inline function Base.getindex(data::VIJFH{S}, i, j, _, v, h) where {S}
+    @inbounds get_struct(parent(data), S, Val(4), CartesianIndex(v, i, j, 1, h))
+end
+@inline function Base.setindex!(data::VIJFH{S}, val, i, j, _, v, h) where {S}
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(4),
+        CartesianIndex(v, i, j, 1, h),
+    )
+end
+
 # ======================
 # Data1DX DataLayout
 # ======================
@@ -1166,6 +1218,18 @@ end
     IFH{S, Nij}(dataview)
 end
 
+@inline function Base.getindex(data::VIFH{S}, i, _, _, v, h) where {S}
+    @inbounds get_struct(parent(data), S, Val(3), CartesianIndex(v, i, 1, h))
+end
+@inline function Base.setindex!(data::VIFH{S}, val, i, _, _, v, h) where {S}
+    @inbounds set_struct!(
+        parent(data),
+        convert(S, val),
+        Val(3),
+        CartesianIndex(v, i, 1, h),
+    )
+end
+
 # =========================================
 # Special DataLayouts for regular gridding
 # =========================================
diff --git a/src/DataLayouts/struct.jl b/src/DataLayouts/struct.jl
index ccd485b327..38abbf5686 100644
--- a/src/DataLayouts/struct.jl
+++ b/src/DataLayouts/struct.jl
@@ -164,16 +164,26 @@ typesize(::Type{T}, ::Type{S}) where {T, S} = div(sizeof(S), sizeof(T))
     return out
 end
 
+@inline offset_index(
+    start_index::CartesianIndex{N},
+    ::Val{D},
+    offset,
+) where {N, D} = CartesianIndex(
+    ntuple(n -> n == D ? start_index[n] + offset : start_index[n], N),
+)
+
 """
-    get_struct(array, S[, offset=0])
+    get_struct(array, S, Val(D), start_index)
 
-Construct an object of type `S` from the values of `array`, optionally offset by `offset` from the start of the array.
+Construct an object of type `S` packed along the `D` dimension, from the values of `array`,
+starting at `start_index`.
 """
 Base.@propagate_inbounds @generated function get_struct(
     array::AbstractArray{T},
     ::Type{S},
-    offset,
-) where {T, S}
+    ::Val{D},
+    start_index::CartesianIndex,
+) where {T, S, D}
     tup = :(())
     for i in 1:fieldcount(S)
         push!(
@@ -181,7 +191,8 @@ Base.@propagate_inbounds @generated function get_struct(
             :(get_struct(
                 array,
                 fieldtype(S, $i),
-                offset + fieldtypeoffset(T, S, $i),
+                Val($D),
+                offset_index(start_index, Val($D), fieldtypeoffset(T, S, $i)),
             )),
         )
     end
@@ -189,29 +200,37 @@ Base.@propagate_inbounds @generated function get_struct(
         Base.@_propagate_inbounds_meta
         @inbounds bypass_constructor(S, $tup)
     end
+    # else
+    #     Base.@_propagate_inbounds_meta
+    #     args = ntuple(fieldcount(S)) do i
+    #         get_struct(array, fieldtype(S, i), Val(D), offset_index(start_index, Val(D), fieldtypeoffset(T, S, i)))
+    #     end
+    #     return bypass_constructor(S, args)
+    # end
 end
 
 # recursion base case: hit array type is the same as the struct leaf type
 Base.@propagate_inbounds function get_struct(
     array::AbstractArray{S},
     ::Type{S},
-    offset,
-) where {S}
-    return @inbounds array[offset + 1]
+    ::Val{D},
+    start_index::CartesianIndex,
+) where {S, D}
+    @inbounds return array[start_index]
 end
 
-Base.@propagate_inbounds function get_struct(
-    array::AbstractArray{T},
-    ::Type{S},
-) where {T, S}
-    @inbounds get_struct(array, S, 0)
-end
+"""
+    set_struct!(array, val::S, Val(D), start_index)
 
+Store an object `val` of type `S` packed along the `D` dimension, into `array`,
+starting at `start_index`.
+"""
 Base.@propagate_inbounds @generated function set_struct!(
     array::AbstractArray{T},
     val::S,
-    offset,
-) where {T, S}
+    ::Val{D},
+    start_index::CartesianIndex,
+) where {T, S, D}
     ex = quote
         Base.@_propagate_inbounds_meta
     end
@@ -221,7 +240,8 @@ Base.@propagate_inbounds @generated function set_struct!(
             :(set_struct!(
                 array,
                 getfield(val, $i),
-                offset + fieldtypeoffset(T, S, $i),
+                Val($D),
+                offset_index(start_index, Val($D), fieldtypeoffset(T, S, $i)),
             )),
         )
     end
@@ -232,16 +252,13 @@ end
 Base.@propagate_inbounds function set_struct!(
     array::AbstractArray{S},
     val::S,
-    offset,
-) where {S}
-    @inbounds array[offset + 1] = val
+    ::Val{D},
+    index::CartesianIndex,
+) where {S, D}
+    @inbounds array[index] = val
     val
 end
 
-Base.@propagate_inbounds function set_struct!(array, val)
-    @inbounds set_struct!(array, val, 0)
-end
-
 # For complex nested types (ex. wrapped SMatrix) we hit a recursion limit and de-optimize
 # We know the recursion will terminate due to the fact that bitstype fields
 # cannot be self referential so there are no cycles in get/set_struct (bounded tree)
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index 03d78d4413..479b0b2d7c 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -8,7 +8,7 @@ import ..Domains
 import ..Topologies
 import ..Spaces: Spaces, AbstractSpace, AbstractPointSpace
 import ..Geometry: Geometry, Cartesian12Vector
-import ..Utilities: PlusHalf
+import ..Utilities: PlusHalf, half
 
 using ..RecursiveApply
 using ClimaComms
diff --git a/src/Fields/indices.jl b/src/Fields/indices.jl
index 71d7440d61..be0ba2da85 100644
--- a/src/Fields/indices.jl
+++ b/src/Fields/indices.jl
@@ -128,3 +128,54 @@ end
 
 # potential TODO:
 # - define a ColumnIndices type, make it work with https://github.com/JuliaFolds/FLoops.jl
+
+
+
+struct SlabIndex{VIdx}
+    v::VIdx
+    h::Int
+end
+
+
+Base.getindex(field::Field, slabidx::SlabIndex) = slab(field, slabidx)
+
+function slab(field::SpectralElementField, slabidx::SlabIndex{Nothing})
+    slab(field, slabidx.h)
+end
+function slab(
+    field::CenterExtrudedFiniteDifferenceField,
+    slabidx::SlabIndex{Int},
+)
+    slab(field, slabidx.v, slabidx.h)
+end
+function slab(
+    field::FaceExtrudedFiniteDifferenceField,
+    slabidx::SlabIndex{PlusHalf{Int}},
+)
+    slab(field, slabidx.v + half, slabidx.h)
+end
+
+function byslab(fn, space::Spaces.AbstractSpectralElementSpace)
+    Nh = Topologies.nlocalelems(space.topology)::Int
+    for h in 1:Nh
+        fn(SlabIndex(nothing, h))
+    end
+end
+function byslab(fn, space::Spaces.CenterExtrudedFiniteDifferenceSpace)
+    Nh = Topologies.nlocalelems(Spaces.topology(space))
+    Nv = Spaces.nlevels(space)
+    for h in 1:Nh
+        for v in 1:Nv
+            fn(SlabIndex(v, h))
+        end
+    end
+end
+function byslab(fn, space::Spaces.FaceExtrudedFiniteDifferenceSpace)
+    Nh = Topologies.nlocalelems(Spaces.topology(space))
+    Nv = Spaces.nlevels(space)
+    for h in 1:Nh
+        for v in 1:Nv
+            fn(SlabIndex(v - half, h))
+        end
+    end
+end
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index ab2bd09f37..c4e99f5b0a 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -12,9 +12,9 @@ Represents an operation that is applied to each element.
 Subtypes `Op` of this should define the following:
 - [`operator_return_eltype(::Op, ElTypes...)`](@ref)
 - [`allocate_work(::Op, args...)`](@ref)
-- [`apply_slab(::Op, work, args...)`](@ref)
+- [`apply_operator(::Op, work, args...)`](@ref)
 
-Additionally, the result type `OpResult <: OperatorSlabResult` of `apply_slab` should define `get_node(::OpResult, i, j)`.
+Additionally, the result type `OpResult <: OperatorSlabResult` of `apply_operator` should define `get_node(::OpResult, ij, slabidx)`.
 """
 abstract type SpectralElementOperator end
 
@@ -28,9 +28,26 @@ function operator_axes end
 operator_axes(space::Spaces.AbstractSpace) = ()
 operator_axes(space::Spaces.SpectralElementSpace1D) = (1,)
 operator_axes(space::Spaces.SpectralElementSpace2D) = (1, 2)
+operator_axes(space::Spaces.SpectralElementSpaceSlab1D) = (1,)
+operator_axes(space::Spaces.SpectralElementSpaceSlab2D) = (1, 2)
 operator_axes(space::Spaces.ExtrudedFiniteDifferenceSpace) =
     operator_axes(space.horizontal_space)
 
+
+function node_indices(space::Spaces.SpectralElementSpace1D)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    CartesianIndices((Nq,))
+end
+function node_indices(space::Spaces.SpectralElementSpace2D)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    CartesianIndices((Nq, Nq))
+end
+node_indices(space::Spaces.ExtrudedFiniteDifferenceSpace) =
+    node_indices(space.horizontal_space)
+
+
 """
     SpectralBroadcasted{Style}(op, args[,axes[, work]])
 
@@ -151,206 +168,201 @@ function Base.Broadcast.materialize(sbc::SpectralBroadcasted)
     copy(Base.Broadcast.instantiate(sbc))
 end
 
-Base.@propagate_inbounds function _inner_copyto!(
-    field_out::Field,
-    sbc::SpectralBroadcasted,
-    v::Integer,
-    h::Integer,
-)
-    slab_out = slab(field_out, v, h)
-    out_slab_space = slab(axes(sbc), v, h)
-    in_slab_space = slab(input_space(sbc), v, h)
-    _slab_args = _apply_slab_args(slab_args(sbc.args, v, h), v, h)
-    copy_slab!(
-        slab_out,
-        apply_slab(sbc.op, out_slab_space, in_slab_space, _slab_args...),
-    )
-end
-
-function _serial_copyto!(
-    field_out::Field,
-    sbc::SpectralBroadcasted,
-    Nv::Int,
-    Nh::Int,
-)
-    @inbounds for h in 1:Nh, v in 1:Nv
-        _inner_copyto!(field_out, sbc, v, h)
+function Base.copyto!(out::Field, sbc::SpectralBroadcasted)
+    Fields.byslab(axes(out)) do slabidx
+        copyto_slab!(out, sbc, slabidx)
     end
-    return field_out
-end
-
-function _threaded_copyto!(
-    field_out::Field,
-    sbc::SpectralBroadcasted,
-    Nv::Int,
-    Nh::Int,
-)
-    @inbounds begin
-        Threads.@threads for h in 1:Nh
-            for v in 1:Nv
-                _inner_copyto!(field_out, sbc, v, h)
-            end
-        end
-    end
-    return field_out
-end
-
-function Base.copyto!(field_out::Field, sbc::SpectralBroadcasted)
-    space = axes(field_out)
-    Nv = Spaces.nlevels(space)
-    Nh = Topologies.nlocalelems(Spaces.topology(space))
-    if enable_threading()
-        return _threaded_copyto!(field_out, sbc, Nv, Nh)
-    end
-    return _serial_copyto!(field_out, sbc, Nv, Nh)
+    return out
 end
 
 function Base.Broadcast.materialize!(dest, sbc::SpectralBroadcasted)
     copyto!(dest, Base.Broadcast.instantiate(sbc))
 end
 
-Base.@propagate_inbounds function slab(
-    sbc::SpectralBroadcasted{Style},
-    inds...,
-) where {Style <: SpectralStyle}
-    _args = slab_args(sbc.args, inds...)
-    _axes = slab(axes(sbc), inds...)
-    SpectralBroadcasted{Style}(sbc.op, _args, _axes, sbc.input_space)
-end
-
-Base.@propagate_inbounds function slab(
-    bc::Base.Broadcast.Broadcasted{Style},
-    inds...,
-) where {Style <: AbstractSpectralStyle}
-    _args = slab_args(bc.args, inds...)
-    _axes = slab(axes(bc), inds...)
-    Base.Broadcast.Broadcasted{Style}(bc.f, _args, _axes)
-end
-
 function Base.copyto!(
-    field_out::Field,
+    out::Field,
     bc::Base.Broadcast.Broadcasted{Style},
 ) where {Style <: AbstractSpectralStyle}
-    space = axes(field_out)
-    topology = Spaces.topology(space)
-    Nv = Spaces.nlevels(space)::Int
-    Nh = Topologies.nlocalelems(topology)::Int
-    @inbounds for h in 1:Nh, v in 1:Nv
-        slab_out = slab(field_out, v, h)
-        _slab_args = _apply_slab_args(slab_args(bc.args, v, h), v, h)
-        copy_slab!(
-            slab_out,
-            Base.Broadcast.Broadcasted{Style}(bc.f, _slab_args, axes(slab_out)),
-        )
+    Fields.byslab(axes(out)) do slabidx
+        copyto_slab!(out, bc, slabidx)
     end
-    return field_out
+    return out
 end
 
-function copy_slab!(slab_out::Fields.SlabField1D, res)
-    space = axes(slab_out)
-    Nq = Quadratures.degrees_of_freedom(space.quadrature_style)::Int
-    @inbounds for i in 1:Nq
-        set_node!(slab_out, i, get_node(res, i))
-    end
-    return slab_out
-end
+"""
+    copyto_slab!(out, bc, slabidx)
 
-function copy_slab!(slab_out::Fields.SlabField2D, res)
-    space = axes(slab_out)
-    Nq = Quadratures.degrees_of_freedom(space.quadrature_style)::Int
-    @inbounds for j in 1:Nq, i in 1:Nq
-        set_node!(slab_out, i, j, get_node(res, i, j))
+Copy the slab indexed by `slabidx` from `bc` to `out`.
+"""
+function copyto_slab!(out, bc, slabidx)
+    space = axes(out)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    rbc = resolve_operator(bc, slabidx)
+    @inbounds for ij in node_indices(axes(out))
+        set_node!(out, ij, slabidx, get_node(rbc, ij, slabidx))
     end
-    return slab_out
+    return nothing
 end
 
-# 1D get/set node
 
-# Recursively call get_node() on broadcast arguments in a way that is statically reducible by the optimizer
-# see Base.Broadcast.preprocess_args
-@inline node_args(args::Tuple, inds...) =
-    (get_node(args[1], inds...), node_args(Base.tail(args), inds...)...)
-@inline node_args(args::Tuple{Any}, inds...) = (get_node(args[1], inds...),)
-@inline node_args(args::Tuple{}, inds...) = ()
+"""
+    resolve_operator(bc, slabidx)
 
-# 1D intermediate slab data types
-Base.@propagate_inbounds function get_node(v::MVector, i)
-    v[i]
+Recursively evaluate any operators in `bc` at `slabidx`, replacing any
+`SpectralBroadcasted` objects with an `IF`/`IJF` object.
+"""
+function resolve_operator(bc::SpectralBroadcasted, slabidx)
+    args = _resolve_operator_args(slabidx, bc.args...)
+    apply_operator(bc.op, bc.axes, slabidx, args...)
 end
-
-Base.@propagate_inbounds function get_node(v::SVector, i)
-    v[i]
+function resolve_operator(
+    bc::Base.Broadcast.Broadcasted{CompositeSpectralStyle},
+    slabidx,
+)
+    args = _resolve_operator_args(slabidx, bc.args...)
+    Base.Broadcast.Broadcasted{CompositeSpectralStyle}(bc.f, args, bc.axes)
 end
+resolve_operator(x, slabidx) = x
+
+"""
+    _resolve_operator(slabidx, args...)
+
+Calls `resolve_operator(arg, slabidx)` for each `arg` in `args`
+"""
+_resolve_operator_args(slabidx) = ()
+@inline _resolve_operator_args(slabidx, arg, xargs...) = (
+    resolve_operator(arg, slabidx),
+    _resolve_operator_args(slabidx, xargs...)...,
+)
+
 
-Base.@propagate_inbounds function get_node(scalar, i)
+
+_get_node(ij, slabidx) = ()
+_get_node(ij, slabidx, arg, xargs...) =
+    (get_node(arg, ij, slabidx), _get_node(ij, slabidx, xargs...)...)
+
+Base.@propagate_inbounds function get_node(scalar, ij, slabidx)
     scalar[]
 end
-
-Base.@propagate_inbounds function get_node(field::Fields.SlabField1D, i)
-    getindex(Fields.field_values(field), i)
+Base.@propagate_inbounds function get_node(
+    field::Fields.Field,
+    ij::CartesianIndex{1},
+    slabidx,
+)
+    i, = Tuple(ij)
+    if field isa Fields.FaceExtrudedFiniteDifferenceField
+        v = slabidx.v + half
+    else
+        v = slabidx.v
+    end
+    h = slabidx.h
+    Fields.field_values(field)[i, nothing, nothing, v, h]
 end
-
-Base.@propagate_inbounds function get_node(bc::Base.Broadcast.Broadcasted, i)
-    bc.f(node_args(bc.args, i)...)
+Base.@propagate_inbounds function get_node(
+    field::Fields.Field,
+    ij::CartesianIndex{2},
+    slabidx,
+)
+    i, j = Tuple(ij)
+    if field isa Fields.FaceExtrudedFiniteDifferenceField
+        v = slabidx.v + half
+    else
+        v = slabidx.v
+    end
+    h = slabidx.h
+    Fields.field_values(field)[i, j, nothing, v, h]
 end
 
-Base.@propagate_inbounds set_node!(field::Fields.SlabField1D, i, val) =
-    setindex!(Fields.field_values(field), val, i)
 
-# 2D get/set node
-Base.@propagate_inbounds function get_node(v::MMatrix, i, j)
-    v[i, j]
-end
 
-Base.@propagate_inbounds function get_node(v::SMatrix, i, j)
-    v[i, j]
+Base.@propagate_inbounds function get_node(
+    bc::Base.Broadcast.Broadcasted,
+    ij,
+    slabidx,
+)
+    bc.f(_get_node(ij, slabidx, bc.args...)...)
 end
-
-Base.@propagate_inbounds function get_node(scalar, i, j)
-    scalar[]
+Base.@propagate_inbounds function get_node(
+    data::Union{DataLayouts.IJF, DataLayouts.IF},
+    ij,
+    slabidx,
+)
+    data[ij]
+end
+Base.@propagate_inbounds function get_node(
+    data::StaticArrays.SArray,
+    ij,
+    slabidx,
+)
+    data[ij]
 end
 
-Base.@propagate_inbounds function get_node(field::Fields.SlabField2D, i, j)
-    getindex(Fields.field_values(field), i, j)
+dont_limit = (args...) -> true
+for m in methods(get_node)
+    m.recursion_relation = dont_limit
 end
 
-Base.@propagate_inbounds function get_node(bc::Base.Broadcast.Broadcasted, i, j)
-    bc.f(node_args(bc.args, i, j)...)
+Base.@propagate_inbounds function get_local_geometry(
+    space::Spaces.AbstractSpectralElementSpace,
+    ij::CartesianIndex{1},
+    slabidx,
+)
+    i, = Tuple(ij)
+    h = slabidx.h
+    if space isa Spaces.FaceExtrudedFiniteDifferenceSpace
+        v = slabidx.v + half
+    else
+        v = slabidx.v
+    end
+    Spaces.local_geometry_data(space)[i, j, nothing, v, h]
+end
+Base.@propagate_inbounds function get_local_geometry(
+    space::Spaces.AbstractSpectralElementSpace,
+    ij::CartesianIndex{2},
+    slabidx,
+)
+    i, j = Tuple(ij)
+    h = slabidx.h
+    if space isa Spaces.FaceExtrudedFiniteDifferenceSpace
+        v = slabidx.v + half
+    else
+        v = slabidx.v
+    end
+    Spaces.local_geometry_data(space)[i, j, nothing, v, h]
 end
 
 Base.@propagate_inbounds function set_node!(
-    field::Fields.SlabField2D,
-    i,
-    j,
+    field::Fields.Field,
+    ij::CartesianIndex{1},
+    slabidx,
     val,
 )
-    setindex!(Fields.field_values(field), val, i, j)
+    i, = Tuple(ij)
+    if field isa Fields.FaceExtrudedFiniteDifferenceField
+        v = slabidx.v + half
+    else
+        v = slabidx.v
+    end
+    h = slabidx.h
+    Fields.field_values(field)[i, nothing, nothing, v, h] = val
 end
-
-# Broadcast recursive machinery
-@inline _apply_slab_args(args::Tuple, inds...) = (
-    _apply_slab(args[1], inds...),
-    _apply_slab_args(Base.tail(args), inds...)...,
-)
-@inline _apply_slab_args(args::Tuple{Any}, inds...) =
-    (_apply_slab(args[1], inds...),)
-@inline _apply_slab_args(args::Tuple{}, inds...) = ()
-
-@inline _apply_slab(x, inds...) = x
-@inline _apply_slab(sbc::SpectralBroadcasted, inds...) = apply_slab(
-    sbc.op,
-    slab(axes(sbc), inds...),
-    slab(input_space(sbc), inds...),
-    _apply_slab_args(sbc.args, inds...)...,
-)
-@inline _apply_slab(
-    bc::Base.Broadcast.Broadcasted{CompositeSpectralStyle},
-    inds...,
-) = Base.Broadcast.Broadcasted{CompositeSpectralStyle}(
-    bc.f,
-    _apply_slab_args(bc.args, inds...),
-    bc.axes,
+Base.@propagate_inbounds function set_node!(
+    field::Fields.Field,
+    ij::CartesianIndex{2},
+    slabidx,
+    val,
 )
+    i, j = Tuple(ij)
+    if field isa Fields.FaceExtrudedFiniteDifferenceField
+        v = slabidx.v + half
+    else
+        v = slabidx.v
+    end
+    h = slabidx.h
+    Fields.field_values(field)[i, j, nothing, v, h] = val
+end
+
 
 function Base.Broadcast.BroadcastStyle(
     ::Type{SB},
@@ -366,6 +378,10 @@ function Base.Broadcast.BroadcastStyle(
 end
 
 
+
+
+
+
 """
     div = Divergence()
     div.(u)
@@ -401,50 +417,53 @@ Divergence{()}(space) = Divergence{operator_axes(space)}()
 operator_return_eltype(op::Divergence{I}, ::Type{S}) where {I, S} =
     RecursiveApply.rmaptype(Geometry.divergence_result_type, S)
 
-function apply_slab(op::Divergence{(1,)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::Divergence{(1,)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MVector{Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        v = get_node(arg, ij, slabidx)
         Jv¹ =
             local_geometry.J ⊠ RecursiveApply.rmap(
                 v -> Geometry.contravariant1(v, local_geometry),
-                get_node(slab_data, i),
+                v,
             )
         for ii in 1:Nq
             out[ii] = out[ii] ⊞ (D[ii, i] ⊠ Jv¹)
         end
     end
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
-        out[i] = out[i] ⊠ local_geometry.invJ
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        out[i] = RecursiveApply.rdiv(out[i], local_geometry.J)
     end
-    return SVector(out)
+    return SArray(out)
 end
 
-function apply_slab(op::Divergence{(1, 2)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::Divergence{(1, 2)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        v = get_node(arg, ij, slabidx)
         Jv¹ =
             local_geometry.J ⊠ RecursiveApply.rmap(
                 v -> Geometry.contravariant1(v, local_geometry),
-                get_node(slab_data, i, j),
+                v,
             )
         for ii in 1:Nq
             out[ii, j] = out[ii, j] ⊞ (D[ii, i] ⊠ Jv¹)
@@ -452,17 +471,18 @@ function apply_slab(op::Divergence{(1, 2)}, slab_space, _, slab_data)
         Jv² =
             local_geometry.J ⊠ RecursiveApply.rmap(
                 v -> Geometry.contravariant2(v, local_geometry),
-                get_node(slab_data, i, j),
+                v,
             )
         for jj in 1:Nq
             out[i, jj] = out[i, jj] ⊞ (D[jj, j] ⊠ Jv²)
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
-        out[i, j] = out[i, j] ⊠ local_geometry.invJ
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        out[i, j] = RecursiveApply.rdiv(out[i, j], local_geometry.J)
     end
-    return SMatrix(out)
+    return SArray(out)
 end
 
 
@@ -510,50 +530,52 @@ WeakDivergence{()}(space) = WeakDivergence{operator_axes(space)}()
 operator_return_eltype(::WeakDivergence{I}, ::Type{S}) where {I, S} =
     RecursiveApply.rmaptype(Geometry.divergence_result_type, S)
 
-function apply_slab(op::WeakDivergence{(1,)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::WeakDivergence{(1,)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MVector{Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        v = get_node(arg, ij, slabidx)
         WJv¹ =
             local_geometry.WJ ⊠ RecursiveApply.rmap(
                 v -> Geometry.contravariant1(v, local_geometry),
-                get_node(slab_data, i),
+                v,
             )
         for ii in 1:Nq
             out[ii] = out[ii] ⊞ (D[i, ii] ⊠ WJv¹)
         end
     end
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
         out[i] = RecursiveApply.rdiv(out[i], ⊟(local_geometry.WJ))
     end
-    return SVector(out)
+    return SArray(out)
 end
 
-function apply_slab(op::WeakDivergence{(1, 2)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
-    RT = operator_return_eltype(op, eltype(slab_data))
-    # allocate temp output
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        v = get_node(arg, ij, slabidx)
         WJv¹ =
             local_geometry.WJ ⊠ RecursiveApply.rmap(
                 v -> Geometry.contravariant1(v, local_geometry),
-                get_node(slab_data, i, j),
+                v,
             )
         for ii in 1:Nq
             out[ii, j] = out[ii, j] ⊞ (D[i, ii] ⊠ WJv¹)
@@ -561,17 +583,18 @@ function apply_slab(op::WeakDivergence{(1, 2)}, slab_space, _, slab_data)
         WJv² =
             local_geometry.WJ ⊠ RecursiveApply.rmap(
                 v -> Geometry.contravariant2(v, local_geometry),
-                get_node(slab_data, i, j),
+                v,
             )
         for jj in 1:Nq
             out[i, jj] = out[i, jj] ⊞ (D[j, jj] ⊠ WJv²)
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
         out[i, j] = RecursiveApply.rdiv(out[i, j], ⊟(local_geometry.WJ))
     end
-    return SMatrix(out)
+    return SArray(out)
 end
 
 """
@@ -603,36 +626,38 @@ Gradient{()}(space) = Gradient{operator_axes(space)}()
 operator_return_eltype(::Gradient{I}, ::Type{S}) where {I, S} =
     RecursiveApply.rmaptype(T -> Geometry.gradient_result_type(Val(I), T), S)
 
-function apply_slab(op::Gradient{(1,)}, slab_space, _, slab_data)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::Gradient{(1,)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MVector{Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for i in 1:Nq
-        x = get_node(slab_data, i)
+        ij = CartesianIndex((i,))
+        x = get_node(arg, ij, slabidx)
         for ii in 1:Nq
             ∂f∂ξ = Geometry.Covariant1Vector(D[ii, i]) ⊗ x
             out[ii] += ∂f∂ξ
         end
     end
-    return SVector(out)
+    return SArray(out)
 end
 
-function apply_slab(op::Gradient{(1, 2)}, slab_space, _, slab_data)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::Gradient{(1, 2)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for j in 1:Nq, i in 1:Nq
-        x = get_node(slab_data, i, j)
+        ij = CartesianIndex((i, j))
+        x = get_node(arg, ij, slabidx)
         for ii in 1:Nq
             ∂f∂ξ₁ = Geometry.Covariant12Vector(D[ii, i], zero(eltype(D))) ⊗ x
             out[ii, j] = out[ii, j] ⊞ ∂f∂ξ₁
@@ -642,7 +667,7 @@ function apply_slab(op::Gradient{(1, 2)}, slab_space, _, slab_data)
             out[i, jj] = out[i, jj] ⊞ ∂f∂ξ₂
         end
     end
-    return SMatrix(out)
+    return SArray(out)
 end
 
 
@@ -687,47 +712,48 @@ WeakGradient{()}(space) = WeakGradient{operator_axes(space)}()
 operator_return_eltype(::WeakGradient{I}, ::Type{S}) where {I, S} =
     RecursiveApply.rmaptype(T -> Geometry.gradient_result_type(Val(I), T), S)
 
-function apply_slab(op::WeakGradient{(1,)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::WeakGradient{(1,)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MVector{Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
-        W = local_geometry.WJ * local_geometry.invJ
-        Wx = W ⊠ get_node(slab_data, i)
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        W = local_geometry.WJ / local_geometry.J
+        Wx = W ⊠ get_node(arg, ij, slabidx)
         for ii in 1:Nq
             Dᵀ₁Wf = Geometry.Covariant1Vector(D[i, ii]) ⊗ Wx
             out[ii] = out[ii] ⊟ Dᵀ₁Wf
         end
     end
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
-        W = local_geometry.WJ * local_geometry.invJ
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        W = local_geometry.WJ / local_geometry.J
         out[i] = RecursiveApply.rdiv(out[i], W)
     end
-    return SVector(out)
+    return SArray(out)
 end
 
-function apply_slab(op::WeakGradient{(1, 2)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
-        W = local_geometry.WJ * local_geometry.invJ
-        Wx = W ⊠ get_node(slab_data, i, j)
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        W = local_geometry.WJ / local_geometry.J
+        Wx = W ⊠ get_node(arg, ij, slabidx)
         for ii in 1:Nq
             Dᵀ₁Wf = Geometry.Covariant12Vector(D[i, ii], zero(eltype(D))) ⊗ Wx
             out[ii, j] = out[ii, j] ⊟ Dᵀ₁Wf
@@ -738,11 +764,12 @@ function apply_slab(op::WeakGradient{(1, 2)}, slab_space, _, slab_data)
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
-        W = local_geometry.WJ * local_geometry.invJ
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        W = local_geometry.WJ / local_geometry.J
         out[i, j] = RecursiveApply.rdiv(out[i, j], W)
     end
-    return SMatrix(out)
+    return SArray(out)
 end
 
 abstract type CurlSpectralElementOperator <: SpectralElementOperator end
@@ -795,22 +822,21 @@ Curl{()}(space) = Curl{operator_axes(space)}()
 operator_return_eltype(::Curl{I}, ::Type{S}) where {I, S} =
     RecursiveApply.rmaptype(T -> Geometry.curl_result_type(Val(I), T), S)
 
-function apply_slab(op::Curl{(1,)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MVector{Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     if RT <: Geometry.Contravariant2Vector
         @inbounds for i in 1:Nq
-            v₃ = Geometry.covariant3(
-                get_node(slab_data, i),
-                slab_local_geometry[i],
-            )
+            ij = CartesianIndex((i,))
+            local_geometry = get_local_geometry(space, ij, slabidx)
+            v = get_node(arg, ij, slabidx)
+            v₃ = Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
                 D₁v₃ = D[ii, i] ⊠ v₃
                 out[ii] = out[ii] ⊞ Geometry.Contravariant2Vector(⊟(D₁v₃))
@@ -820,57 +846,46 @@ function apply_slab(op::Curl{(1,)}, slab_space, _, slab_data)
         error("invalid return type: $RT")
     end
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
-        out[i] = out[i] ⊠ local_geometry.invJ
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        out[i] = RecursiveApply.rdiv(out[i], local_geometry.J)
     end
-    return SVector(out)
+    return SArray(out)
 end
 
-function apply_slab(op::Curl{(1, 2)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     # input data is a Covariant12Vector field
     if RT <: Geometry.Contravariant3Vector
         @inbounds for j in 1:Nq, i in 1:Nq
-            v₁ = Geometry.covariant1(
-                get_node(slab_data, i, j),
-                slab_local_geometry[i, j],
-            )
+            ij = CartesianIndex((i, j))
+            local_geometry = get_local_geometry(space, ij, slabidx)
+            v = get_node(arg, ij, slabidx)
+            v₁ = Geometry.covariant1(v, local_geometry)
             for jj in 1:Nq
                 D₂v₁ = D[jj, j] ⊠ v₁
-                out[i, jj] =
-                    out[i, jj] ⊞ Geometry.Contravariant3Vector(
-                        ⊟(D₂v₁),
-                        slab_local_geometry[i, jj],
-                    )
+                out[i, jj] = out[i, jj] ⊞ Geometry.Contravariant3Vector(⊟(D₂v₁))
             end
-            v₂ = Geometry.covariant2(
-                get_node(slab_data, i, j),
-                slab_local_geometry[i, j],
-            )
+            v₂ = Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
                 D₁v₂ = D[ii, i] ⊠ v₂
-                out[ii, j] =
-                    out[ii, j] ⊞ Geometry.Contravariant3Vector(
-                        D₁v₂,
-                        slab_local_geometry[ii, j],
-                    )
+                out[ii, j] = out[ii, j] ⊞ Geometry.Contravariant3Vector(D₁v₂)
             end
         end
         # input data is a Covariant3Vector field
     elseif RT <: Geometry.Contravariant12Vector
         @inbounds for j in 1:Nq, i in 1:Nq
-            v₃ = Geometry.covariant3(
-                get_node(slab_data, i, j),
-                slab_local_geometry[i, j],
-            )
+            ij = CartesianIndex((i, j))
+            local_geometry = get_local_geometry(space, ij, slabidx)
+            v = get_node(arg, ij, slabidx)
+            v₃ = Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
                 D₁v₃ = D[ii, i] ⊠ v₃
                 out[ii, j] =
@@ -888,10 +903,11 @@ function apply_slab(op::Curl{(1, 2)}, slab_space, _, slab_data)
         error("invalid return type")
     end
     @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
-        out[i, j] = out[i, j] ⊠ local_geometry.invJ
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
+        out[i, j] = RecursiveApply.rdiv(out[i, j], local_geometry.J)
     end
-    return SMatrix(out)
+    return SArray(out)
 end
 
 """
@@ -937,26 +953,23 @@ WeakCurl{()}(space) = WeakCurl{operator_axes(space)}()
 operator_return_eltype(::WeakCurl{I}, ::Type{S}) where {I, S} =
     RecursiveApply.rmaptype(T -> Geometry.curl_result_type(Val(I), T), S)
 
-function apply_slab(op::WeakCurl{(1,)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MVector{Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     # input data is a Covariant3Vector field
     if RT <: Geometry.Contravariant2Vector
         @inbounds for i in 1:Nq
-            local_geometry = slab_local_geometry[i]
-            W = local_geometry.WJ * local_geometry.invJ
-            Wv₃ =
-                W ⊠ Geometry.covariant3(
-                    get_node(slab_data, i),
-                    slab_local_geometry[i],
-                )
+            ij = CartesianIndex((i,))
+            local_geometry = get_local_geometry(space, ij, slabidx)
+            v = get_node(arg, ij, slabidx)
+            W = local_geometry.WJ / local_geometry.J
+            Wv₃ = W ⊠ Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
                 Dᵀ₁Wv₃ = D[i, ii] ⊠ Wv₃
                 out[ii] = out[ii] ⊞ Geometry.Contravariant2Vector(⊟(Dᵀ₁Wv₃))
@@ -966,64 +979,49 @@ function apply_slab(op::WeakCurl{(1,)}, slab_space, _, slab_data)
         error("invalid return type: $RT")
     end
     @inbounds for i in 1:Nq
-        local_geometry = slab_local_geometry[i]
+        ij = CartesianIndex((i,))
+        local_geometry = get_local_geometry(space, ij, slabidx)
         out[i] = RecursiveApply.rdiv(out[i], local_geometry.WJ)
     end
-    return SVector(out)
+    return SArray(out)
 end
 
-function apply_slab(op::WeakCurl{(1, 2)}, slab_space, _, slab_data)
-    slab_local_geometry = Spaces.local_geometry_data(slab_space)
-    FT = Spaces.undertype(slab_space)
-    QS = Spaces.quadrature_style(slab_space)
+function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    RT = operator_return_eltype(op, eltype(slab_data))
+    RT = operator_return_eltype(op, eltype(arg))
     out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
     DataLayouts._mzero!(out, FT)
     # input data is a Covariant12Vector field
     if RT <: Geometry.Contravariant3Vector
         @inbounds for j in 1:Nq, i in 1:Nq
-            local_geometry = slab_local_geometry[i, j]
-            W = local_geometry.WJ * local_geometry.invJ
-            Wv₁ =
-                W ⊠ Geometry.covariant1(
-                    get_node(slab_data, i, j),
-                    slab_local_geometry[i, j],
-                )
+            ij = CartesianIndex((i, j))
+            local_geometry = get_local_geometry(space, ij, slabidx)
+            v = get_node(arg, ij, slabidx)
+            W = local_geometry.WJ / local_geometry.J
+            Wv₁ = W ⊠ Geometry.covariant1(v, local_geometry)
             for jj in 1:Nq
                 Dᵀ₂Wv₁ = D[j, jj] ⊠ Wv₁
-                out[i, jj] =
-                    out[i, jj] ⊞ Geometry.Contravariant3Vector(
-                        Dᵀ₂Wv₁,
-                        slab_local_geometry[i, jj],
-                    )
+                out[i, jj] = out[i, jj] ⊞ Geometry.Contravariant3Vector(Dᵀ₂Wv₁)
             end
-            Wv₂ =
-                W ⊠ Geometry.covariant2(
-                    get_node(slab_data, i, j),
-                    slab_local_geometry[i, j],
-                )
+            Wv₂ = W ⊠ Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
                 Dᵀ₁Wv₂ = D[i, ii] ⊠ Wv₂
                 out[ii, j] =
-                    out[ii, j] ⊞ Geometry.Contravariant3Vector(
-                        ⊟(Dᵀ₁Wv₂),
-                        slab_local_geometry[ii, j],
-                    )
+                    out[ii, j] ⊞ Geometry.Contravariant3Vector(⊟(Dᵀ₁Wv₂))
             end
         end
         # input data is a Covariant3Vector field
     elseif RT <: Geometry.Contravariant12Vector
         @inbounds for j in 1:Nq, i in 1:Nq
-            local_geometry = slab_local_geometry[i, j]
-            W = local_geometry.WJ * local_geometry.invJ
-            Wv₃ =
-                W ⊠ Geometry.covariant3(
-                    get_node(slab_data, i, j),
-                    slab_local_geometry[i, j],
-                )
+            ij = CartesianIndex((i, j))
+            local_geometry = get_local_geometry(space, ij, slabidx)
+            v = get_node(arg, ij, slabidx)
+            W = local_geometry.WJ / local_geometry.J
+            Wv₃ = W ⊠ Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
                 Dᵀ₁Wv₃ = D[i, ii] ⊠ Wv₃
                 out[ii, j] =
@@ -1040,11 +1038,12 @@ function apply_slab(op::WeakCurl{(1, 2)}, slab_space, _, slab_data)
     else
         error("invalid return type")
     end
-    @inbounds for j in 1:Nq, i in 1:Nq
-        local_geometry = slab_local_geometry[i, j]
+    for j in 1:Nq, i in 1:Nq
+        ij = CartesianIndex((i, j))
+        local_geometry = get_local_geometry(space, ij, slabidx)
         out[i, j] = RecursiveApply.rdiv(out[i, j], local_geometry.WJ)
     end
-    return SMatrix(out)
+    return SArray(out)
 end
 
 # interplation / restriction
@@ -1074,55 +1073,54 @@ struct Interpolate{I, S} <: TensorOperator
 end
 Interpolate(space) = Interpolate{operator_axes(space), typeof(space)}(space)
 
-function apply_slab(
-    op::Interpolate{(1,)},
-    slab_space_out,
-    slab_space_in,
-    slab_data,
-)
-    FT = Spaces.undertype(slab_space_out)
-    QS_in = Spaces.quadrature_style(slab_space_in)
-    QS_out = Spaces.quadrature_style(slab_space_out)
+function apply_operator(op::Interpolate{(1,)}, space_out, slabidx, arg)
+    FT = Spaces.undertype(space_out)
+    space_in = axes(arg)
+    QS_in = Spaces.quadrature_style(space_in)
+    QS_out = Spaces.quadrature_style(space_out)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     Imat = Quadratures.interpolation_matrix(FT, QS_out, QS_in)
-    S = eltype(slab_data)
+    S = eltype(arg)
     slab_data_out = MVector{Nq_out, S}(undef)
     @inbounds for i in 1:Nq_out
         # manually inlined rmatmul with slab_getnode
-        r = Imat[i, 1] ⊠ get_node(slab_data, 1)
+        r = Imat[i, 1] ⊠ get_node(arg, CartesianIndex((1,)), slabidx)
         for ii in 2:Nq_in
-            r = RecursiveApply.rmuladd(Imat[i, ii], get_node(slab_data, ii), r)
+            ij = CartesianIndex((ii,))
+            r = RecursiveApply.rmuladd(
+                Imat[i, ii],
+                get_node(arg, ii, slabidx),
+                r,
+            )
         end
         slab_data_out[i] = r
     end
-    return slab_data_out
+    return SArray(slab_data_out)
 end
 
-function apply_slab(
-    op::Interpolate{(1, 2)},
-    slab_space_out,
-    slab_space_in,
-    slab_data,
-)
-    FT = Spaces.undertype(slab_space_out)
-    QS_in = Spaces.quadrature_style(slab_space_in)
-    QS_out = Spaces.quadrature_style(slab_space_out)
+function apply_operator(op::Interpolate{(1, 2)}, space_out, slabidx, arg)
+    FT = Spaces.undertype(space_out)
+    space_in = axes(arg)
+    QS_in = Spaces.quadrature_style(space_in)
+    QS_out = Spaces.quadrature_style(space_out)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     Imat = Quadratures.interpolation_matrix(FT, QS_out, QS_in)
-    S = eltype(slab_data)
+    S = eltype(arg)
     # temporary storage
     temp = MArray{Tuple{Nq_out, Nq_in}, S, 2, Nq_out * Nq_in}(undef)
     slab_data_out = MArray{Tuple{Nq_out, Nq_out}, S, 2, Nq_out * Nq_out}(undef)
     @inbounds for j in 1:Nq_in, i in 1:Nq_out
         # manually inlined rmatmul1 with slab get_node
         # we do this to remove one allocated intermediate array
-        r = Imat[i, 1] ⊠ get_node(slab_data, 1, j)
+        ij = CartesianIndex((1, j))
+        r = Imat[i, 1] ⊠ get_node(arg, ij, slabidx)
         for ii in 2:Nq_in
+            ij = CartesianIndex((ii, j))
             r = RecursiveApply.rmuladd(
                 Imat[i, ii],
-                get_node(slab_data, ii, j),
+                get_node(arg, ij, slabidx),
                 r,
             )
         end
@@ -1163,69 +1161,71 @@ struct Restrict{I, S} <: TensorOperator
 end
 Restrict(space) = Restrict{operator_axes(space), typeof(space)}(space)
 
-function apply_slab(
-    op::Restrict{(1,)},
-    slab_space_out,
-    slab_space_in,
-    slab_data,
-)
-    FT = Spaces.undertype(slab_space_out)
-    QS_in = Spaces.quadrature_style(slab_space_in)
-    QS_out = Spaces.quadrature_style(slab_space_out)
+function apply_operator(op::Restrict{(1,)}, space_out, slabidx, arg)
+    FT = Spaces.undertype(space_out)
+    space_in = axes(arg)
+    QS_in = Spaces.quadrature_style(space_in)
+    QS_out = Spaces.quadrature_style(space_out)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     ImatT = Quadratures.interpolation_matrix(FT, QS_in, QS_out)' # transpose
-    S = eltype(slab_data)
+    S = eltype(arg)
     slab_data_out = MVector{Nq_out, S}(undef)
-    slab_local_geometry_in = Spaces.local_geometry_data(slab_space_in)
-    slab_local_geometry_out = Spaces.local_geometry_data(slab_space_out)
-    WJ_in = slab_local_geometry_in.WJ
-    WJ_out = slab_local_geometry_out.WJ
     @inbounds for i in 1:Nq_out
         # manually inlined rmatmul with slab get_node
-        r = ImatT[i, 1] ⊠ (WJ_in[1] ⊠ get_node(slab_data, 1))
+        ij = CartesianIndex((1,))
+        WJ = get_local_geometry(space_in, ij, slabidx).WJ
+        r = ImatT[i, 1] ⊠ (WJ ⊠ get_node(arg, ij, slabidx))
         for ii in 2:Nq_in
-            WJ_node = WJ_in[ii] ⊠ get_node(slab_data, ii)
-            r = RecursiveApply.rmuladd(ImatT[i, ii], WJ_node, r)
+            ij = CartesianIndex((ii,))
+            WJ = get_local_geometry(space_in, ij, slabidx).WJ
+            r = RecursiveApply.rmuladd(
+                ImatT[i, ii],
+                WJ ⊠ get_node(arg, ij, slabidx),
+                r,
+            )
         end
-        slab_data_out[i] = RecursiveApply.rdiv(r, WJ_out[i])
+        ij_out = CartesianIndex((i,))
+        WJ_out = get_local_geometry(space_out, ij_out, slabidx).WJ
+        slab_data_out[i] = RecursiveApply.rdiv(r, WJ_out)
     end
     return slab_data_out
 end
 
-function apply_slab(
-    op::Restrict{(1, 2)},
-    slab_space_out,
-    slab_space_in,
-    slab_data,
-)
-    FT = Spaces.undertype(slab_space_out)
-    QS_in = Spaces.quadrature_style(slab_space_in)
-    QS_out = Spaces.quadrature_style(slab_space_out)
+function apply_operator(op::Restrict{(1, 2)}, space_out, slabidx, arg)
+    FT = Spaces.undertype(space_out)
+    space_in = axes(arg)
+    QS_in = Spaces.quadrature_style(space_in)
+    QS_out = Spaces.quadrature_style(space_out)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     ImatT = Quadratures.interpolation_matrix(FT, QS_in, QS_out)' # transpose
-    S = eltype(slab_data)
+    S = eltype(arg)
     # temporary storage
     temp = MArray{Tuple{Nq_out, Nq_in}, S, 2, Nq_out * Nq_in}(undef)
     slab_data_out = MArray{Tuple{Nq_out, Nq_out}, S, 2, Nq_out * Nq_out}(undef)
-    slab_local_geometry_in = Spaces.local_geometry_data(slab_space_in)
-    slab_local_geometry_out = Spaces.local_geometry_data(slab_space_out)
-    WJ_in = slab_local_geometry_in.WJ
-    WJ_out = slab_local_geometry_out.WJ
     @inbounds for j in 1:Nq_in, i in 1:Nq_out
         # manually inlined rmatmul1 with slab get_node
-        r = ImatT[i, 1] ⊠ (WJ_in[1, j] ⊠ get_node(slab_data, 1, j))
+        ij = CartesianIndex((1, j))
+        WJ = get_local_geometry(space_in, ij, slabidx).WJ
+        r = ImatT[i, 1] ⊠ (WJ ⊠ get_node(arg, ij, slabidx))
         for ii in 2:Nq_in
-            WJ_node = WJ_in[ii, j] ⊠ get_node(slab_data, ii, j)
-            r = RecursiveApply.rmuladd(ImatT[i, ii], WJ_node, r)
+            ij = CartesianIndex((ii, j))
+            WJ = get_local_geometry(space_in, ij, slabidx).WJ
+            r = RecursiveApply.rmuladd(
+                ImatT[i, ii],
+                WJ ⊠ get_node(arg, ij, slabidx),
+                r,
+            )
         end
         temp[i, j] = r
     end
     @inbounds for j in 1:Nq_out, i in 1:Nq_out
+        ij_out = CartesianIndex((i, j))
+        WJ_out = get_local_geometry(space_out, ij_out, slabidx).WJ
         slab_data_out[i, j] = RecursiveApply.rdiv(
             RecursiveApply.rmatmul2(ImatT, temp, i, j),
-            WJ_out[i, j],
+            WJ_out,
         )
     end
     return SMatrix(slab_data_out)
diff --git a/test/DataLayouts/data2d.jl b/test/DataLayouts/data2d.jl
index 09ea858320..10575e29f1 100644
--- a/test/DataLayouts/data2d.jl
+++ b/test/DataLayouts/data2d.jl
@@ -31,10 +31,10 @@ end
 @testset "get_struct / set_struct!" begin
     array = [1.0, 2.0, 3.0]
     S = Tuple{Complex{Float64}, Float64}
-    @test get_struct(array, S) == (1.0 + 2.0im, 3.0)
-    set_struct!(array, (4.0 + 2.0im, 6.0))
+    @test get_struct(array, S, Val(1), CartesianIndex(1)) == (1.0 + 2.0im, 3.0)
+    set_struct!(array, (4.0 + 2.0im, 6.0), Val(1), CartesianIndex(1))
     @test array == [4.0, 2.0, 6.0]
-    @test get_struct(array, S) == (4.0 + 2.0im, 6.0)
+    @test get_struct(array, S, Val(1), CartesianIndex(1)) == (4.0 + 2.0im, 6.0)
 end
 
 @testset "IJFH" begin

From 3a647f8a385e57e8b02676beecb7e8e0d8dcf279 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Mon, 23 Jan 2023 17:43:55 -0800
Subject: [PATCH 02/16] kill input_space

---
 src/Operators/spectralelement.jl | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index c4e99f5b0a..473a4e96e7 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -55,33 +55,23 @@ This is similar to a `Base.Broadcast.Broadcasted` object, except it contains spa
 
 This is returned by `Base.Broadcast.broadcasted(op::SpectralElementOperator)`.
 """
-struct SpectralBroadcasted{Style, Op, Args, Axes, InputSpace} <:
+struct SpectralBroadcasted{Style, Op, Args, Axes} <:
        Base.AbstractBroadcasted
     op::Op
     args::Args
     axes::Axes
-    input_space::InputSpace
 end
 SpectralBroadcasted{Style}(
     op::Op,
     args::Args,
     axes::Axes = nothing,
-    input_space::InputSpace = nothing,
-) where {Style, Op, Args, Axes, InputSpace} =
-    SpectralBroadcasted{Style, Op, Args, Axes, InputSpace}(
+) where {Style, Op, Args, Axes} =
+    SpectralBroadcasted{Style, Op, Args, Axes}(
         op,
         args,
         axes,
-        input_space,
     )
 
-input_space(arg) = axes(arg)
-input_space(::SpectralElementOperator, space) = space
-
-input_space(sbc::SpectralBroadcasted) =
-    isnothing(sbc.input_space) ?
-    input_space(sbc.op, tuplemap(axes, sbc.args)...) : sbc.input_space
-
 return_space(::SpectralElementOperator, space) = space
 
 Base.axes(sbc::SpectralBroadcasted) =
@@ -119,13 +109,8 @@ function Base.Broadcast.instantiate(
         axes = sbc.axes
         Base.Broadcast.check_broadcast_axes(axes, args...)
     end
-    if sbc.input_space isa Nothing
-        inspace = input_space(sbc)
-    else
-        inspace = sbc.input_space
-    end
     op = typeof(op)(axes)
-    return SpectralBroadcasted{Style}(op, args, axes, inspace)
+    return SpectralBroadcasted{Style}(op, args, axes)
 end
 
 function Base.Broadcast.instantiate(
@@ -1049,7 +1034,6 @@ end
 # interplation / restriction
 abstract type TensorOperator <: SpectralElementOperator end
 
-input_space(op::TensorOperator, inspace) = inspace
 return_space(op::TensorOperator, inspace) = op.space
 operator_return_eltype(::TensorOperator, ::Type{S}) where {S} = S
 

From ea6941a889968c2b90ba6dff22ac774ff2688bfa Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Tue, 24 Jan 2023 16:04:18 -0800
Subject: [PATCH 03/16] switch to using Fields as temporaries

---
 src/DataLayouts/DataLayouts.jl   |  30 ++++++-
 src/Operators/spectralelement.jl | 139 ++++++++++++++++---------------
 2 files changed, 101 insertions(+), 68 deletions(-)

diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl
index dbec7a5a16..4ab7b8b528 100644
--- a/src/DataLayouts/DataLayouts.jl
+++ b/src/DataLayouts/DataLayouts.jl
@@ -15,7 +15,7 @@ indexes the underlying array as `[i,j,k,f,v,h]`
 module DataLayouts
 
 import Base: Base, @propagate_inbounds
-import StaticArrays: SOneTo, MArray
+import StaticArrays: SOneTo, MArray, SArray
 import ClimaComms
 
 import ..enable_threading, ..slab, ..slab_args, ..column, ..column_args, ..level
@@ -628,6 +628,14 @@ end
 
 rebuild(data::IJF{S, Nij}, array::A) where {S, Nij, A <: AbstractArray} =
     IJF{S, Nij}(array)
+function IJF{S, Nij}(::Type{MArray}, ::Type{T}) where {S, Nij, T}
+    Nf = typesize(T, S)
+    array = MArray{Tuple{Nij, Nij, Nf}, T, 3, Nij * Nij * Nf}(undef)
+    IJF{S, Nij}(array)
+end
+function SArray(ijf::IJF{S, Nij, <:MArray}) where {S, Nij}
+    IJF{S, Nij}(SArray(parent(ijf)))
+end
 
 function replace_basetype(data::IJF{S, Nij}, ::Type{T}) where {S, Nij, T}
     array = parent(data)
@@ -673,6 +681,9 @@ end
     data::IJF{S, Nij},
     i::Integer,
     j::Integer,
+    k = nothing,
+    v = nothing,
+    h = nothing,
 ) where {S, Nij}
     @boundscheck (1 <= i <= Nij && 1 <= j <= Nij) ||
                  throw(BoundsError(data, (i, j)))
@@ -752,6 +763,14 @@ function IF{S, Ni}(array::AbstractArray{T, 2}) where {S, Ni, T}
     check_basetype(T, S)
     IF{S, Ni, typeof(array)}(array)
 end
+function IF{S, Ni}(::Type{MArray}, ::Type{T}) where {S, Ni, T}
+    Nf = typesize(T, S)
+    array = MArray{Tuple{Ni, Nf}, T, 2, Ni * Nf}(undef)
+    IF{S, Ni}(array)
+end
+function SArray(data::IF{S, Ni, <:MArray}) where {S, Ni}
+    IF{S, Ni}(SArray(parent(data)))
+end
 
 function replace_basetype(data::IF{S, Ni}, ::Type{T}) where {S, Ni, T}
     array = parent(data)
@@ -789,7 +808,14 @@ end
     IF{SS, Ni}(dataview)
 end
 
-@inline function Base.getindex(data::IF{S, Ni}, i::Integer) where {S, Ni}
+@inline function Base.getindex(
+    data::IF{S, Ni},
+    i::Integer,
+    j = nothing,
+    k = nothing,
+    v = nothing,
+    h = nothing,
+) where {S, Ni}
     @boundscheck (1 <= i <= Ni) || throw(BoundsError(data, (i,)))
     @inbounds get_struct(parent(data), S, Val(2), CartesianIndex(i, 1))
 end
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index 473a4e96e7..80f547f8f8 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -55,8 +55,7 @@ This is similar to a `Base.Broadcast.Broadcasted` object, except it contains spa
 
 This is returned by `Base.Broadcast.broadcasted(op::SpectralElementOperator)`.
 """
-struct SpectralBroadcasted{Style, Op, Args, Axes} <:
-       Base.AbstractBroadcasted
+struct SpectralBroadcasted{Style, Op, Args, Axes} <: Base.AbstractBroadcasted
     op::Op
     args::Args
     axes::Axes
@@ -66,11 +65,7 @@ SpectralBroadcasted{Style}(
     args::Args,
     axes::Axes = nothing,
 ) where {Style, Op, Args, Axes} =
-    SpectralBroadcasted{Style, Op, Args, Axes}(
-        op,
-        args,
-        axes,
-    )
+    SpectralBroadcasted{Style, Op, Args, Axes}(op, args, axes)
 
 return_space(::SpectralElementOperator, space) = space
 
@@ -289,7 +284,10 @@ for m in methods(get_node)
 end
 
 Base.@propagate_inbounds function get_local_geometry(
-    space::Spaces.AbstractSpectralElementSpace,
+    space::Union{
+        Spaces.AbstractSpectralElementSpace,
+        Spaces.ExtrudedFiniteDifferenceSpace,
+    },
     ij::CartesianIndex{1},
     slabidx,
 )
@@ -300,10 +298,13 @@ Base.@propagate_inbounds function get_local_geometry(
     else
         v = slabidx.v
     end
-    Spaces.local_geometry_data(space)[i, j, nothing, v, h]
+    Spaces.local_geometry_data(space)[i, nothing, nothing, v, h]
 end
 Base.@propagate_inbounds function get_local_geometry(
-    space::Spaces.AbstractSpectralElementSpace,
+    space::Union{
+        Spaces.AbstractSpectralElementSpace,
+        Spaces.ExtrudedFiniteDifferenceSpace,
+    },
     ij::CartesianIndex{2},
     slabidx,
 )
@@ -409,8 +410,8 @@ function apply_operator(op::Divergence{(1,)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MVector{Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = IF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
@@ -429,7 +430,7 @@ function apply_operator(op::Divergence{(1,)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i] = RecursiveApply.rdiv(out[i], local_geometry.J)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 function apply_operator(op::Divergence{(1, 2)}, space, slabidx, arg)
@@ -439,8 +440,8 @@ function apply_operator(op::Divergence{(1, 2)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IJF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
@@ -467,7 +468,7 @@ function apply_operator(op::Divergence{(1, 2)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i, j] = RecursiveApply.rdiv(out[i, j], local_geometry.J)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 
@@ -522,8 +523,8 @@ function apply_operator(op::WeakDivergence{(1,)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MVector{Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
@@ -542,7 +543,7 @@ function apply_operator(op::WeakDivergence{(1,)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i] = RecursiveApply.rdiv(out[i], ⊟(local_geometry.WJ))
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
@@ -551,8 +552,9 @@ function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IJF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
+
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
@@ -579,7 +581,7 @@ function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i, j] = RecursiveApply.rdiv(out[i, j], ⊟(local_geometry.WJ))
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 """
@@ -618,8 +620,8 @@ function apply_operator(op::Gradient{(1,)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MVector{Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         x = get_node(arg, ij, slabidx)
@@ -628,7 +630,7 @@ function apply_operator(op::Gradient{(1,)}, space, slabidx, arg)
             out[ii] += ∂f∂ξ
         end
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 function apply_operator(op::Gradient{(1, 2)}, space, slabidx, arg)
@@ -638,8 +640,9 @@ function apply_operator(op::Gradient{(1, 2)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IJF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
+
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         x = get_node(arg, ij, slabidx)
@@ -652,7 +655,7 @@ function apply_operator(op::Gradient{(1, 2)}, space, slabidx, arg)
             out[i, jj] = out[i, jj] ⊞ ∂f∂ξ₂
         end
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 
@@ -704,8 +707,8 @@ function apply_operator(op::WeakGradient{(1,)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MVector{Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
@@ -722,7 +725,7 @@ function apply_operator(op::WeakGradient{(1,)}, space, slabidx, arg)
         W = local_geometry.WJ / local_geometry.J
         out[i] = RecursiveApply.rdiv(out[i], W)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
@@ -732,8 +735,9 @@ function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IJF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
+
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
@@ -754,7 +758,7 @@ function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
         W = local_geometry.WJ / local_geometry.J
         out[i, j] = RecursiveApply.rdiv(out[i, j], W)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 abstract type CurlSpectralElementOperator <: SpectralElementOperator end
@@ -814,8 +818,8 @@ function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MVector{Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     if RT <: Geometry.Contravariant2Vector
         @inbounds for i in 1:Nq
             ij = CartesianIndex((i,))
@@ -835,7 +839,7 @@ function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i] = RecursiveApply.rdiv(out[i], local_geometry.J)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
@@ -845,8 +849,9 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IJF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
+
     # input data is a Covariant12Vector field
     if RT <: Geometry.Contravariant3Vector
         @inbounds for j in 1:Nq, i in 1:Nq
@@ -892,7 +897,7 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i, j] = RecursiveApply.rdiv(out[i, j], local_geometry.J)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 """
@@ -945,8 +950,8 @@ function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MVector{Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
     # input data is a Covariant3Vector field
     if RT <: Geometry.Contravariant2Vector
         @inbounds for i in 1:Nq
@@ -968,7 +973,7 @@ function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i] = RecursiveApply.rdiv(out[i], local_geometry.WJ)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
@@ -978,8 +983,9 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
     RT = operator_return_eltype(op, eltype(arg))
-    out = StaticArrays.MMatrix{Nq, Nq, RT}(undef)
-    DataLayouts._mzero!(out, FT)
+    out = DataLayouts.IJF{RT, Nq}(MArray, FT)
+    fill!(parent(out), zero(FT))
+
     # input data is a Covariant12Vector field
     if RT <: Geometry.Contravariant3Vector
         @inbounds for j in 1:Nq, i in 1:Nq
@@ -1028,7 +1034,7 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
         local_geometry = get_local_geometry(space, ij, slabidx)
         out[i, j] = RecursiveApply.rdiv(out[i, j], local_geometry.WJ)
     end
-    return SArray(out)
+    return Field(SArray(out), space)
 end
 
 # interplation / restriction
@@ -1065,22 +1071,23 @@ function apply_operator(op::Interpolate{(1,)}, space_out, slabidx, arg)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     Imat = Quadratures.interpolation_matrix(FT, QS_out, QS_in)
-    S = eltype(arg)
-    slab_data_out = MVector{Nq_out, S}(undef)
+    RT = eltype(arg)
+    out = DataLayouts.IF{RT, Nq_out}(MArray, FT)
     @inbounds for i in 1:Nq_out
         # manually inlined rmatmul with slab_getnode
-        r = Imat[i, 1] ⊠ get_node(arg, CartesianIndex((1,)), slabidx)
+        ij = CartesianIndex((1,))
+        r = Imat[i, 1] ⊠ get_node(arg, ij, slabidx)
         for ii in 2:Nq_in
             ij = CartesianIndex((ii,))
             r = RecursiveApply.rmuladd(
                 Imat[i, ii],
-                get_node(arg, ii, slabidx),
+                get_node(arg, ij, slabidx),
                 r,
             )
         end
-        slab_data_out[i] = r
+        out[i] = r
     end
-    return SArray(slab_data_out)
+    return Field(SArray(out), space_out)
 end
 
 function apply_operator(op::Interpolate{(1, 2)}, space_out, slabidx, arg)
@@ -1091,10 +1098,10 @@ function apply_operator(op::Interpolate{(1, 2)}, space_out, slabidx, arg)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     Imat = Quadratures.interpolation_matrix(FT, QS_out, QS_in)
-    S = eltype(arg)
+    RT = eltype(arg)
     # temporary storage
-    temp = MArray{Tuple{Nq_out, Nq_in}, S, 2, Nq_out * Nq_in}(undef)
-    slab_data_out = MArray{Tuple{Nq_out, Nq_out}, S, 2, Nq_out * Nq_out}(undef)
+    temp = DataLayouts.IJF{RT, max(Nq_in, Nq_out)}(MArray, FT)
+    out = DataLayouts.IJF{RT, Nq_out}(MArray, FT)
     @inbounds for j in 1:Nq_in, i in 1:Nq_out
         # manually inlined rmatmul1 with slab get_node
         # we do this to remove one allocated intermediate array
@@ -1111,9 +1118,9 @@ function apply_operator(op::Interpolate{(1, 2)}, space_out, slabidx, arg)
         temp[i, j] = r
     end
     @inbounds for j in 1:Nq_out, i in 1:Nq_out
-        slab_data_out[i, j] = RecursiveApply.rmatmul2(Imat, temp, i, j)
+        out[i, j] = RecursiveApply.rmatmul2(Imat, temp, i, j)
     end
-    return SMatrix(slab_data_out)
+    return Field(SArray(out), space_out)
 end
 
 
@@ -1153,8 +1160,8 @@ function apply_operator(op::Restrict{(1,)}, space_out, slabidx, arg)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     ImatT = Quadratures.interpolation_matrix(FT, QS_in, QS_out)' # transpose
-    S = eltype(arg)
-    slab_data_out = MVector{Nq_out, S}(undef)
+    RT = eltype(arg)
+    out = IF{RT, Nq_out}(MArray, FT)
     @inbounds for i in 1:Nq_out
         # manually inlined rmatmul with slab get_node
         ij = CartesianIndex((1,))
@@ -1171,9 +1178,9 @@ function apply_operator(op::Restrict{(1,)}, space_out, slabidx, arg)
         end
         ij_out = CartesianIndex((i,))
         WJ_out = get_local_geometry(space_out, ij_out, slabidx).WJ
-        slab_data_out[i] = RecursiveApply.rdiv(r, WJ_out)
+        out[i] = RecursiveApply.rdiv(r, WJ_out)
     end
-    return slab_data_out
+    return Field(SArray(out), space_out)
 end
 
 function apply_operator(op::Restrict{(1, 2)}, space_out, slabidx, arg)
@@ -1184,10 +1191,10 @@ function apply_operator(op::Restrict{(1, 2)}, space_out, slabidx, arg)
     Nq_in = Quadratures.degrees_of_freedom(QS_in)
     Nq_out = Quadratures.degrees_of_freedom(QS_out)
     ImatT = Quadratures.interpolation_matrix(FT, QS_in, QS_out)' # transpose
-    S = eltype(arg)
+    RT = eltype(arg)
     # temporary storage
-    temp = MArray{Tuple{Nq_out, Nq_in}, S, 2, Nq_out * Nq_in}(undef)
-    slab_data_out = MArray{Tuple{Nq_out, Nq_out}, S, 2, Nq_out * Nq_out}(undef)
+    temp = DataLayouts.IJF{RT, max(Nq_in, Nq_out)}(MArray, FT)
+    out = DataLayouts.IJF{RT, Nq_out}(MArray, FT)
     @inbounds for j in 1:Nq_in, i in 1:Nq_out
         # manually inlined rmatmul1 with slab get_node
         ij = CartesianIndex((1, j))
@@ -1207,12 +1214,12 @@ function apply_operator(op::Restrict{(1, 2)}, space_out, slabidx, arg)
     @inbounds for j in 1:Nq_out, i in 1:Nq_out
         ij_out = CartesianIndex((i, j))
         WJ_out = get_local_geometry(space_out, ij_out, slabidx).WJ
-        slab_data_out[i, j] = RecursiveApply.rdiv(
+        out[i, j] = RecursiveApply.rdiv(
             RecursiveApply.rmatmul2(ImatT, temp, i, j),
             WJ_out,
         )
     end
-    return SMatrix(slab_data_out)
+    return Field(SArray(out), space_out)
 end
 
 

From 05c69c1d631d3465078899f51fbe0b2366fabee5 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 25 Jan 2023 12:10:43 -0800
Subject: [PATCH 04/16] add doc

---
 src/Operators/spectralelement.jl | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index 80f547f8f8..e90b6e5244 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -190,7 +190,14 @@ end
     resolve_operator(bc, slabidx)
 
 Recursively evaluate any operators in `bc` at `slabidx`, replacing any
-`SpectralBroadcasted` objects with an `IF`/`IJF` object.
+`SpectralBroadcasted` objects.
+
+- if `bc` is a regular `Broadcasted` object, return a new `Broadcasted` with `resolve_operator` called on each `arg`
+- if `bc` is a regular `SpectralBroadcasted` object:
+ - call `resolve_operator` called on each `arg`
+ - call `apply_operator`, returning the resulting "pseudo Field":  a `Field` with an
+ `IF`/`IJF` data object.
+- if `bc` is a `Field`, return that
 """
 function resolve_operator(bc::SpectralBroadcasted, slabidx)
     args = _resolve_operator_args(slabidx, bc.args...)

From b2d25f9f18ea04fc0f66db26ab771573622585ae Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 26 Jan 2023 16:13:08 -0800
Subject: [PATCH 05/16] WIP: CUDA

---
 src/Device.jl                                 |   5 +
 src/Fields/Fields.jl                          |   6 +
 src/Operators/Operators.jl                    |   3 +-
 src/Operators/spectralelement.jl              |  98 ++++-
 src/Spaces/Spaces.jl                          |   2 +-
 src/Spaces/spectralelement.jl                 |  17 +
 src/Topologies/Topologies.jl                  |   1 +
 .../spectralelement/rectilinear_cuda.jl       | 369 ++++++++++++++++++
 8 files changed, 494 insertions(+), 7 deletions(-)
 create mode 100644 test/Operators/spectralelement/rectilinear_cuda.jl

diff --git a/src/Device.jl b/src/Device.jl
index 980f8f0f1a..9d7507313f 100644
--- a/src/Device.jl
+++ b/src/Device.jl
@@ -8,4 +8,9 @@ device(; disablegpu = false) =
 device_array_type(::ClimaComms.CPU) = Array
 device_array_type(::ClimaComms.CUDA) = CUDA.CuArray
 
+device(ctx::ClimaComms.SingletonCommsContext) = ctx.device
+
+device(::Array) = ClimaComms.CPU()
+device(::CUDA.CuArray) = ClimaComms.CUDA()
+
 end
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index 479b0b2d7c..de611d246e 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -12,6 +12,7 @@ import ..Utilities: PlusHalf, half
 
 using ..RecursiveApply
 using ClimaComms
+import Adapt
 
 import StaticArrays, LinearAlgebra, Statistics, InteractiveUtils
 
@@ -48,6 +49,11 @@ comm_context(topology::Topologies.Topology2D) = topology.context
 comm_context(topology::T) where {T <: Topologies.AbstractTopology} =
     ClimaComms.SingletonCommsContext()
 
+Adapt.adapt_structure(to, field::Field) =
+    Field(
+        Adapt.adapt_structure(to, Fields.field_values(field)),
+        Adapt.adapt_structure(to, axes(field)),
+    )
 
 
 # Point Field
diff --git a/src/Operators/Operators.jl b/src/Operators/Operators.jl
index 8ed202c589..bc6bc721b0 100644
--- a/src/Operators/Operators.jl
+++ b/src/Operators/Operators.jl
@@ -2,9 +2,10 @@ module Operators
 
 import LinearAlgebra
 
-using StaticArrays
+using StaticArrays, CUDA
 
 import ..enable_threading, ..slab, ..slab_args, ..column, ..column_args
+import ..Device
 import ..DataLayouts: DataLayouts, Data2D, DataSlab2D
 import ..Geometry: Geometry, Covariant12Vector, Contravariant12Vector, ⊗
 import ..Spaces: Spaces, Quadratures, AbstractSpace
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index e90b6e5244..649e0a3f9c 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -4,6 +4,12 @@ struct SpectralStyle <: AbstractSpectralStyle end
 
 struct CompositeSpectralStyle <: AbstractSpectralStyle end
 
+struct CUDASpectralStyle <: AbstractSpectralStyle end
+
+import ClimaComms
+AbstractSpectralStyle(::ClimaComms.CPU) = SpectralStyle
+AbstractSpectralStyle(::ClimaComms.CUDA) = CUDASpectralStyle
+
 """
     SpectralElementOperator
 
@@ -55,17 +61,19 @@ This is similar to a `Base.Broadcast.Broadcasted` object, except it contains spa
 
 This is returned by `Base.Broadcast.broadcasted(op::SpectralElementOperator)`.
 """
-struct SpectralBroadcasted{Style, Op, Args, Axes} <: Base.AbstractBroadcasted
+struct SpectralBroadcasted{Style, Op, Args, Axes, Work} <: Base.AbstractBroadcasted
     op::Op
     args::Args
     axes::Axes
+    work::Work
 end
 SpectralBroadcasted{Style}(
     op::Op,
     args::Args,
     axes::Axes = nothing,
-) where {Style, Op, Args, Axes} =
-    SpectralBroadcasted{Style, Op, Args, Axes}(op, args, axes)
+    work::Work = nothing,
+) where {Style, Op, Args, Axes, Work} =
+    SpectralBroadcasted{Style, Op, Args, Axes, Work}(op, args, axes, work)
 
 return_space(::SpectralElementOperator, space) = space
 
@@ -92,8 +100,8 @@ Base.eltype(sbc::SpectralBroadcasted) =
 @inline instantiate_args(::Tuple{}) = ()
 
 function Base.Broadcast.instantiate(
-    sbc::SpectralBroadcasted{Style},
-) where {Style}
+    sbc::SpectralBroadcasted,
+)
     op = sbc.op
     # recursively instantiate the arguments to allocate intermediate work arrays
     args = instantiate_args(sbc.args)
@@ -105,9 +113,21 @@ function Base.Broadcast.instantiate(
         Base.Broadcast.check_broadcast_axes(axes, args...)
     end
     op = typeof(op)(axes)
+    Style = AbstractSpectralStyle(Device.device(axes))
     return SpectralBroadcasted{Style}(op, args, axes)
 end
 
+import Adapt
+
+Adapt.adapt_structure(to, sbc::SpectralBroadcasted{Style}) where {Style} =
+    SpectralBroadcasted{Style}(
+        sbc.op,
+        map(arg -> Adapt.adapt_structure(to, arg), sbc.args),
+        Adapt.adapt_structure(to, sbc.axes),
+    )
+
+
+
 function Base.Broadcast.instantiate(
     bc::Base.Broadcast.Broadcasted{Style},
 ) where {Style <: AbstractSpectralStyle}
@@ -155,6 +175,39 @@ function Base.copyto!(out::Field, sbc::SpectralBroadcasted)
     return out
 end
 
+function Base.copyto!(out::Field, sbc::SpectralBroadcasted{CUDASpectralStyle})
+    space = axes(out)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    Nh = Topologies.nlocalelems(Spaces.topology(space))
+    Nv = Spaces.nlevels(space)
+
+    @cuda threads=(Nq,Nq) blocks=(Nv,Nh) copyto_kernel!(out, sbc)
+    return out
+end
+
+
+function copyto_kernel!(out, sbc)
+    i = threadIdx().x
+    j = threadIdx().y
+    #if axes(out) isa Spaces.AbstractSpectralElementSpace
+        v = nothing
+    #elseif axes(out) isa Spaces.CenterExtrudedFiniteDifferenceSpace
+    #    v = blockIdx().x
+    #elseif axes(out) isa Spaces.FaceExtrudedFiniteDifferenceSpace
+    #    v = blockIdx().x - half
+    #end
+    h = blockIdx().y
+    ij = CartesianIndex((i,j))
+    slabidx = Fields.SlabIndex{Nothing}(v,h)
+
+    result = apply_operator_kernel(sbc.op, axes(out), ij, slabidx, sbc.args...)
+    set_node!(out, ij, slabidx, result)
+    return nothing
+end
+
+
+
 function Base.Broadcast.materialize!(dest, sbc::SpectralBroadcasted)
     copyto!(dest, Base.Broadcast.instantiate(sbc))
 end
@@ -666,6 +719,41 @@ function apply_operator(op::Gradient{(1, 2)}, space, slabidx, arg)
 end
 
 
+
+function apply_operator_kernel(op::Gradient{(1, 2)}, space, ij, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    D = Quadratures.differentiation_matrix(FT, QS)
+    # allocate temp output
+    IT = eltype(arg)
+    Nf = DataLayouts.typesize(FT, IT)
+    array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    work = IJF{IT, Nq}(array)
+    i,j = ij.I
+    work[i,j] = get_node(arg, ij, slabidx)
+    CUDA.sync_threads()
+
+    ∂f∂ξ₁ = D[i,1] * work[1,j]
+    ∂f∂ξ₂ = D[j,1] * work[i,1]
+    for k = 2:Nq
+        ∂f∂ξ₁ += D[i,k] * work[k,j]
+        ∂f∂ξ₂ += D[j,k] * work[i,k]
+    end
+    return Geometry.Covariant12Vector(∂f∂ξ₁,∂f∂ξ₂)
+end
+
+function allocate_shared(op::Gradient{(1,2)}, arg)
+    S = eltype(arg)
+    space = axes(arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    Nf = DataLayouts.typesize(FT, S)
+    array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    return IJF{S, Nq}(array)
+end
+
 """
     wgrad = WeakGradient()
     wgrad.(f)
diff --git a/src/Spaces/Spaces.jl b/src/Spaces/Spaces.jl
index 2c8e0318b4..ecd6c4f25f 100644
--- a/src/Spaces/Spaces.jl
+++ b/src/Spaces/Spaces.jl
@@ -22,7 +22,7 @@ import ..slab, ..column, ..level
 import ..Utilities: PlusHalf
 import ..DataLayouts, ..Geometry, ..Domains, ..Meshes, ..Topologies
 import ..Device
-using StaticArrays, ForwardDiff, LinearAlgebra, UnPack
+using StaticArrays, ForwardDiff, LinearAlgebra, UnPack, Adapt
 
 abstract type AbstractSpace end
 
diff --git a/src/Spaces/spectralelement.jl b/src/Spaces/spectralelement.jl
index e803fdc34b..afa1da45b8 100644
--- a/src/Spaces/spectralelement.jl
+++ b/src/Spaces/spectralelement.jl
@@ -20,6 +20,7 @@ function Base.show(io::IO, space::AbstractSpectralElementSpace)
     print(iio, " "^(indent + 2), space.quadrature_style)
 end
 
+Device.device(space::AbstractSpectralElementSpace) = Device.device(topology(space))
 topology(space::AbstractSpectralElementSpace) = space.topology
 quadrature_style(space::AbstractSpectralElementSpace) = space.quadrature_style
 
@@ -175,6 +176,21 @@ struct SpectralElementSpace2D{
     boundary_surface_geometries::BS
 end
 
+Adapt.adapt_structure(to, space::SpectralElementSpace2D) =
+SpectralElementSpace2D(
+    nothing, 
+    Adapt.adapt_structure(to, space.quadrature_style),
+    Adapt.adapt_structure(to, space.global_geometry),
+    Adapt.adapt_structure(to, space.local_geometry),
+    Adapt.adapt_structure(to, space.ghost_geometry),
+    Adapt.adapt_structure(to, space.local_dss_weights),
+    Adapt.adapt_structure(to, space.ghost_dss_weights),
+    Adapt.adapt_structure(to, space.internal_surface_geometry),
+    Adapt.adapt_structure(to, space.boundary_surface_geometries),
+)
+
+
+
 """
     SpectralElementSpace2D(topology, quadrature_style; enable_bubble)
 
@@ -772,3 +788,4 @@ function Base.iterate(
         return ((i, j), e), ((i, j), e)
     end
 end
+
diff --git a/src/Topologies/Topologies.jl b/src/Topologies/Topologies.jl
index 5de54430b7..6dda097e41 100644
--- a/src/Topologies/Topologies.jl
+++ b/src/Topologies/Topologies.jl
@@ -2,6 +2,7 @@ module Topologies
 
 using DocStringExtensions
 
+import ..Device
 import ..Geometry
 import ..Domains: Domains, coordinate_type
 import ..Meshes: Meshes, domain, coordinates
diff --git a/test/Operators/spectralelement/rectilinear_cuda.jl b/test/Operators/spectralelement/rectilinear_cuda.jl
new file mode 100644
index 0000000000..bceec3c120
--- /dev/null
+++ b/test/Operators/spectralelement/rectilinear_cuda.jl
@@ -0,0 +1,369 @@
+using Test
+using StaticArrays
+using ClimaComms, ClimaCore
+import ClimaCore:
+    Geometry, Fields, Domains, Topologies, Meshes, Spaces, Operators
+using LinearAlgebra, IntervalSets
+using CUDA
+
+FT = Float64
+domain = Domains.RectangleDomain(
+    Geometry.XPoint{FT}(-pi) .. Geometry.XPoint{FT}(pi),
+    Geometry.YPoint{FT}(-pi) .. Geometry.YPoint{FT}(pi);
+    x1periodic = true,
+    x2periodic = true,
+)
+
+Nq = 5
+quad = Spaces.Quadratures.GLL{Nq}()
+
+grid_mesh = Meshes.RectilinearMesh(domain, 17, 16)
+
+
+grid_topology_cpu =
+    Topologies.Topology2D(ClimaComms.SingletonCommsContext(ClimaCore.Device.device(;disablegpu=true)), grid_mesh)
+grid_space_cpu = Spaces.SpectralElementSpace2D(grid_topology_cpu, quad)
+coords_cpu = Fields.coordinate_field(grid_space_cpu)
+
+f_cpu = sin.(coords_cpu.x .+ 2 .* coords_cpu.y)
+
+grad = Operators.Gradient()
+gradf_cpu = grad.(f_cpu)
+
+
+grid_topology =
+    Topologies.Topology2D(ClimaComms.SingletonCommsContext(ClimaCore.Device.device()), grid_mesh)
+grid_space = Spaces.SpectralElementSpace2D(grid_topology, quad)
+coords = Fields.coordinate_field(grid_space)
+
+CUDA.allowscalar(false)
+f = sin.(coords.x .+ 2 .* coords.y)
+
+grad = Operators.Gradient()
+gradf = grad.(f)
+
+
+
+
+
+
+
+#==
+
+ts_mesh = Meshes.RectilinearMesh(domain, 17, 16)
+ts_topology = Topologies.Topology2D(ClimaComms.SingletonCommsContext(), ts_mesh)
+ts_space = Spaces.SpectralElementSpace2D(ts_topology, quad)
+ts_coords = Fields.coordinate_field(ts_space)
+
+grid_test_setup = (grid_topology, grid_space, grid_coords)
+ts_test_setup = (ts_topology, ts_space, ts_coords)
+
+@testset "interpolate / restrict" begin
+
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        INq = 9
+        Iquad = Spaces.Quadratures.GLL{INq}()
+        Ispace = Spaces.SpectralElementSpace2D(topology, Iquad)
+
+        I = Operators.Interpolate(Ispace)
+        R = Operators.Restrict(space)
+
+        f = sin.(coords.x .+ 2 .* coords.y)
+
+        interpolated_field = I.(f)
+        Spaces.weighted_dss2!(interpolated_field)
+
+        @test axes(interpolated_field).quadrature_style == Iquad
+        @test axes(interpolated_field).topology == topology
+
+        restrict_field = R.(f)
+        Spaces.weighted_dss2!(restrict_field)
+
+        @test axes(restrict_field).quadrature_style == quad
+        @test axes(restrict_field).topology == topology
+
+        interp_restrict_field = R.(I.(f))
+        Spaces.weighted_dss2!(interp_restrict_field)
+
+        @test axes(interp_restrict_field).quadrature_style == quad
+        @test axes(interp_restrict_field).topology == topology
+
+        @test norm(interp_restrict_field .- f) ≤ 3.0e-4
+    end
+end
+
+@testset "gradient" begin
+
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        f = sin.(coords.x .+ 2 .* coords.y)
+
+        grad = Operators.Gradient()
+        gradf = grad.(f)
+        Spaces.weighted_dss2!(gradf)
+
+        @test gradf ≈
+              Geometry.Covariant12Vector.(
+            Geometry.UVVector.(
+                cos.(coords.x .+ 2 .* coords.y),
+                2 .* cos.(coords.x .+ 2 .* coords.y),
+            ),
+        ) rtol = 1e-2
+
+        fv =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(coords.x .+ 2 .* coords.y),
+            )
+        gradfv = Geometry.transform.(Ref(Geometry.UVAxis()), grad.(fv))
+        Spaces.weighted_dss2!(gradfv)
+        @test eltype(gradfv) <: Geometry.Axis2Tensor
+    end
+end
+
+
+@testset "weak gradient" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        f = sin.(coords.x .+ 2 .* coords.y)
+
+        wgrad = Operators.WeakGradient()
+        gradf = wgrad.(f)
+        Spaces.weighted_dss2!(gradf)
+
+        @test Geometry.UVVector.(gradf) ≈
+              Geometry.UVVector.(
+            cos.(coords.x .+ 2 .* coords.y),
+            2 .* cos.(coords.x .+ 2 .* coords.y),
+        ) rtol = 1e-2
+    end
+end
+
+@testset "curl" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(3 .* coords.x .+ 4 .* coords.y),
+            )
+
+        curl = Operators.Curl()
+        curlv = curl.(Geometry.Covariant12Vector.(v))
+        Spaces.weighted_dss2!(curlv)
+        curlv_ref =
+            Geometry.Contravariant3Vector.(
+                .-3 .* sin.(3 .* coords.x .+ 4 .* coords.y) .-
+                2 .* cos.(coords.x .+ 2 .* coords.y),
+            )
+
+        @test curlv ≈ curlv_ref rtol = 1e-2
+    end
+end
+
+@testset "curl-curl" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(3 .* coords.x .+ 2 .* coords.y),
+            )
+        curlv_ref =
+            .-3 .* sin.(3 .* coords.x .+ 2 .* coords.y) .-
+            2 .* cos.(coords.x .+ 2 .* coords.y)
+        curlcurlv_ref1 =
+            .-6 .* cos.(3 .* coords.x .+ 2 .* coords.y) .+
+            4 .* sin.(coords.x .+ 2 .* coords.y)
+        curlcurlv_ref2 =
+            9 .* cos.(3 .* coords.x .+ 2 .* coords.y) .-
+            2 .* sin.(coords.x .+ 2 .* coords.y)
+
+        curl = Operators.Curl()
+        curlcurlv =
+            curl.(
+                Geometry.Covariant3Vector.(
+                    curl.(Geometry.Covariant12Vector.(v)),
+                ),
+            )
+        Spaces.weighted_dss2!(curlcurlv)
+
+        @test Geometry.UVVector.(curlcurlv) ≈
+              Geometry.UVVector.(curlcurlv_ref1, curlcurlv_ref2) rtol = 4e-2
+    end
+end
+
+@testset "weak curl-strong curl" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(3 .* coords.x .+ 2 .* coords.y),
+            )
+        curlv_ref =
+            .-3 .* sin.(3 .* coords.x .+ 2 .* coords.y) .-
+            2 .* cos.(coords.x .+ 2 .* coords.y)
+        curlcurlv_ref1 =
+            .-6 .* cos.(3 .* coords.x .+ 2 .* coords.y) .+
+            4 .* sin.(coords.x .+ 2 .* coords.y)
+        curlcurlv_ref2 =
+            9 .* cos.(3 .* coords.x .+ 2 .* coords.y) .-
+            2 .* sin.(coords.x .+ 2 .* coords.y)
+
+        curl = Operators.Curl()
+        wcurl = Operators.WeakCurl()
+        curlcurlv =
+            Geometry.UVVector.(
+                wcurl.(
+                    Geometry.Covariant3Vector.(
+                        curl.(Geometry.Covariant12Vector.(v)),
+                    ),
+                ),
+            )
+        Spaces.weighted_dss2!(curlcurlv)
+
+        @test curlcurlv ≈ Geometry.UVVector.(curlcurlv_ref1, curlcurlv_ref2) rtol =
+            4e-2
+    end
+end
+
+@testset "weak curl" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(3 .* coords.x .+ 4 .* coords.y),
+            )
+
+        wcurl = Operators.WeakCurl()
+        curlv = wcurl.(Geometry.Covariant12Vector.(v))
+        Spaces.weighted_dss2!(curlv)
+        curlv_ref =
+            .-3 .* sin.(3 .* coords.x .+ 4 .* coords.y) .-
+            2 .* cos.(coords.x .+ 2 .* coords.y)
+
+        @test curlv ≈ Geometry.Contravariant3Vector.(curlv_ref) rtol = 1e-2
+    end
+end
+
+@testset "div" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(3 .* coords.x .+ 2 .* coords.y),
+            )
+
+        div = Operators.Divergence()
+        divv = div.(v)
+        Spaces.weighted_dss2!(divv)
+        divv_ref =
+            cos.(coords.x .+ 2 .* coords.y) .-
+            2 .* sin.(3 .* coords.x .+ 2 .* coords.y)
+
+        @test divv ≈ divv_ref rtol = 1e-2
+    end
+end
+
+
+@testset "weak div" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v =
+            Geometry.UVVector.(
+                sin.(coords.x .+ 2 .* coords.y),
+                cos.(3 .* coords.x .+ 2 .* coords.y),
+            )
+
+        wdiv = Operators.WeakDivergence()
+        divv = wdiv.(v)
+        Spaces.weighted_dss2!(divv)
+        divv_ref =
+            cos.(coords.x .+ 2 .* coords.y) .-
+            2 .* sin.(3 .* coords.x .+ 2 .* coords.y)
+
+        @test divv ≈ divv_ref rtol = 1e-2
+    end
+end
+
+
+@testset "annhilator property: curl-grad" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        f = sin.(coords.x .+ 2 .* coords.y)
+
+        grad = Operators.Gradient()
+        gradf = grad.(f)
+        Spaces.weighted_dss2!(gradf)
+
+        curl = Operators.Curl()
+        curlgradf = curl.(gradf)
+        Spaces.weighted_dss2!(curlgradf)
+
+        @test norm(curlgradf) < 1e-12
+    end
+end
+
+@testset "annhilator property: div-curl" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        v = Geometry.Covariant3Vector.(sin.(coords.x .+ 2 .* coords.y))
+        curl = Operators.Curl()
+        curlv = curl.(v)
+        Spaces.weighted_dss2!(curlv)
+
+        div = Operators.Divergence()
+        divcurlv = div.(curlv)
+        Spaces.weighted_dss2!(divcurlv)
+
+        @test norm(divcurlv) < 1e-12
+    end
+end
+
+@testset "scalar hyperdiffusion" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        k = 2
+        l = 3
+        y = @. sin(k * coords.x + l * coords.y)
+        ∇⁴y_ref = @. (k^2 + l^2)^2 * sin(k * coords.x + l * coords.y)
+
+        wdiv = Operators.WeakDivergence()
+        grad = Operators.Gradient()
+        χ = Spaces.weighted_dss2!(@. wdiv(grad(y)))
+        ∇⁴y = Spaces.weighted_dss2!(@. wdiv(grad(χ)))
+
+        @test ∇⁴y_ref ≈ ∇⁴y rtol = 2e-2
+    end
+end
+
+@testset "vector hyperdiffusion" begin
+    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
+        k = 2
+        l = 3
+        y = @. Geometry.UVVector(sin(k * coords.x + l * coords.y), 0.0)
+        ∇⁴y_ref = @. Geometry.UVVector(
+            (k^2 + l^2)^2 * sin(k * coords.x + l * coords.y),
+            0.0,
+        )
+        curl = Operators.Curl()
+        wcurl = Operators.WeakCurl()
+
+        sdiv = Operators.Divergence()
+        wgrad = Operators.WeakGradient()
+
+        χ = Spaces.weighted_dss2!(
+            @. Geometry.UVVector(wgrad(sdiv(y))) - Geometry.UVVector(
+                wcurl(
+                    Geometry.Covariant3Vector(
+                        curl(Geometry.Covariant12Vector(y)),
+                    ),
+                ),
+            )
+        )
+        ∇⁴y = Spaces.weighted_dss2!(
+            @. Geometry.UVVector(wgrad(sdiv(χ))) - Geometry.UVVector(
+                wcurl(
+                    Geometry.Covariant3Vector(
+                        curl(Geometry.Covariant12Vector(χ)),
+                    ),
+                ),
+            )
+        )
+
+        @test ∇⁴y_ref ≈ ∇⁴y rtol = 2e-2
+    end
+end
+==#
\ No newline at end of file

From b22119503795a4e47361a8daa507305130f50773 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 1 Feb 2023 16:56:30 -0800
Subject: [PATCH 06/16] so close

---
 src/Fields/Fields.jl             |   4 +-
 src/Fields/broadcast.jl          |   8 +
 src/Fields/indices.jl            |   4 +-
 src/Operators/Operators.jl       |   4 +-
 src/Operators/spectralelement.jl | 275 +++++++++++++++++++++----------
 src/Spaces/spectralelement.jl    |  18 +-
 6 files changed, 211 insertions(+), 102 deletions(-)

diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index de611d246e..11a22e0bd8 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -51,8 +51,8 @@ comm_context(topology::T) where {T <: Topologies.AbstractTopology} =
 
 Adapt.adapt_structure(to, field::Field) =
     Field(
-        Adapt.adapt_structure(to, Fields.field_values(field)),
-        Adapt.adapt_structure(to, axes(field)),
+        Adapt.adapt(to, Fields.field_values(field)),
+        Adapt.adapt(to, axes(field)),
     )
 
 
diff --git a/src/Fields/broadcast.jl b/src/Fields/broadcast.jl
index fc0d247fa2..55c830ab50 100644
--- a/src/Fields/broadcast.jl
+++ b/src/Fields/broadcast.jl
@@ -30,6 +30,14 @@ Base.Broadcast.BroadcastStyle(
 
 Base.Broadcast.broadcastable(field::Field) = field
 
+function Adapt.adapt_structure(to, bc::Base.Broadcast.Broadcasted{Style}) where {Style<:AbstractFieldStyle}
+    Base.Broadcast.Broadcasted{Style}(
+        Adapt.adapt(to, bc.f),
+        Adapt.adapt(to, bc.args),
+        Adapt.adapt(to, bc.axes),
+    )
+end
+
 Base.eltype(bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle}) =
     Base.Broadcast.combine_eltypes(bc.f, bc.args)
 
diff --git a/src/Fields/indices.jl b/src/Fields/indices.jl
index be0ba2da85..a7c57ee003 100644
--- a/src/Fields/indices.jl
+++ b/src/Fields/indices.jl
@@ -131,9 +131,9 @@ end
 
 
 
-struct SlabIndex{VIdx}
+struct SlabIndex{VIdx,HIdx}
     v::VIdx
-    h::Int
+    h::HIdx
 end
 
 
diff --git a/src/Operators/Operators.jl b/src/Operators/Operators.jl
index bc6bc721b0..b25ce61806 100644
--- a/src/Operators/Operators.jl
+++ b/src/Operators/Operators.jl
@@ -1,9 +1,11 @@
 module Operators
 
-import LinearAlgebra
+import LinearAlgebra, Adapt
 
 using StaticArrays, CUDA
 
+import Base.Broadcast: Broadcasted
+
 import ..enable_threading, ..slab, ..slab_args, ..column, ..column_args
 import ..Device
 import ..DataLayouts: DataLayouts, Data2D, DataSlab2D
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index 649e0a3f9c..2ab94d6847 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -1,15 +1,34 @@
 abstract type AbstractSpectralStyle <: Fields.AbstractFieldStyle end
 
+"""
+    SpectralStyle()
+
+Broadcasting requires use of spectral-element operations.
+"""
 struct SpectralStyle <: AbstractSpectralStyle end
 
-struct CompositeSpectralStyle <: AbstractSpectralStyle end
+"""
+    SlabBlockSpectralStyle()
+
+Applies spectral-element operations using by making use of intermediate
+temporaries for each operator. This is used for CPU kernels.
+"""
+struct SlabBlockSpectralStyle <: AbstractSpectralStyle end
+
+"""
+    CUDASpectralStyle()
 
+Applies spectral-element operations by using threads for each node, and
+synchronizing when they occur. This is used for GPU kernels.
+"""
 struct CUDASpectralStyle <: AbstractSpectralStyle end
 
+
 import ClimaComms
-AbstractSpectralStyle(::ClimaComms.CPU) = SpectralStyle
+AbstractSpectralStyle(::ClimaComms.CPU) = SlabBlockSpectralStyle
 AbstractSpectralStyle(::ClimaComms.CUDA) = CUDASpectralStyle
 
+
 """
     SpectralElementOperator
 
@@ -75,6 +94,13 @@ SpectralBroadcasted{Style}(
 ) where {Style, Op, Args, Axes, Work} =
     SpectralBroadcasted{Style, Op, Args, Axes, Work}(op, args, axes, work)
 
+Adapt.adapt_structure(to, sbc::SpectralBroadcasted{Style}) where {Style} =
+    SpectralBroadcasted{Style}(
+        sbc.op,
+        Adapt.adapt(to, sbc.args),
+        Adapt.adapt(to, sbc.axes),
+    )
+ 
 return_space(::SpectralElementOperator, space) = space
 
 Base.axes(sbc::SpectralBroadcasted) =
@@ -117,20 +143,9 @@ function Base.Broadcast.instantiate(
     return SpectralBroadcasted{Style}(op, args, axes)
 end
 
-import Adapt
-
-Adapt.adapt_structure(to, sbc::SpectralBroadcasted{Style}) where {Style} =
-    SpectralBroadcasted{Style}(
-        sbc.op,
-        map(arg -> Adapt.adapt_structure(to, arg), sbc.args),
-        Adapt.adapt_structure(to, sbc.axes),
-    )
-
-
-
 function Base.Broadcast.instantiate(
-    bc::Base.Broadcast.Broadcasted{Style},
-) where {Style <: AbstractSpectralStyle}
+    bc::Base.Broadcast.Broadcasted{<:AbstractSpectralStyle},
+)
     # recursively instantiate the arguments to allocate intermediate work arrays
     args = instantiate_args(bc.args)
     # axes: same logic as Broadcasted
@@ -140,6 +155,7 @@ function Base.Broadcast.instantiate(
         axes = bc.axes
         Base.Broadcast.check_broadcast_axes(axes, args...)
     end
+    Style = AbstractSpectralStyle(Device.device(axes))
     return Base.Broadcast.Broadcasted{Style}(bc.f, args, axes)
 end
 
@@ -168,60 +184,19 @@ function Base.Broadcast.materialize(sbc::SpectralBroadcasted)
     copy(Base.Broadcast.instantiate(sbc))
 end
 
-function Base.copyto!(out::Field, sbc::SpectralBroadcasted)
-    Fields.byslab(axes(out)) do slabidx
-        copyto_slab!(out, sbc, slabidx)
-    end
-    return out
-end
-
-function Base.copyto!(out::Field, sbc::SpectralBroadcasted{CUDASpectralStyle})
-    space = axes(out)
-    QS = Spaces.quadrature_style(space)
-    Nq = Quadratures.degrees_of_freedom(QS)
-    Nh = Topologies.nlocalelems(Spaces.topology(space))
-    Nv = Spaces.nlevels(space)
-
-    @cuda threads=(Nq,Nq) blocks=(Nv,Nh) copyto_kernel!(out, sbc)
-    return out
-end
-
-
-function copyto_kernel!(out, sbc)
-    i = threadIdx().x
-    j = threadIdx().y
-    #if axes(out) isa Spaces.AbstractSpectralElementSpace
-        v = nothing
-    #elseif axes(out) isa Spaces.CenterExtrudedFiniteDifferenceSpace
-    #    v = blockIdx().x
-    #elseif axes(out) isa Spaces.FaceExtrudedFiniteDifferenceSpace
-    #    v = blockIdx().x - half
-    #end
-    h = blockIdx().y
-    ij = CartesianIndex((i,j))
-    slabidx = Fields.SlabIndex{Nothing}(v,h)
-
-    result = apply_operator_kernel(sbc.op, axes(out), ij, slabidx, sbc.args...)
-    set_node!(out, ij, slabidx, result)
-    return nothing
-end
-
-
-
 function Base.Broadcast.materialize!(dest, sbc::SpectralBroadcasted)
     copyto!(dest, Base.Broadcast.instantiate(sbc))
 end
 
-function Base.copyto!(
-    out::Field,
-    bc::Base.Broadcast.Broadcasted{Style},
-) where {Style <: AbstractSpectralStyle}
+# Functions for SlabBlockSpectralStyle
+function Base.copyto!(out::Field, sbc::Union{SpectralBroadcasted{SlabBlockSpectralStyle}, Broadcasted{SlabBlockSpectralStyle}})
     Fields.byslab(axes(out)) do slabidx
-        copyto_slab!(out, bc, slabidx)
+        copyto_slab!(out, sbc, slabidx)
     end
     return out
 end
 
+
 """
     copyto_slab!(out, bc, slabidx)
 
@@ -238,7 +213,6 @@ function copyto_slab!(out, bc, slabidx)
     return nothing
 end
 
-
 """
     resolve_operator(bc, slabidx)
 
@@ -252,16 +226,16 @@ Recursively evaluate any operators in `bc` at `slabidx`, replacing any
  `IF`/`IJF` data object.
 - if `bc` is a `Field`, return that
 """
-function resolve_operator(bc::SpectralBroadcasted, slabidx)
+function resolve_operator(bc::SpectralBroadcasted{SlabBlockSpectralStyle}, slabidx)
     args = _resolve_operator_args(slabidx, bc.args...)
     apply_operator(bc.op, bc.axes, slabidx, args...)
 end
 function resolve_operator(
-    bc::Base.Broadcast.Broadcasted{CompositeSpectralStyle},
+    bc::Base.Broadcast.Broadcasted{SlabBlockSpectralStyle},
     slabidx,
 )
     args = _resolve_operator_args(slabidx, bc.args...)
-    Base.Broadcast.Broadcasted{CompositeSpectralStyle}(bc.f, args, bc.axes)
+    Base.Broadcast.Broadcasted{SlabBlockSpectralStyle}(bc.f, args, bc.axes)
 end
 resolve_operator(x, slabidx) = x
 
@@ -277,6 +251,34 @@ _resolve_operator_args(slabidx) = ()
 )
 
 
+# Functions for CUDASpectralStyle
+function Base.copyto!(out::Field, sbc::Union{SpectralBroadcasted{CUDASpectralStyle}, Broadcasted{CUDASpectralStyle}})
+    space = axes(out)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    Nh = Topologies.nlocalelems(Spaces.topology(space))
+    Nv = Spaces.nlevels(space)
+
+    @cuda threads=(Nq,Nq) blocks=(Nv,Nh) copyto_kernel!(out, sbc)
+    return out
+end
+
+function copyto_kernel!(out::Fields.SpectralElementField2D, sbc)
+    i = threadIdx().x
+    j = threadIdx().y
+    v = nothing
+    h = blockIdx().y
+    ij = CartesianIndex((i,j))
+    slabidx = Fields.SlabIndex(v,h)
+    result = get_node(sbc, ij, slabidx)
+    set_node!(out, ij, slabidx, result)
+    return nothing
+end
+
+function get_node(sbc::SpectralBroadcasted{CUDASpectralStyle}, ij, slabidx)
+    apply_operator_kernel(sbc.op, axes(sbc), ij, slabidx, _get_node(ij, slabidx, sbc.args...)...)
+end
+
 
 _get_node(ij, slabidx) = ()
 _get_node(ij, slabidx, arg, xargs...) =
@@ -410,18 +412,15 @@ Base.@propagate_inbounds function set_node!(
 end
 
 
-function Base.Broadcast.BroadcastStyle(
-    ::Type{SB},
-) where {SB <: SpectralBroadcasted}
-    CompositeSpectralStyle()
-end
+Base.Broadcast.BroadcastStyle(
+    ::Type{<:SpectralBroadcasted{Style}},
+) where {Style} =
+    Style()
 
-function Base.Broadcast.BroadcastStyle(
-    ::CompositeSpectralStyle,
+Base.Broadcast.BroadcastStyle(
+    style::AbstractSpectralStyle,
     ::Fields.AbstractFieldStyle,
-)
-    CompositeSpectralStyle()
-end
+) = style
 
 
 
@@ -532,6 +531,46 @@ function apply_operator(op::Divergence{(1, 2)}, space, slabidx, arg)
 end
 
 
+function apply_operator_kernel(op::Divergence{(1, 2)}, space, ij, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    D = Quadratures.differentiation_matrix(FT, QS)
+
+    local_geometry = get_local_geometry(space, ij, slabidx)
+    
+    # allocate temp output
+    RT = operator_return_eltype(op, typeof(arg))
+    Nf = DataLayouts.typesize(FT, RT)
+    work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    Jv¹ = IJF{RT, Nq}(work1_array)
+    Jv² = IJF{RT, Nq}(work1_array)
+    i,j = ij.I
+
+    
+    Jv¹[i,j] = local_geometry.J ⊠ RecursiveApply.rmap(
+        v -> Geometry.contravariant1(v, local_geometry),
+        arg,
+    )
+    Jv²[i,j] = local_geometry.J ⊠ RecursiveApply.rmap(
+        v -> Geometry.contravariant2(v, local_geometry),
+        arg,
+    )
+
+    CUDA.sync_threads()
+
+    DJv = D[i,1] ⊠ Jv¹[1,j]
+    for k = 2:Nq
+        DJv = DJv ⊞ D[i,k] ⊠ Jv¹[k,j]
+    end
+    for k = 1:Nq
+        DJv = DJv ⊞ D[j,k] ⊠ Jv²[i,k]
+    end
+    return RecursiveApply.rdiv(DJv, local_geometry.J)
+end
+
+
 """
     wdiv = WeakDivergence()
     wdiv.(u)
@@ -644,6 +683,47 @@ function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
+
+function apply_operator_kernel(op::WeakDivergence{(1, 2)}, space, ij, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    D = Quadratures.differentiation_matrix(FT, QS)
+
+    local_geometry = get_local_geometry(space, ij, slabidx)
+    
+    # allocate temp output
+    RT = operator_return_eltype(op, typeof(arg))
+    Nf = DataLayouts.typesize(FT, RT)
+    work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    WJv¹ = IJF{RT, Nq}(work1_array)
+    WJv² = IJF{RT, Nq}(work1_array)
+    i,j = ij.I
+
+    
+    WJv¹[i,j] = local_geometry.WJ ⊠ RecursiveApply.rmap(
+        v -> Geometry.contravariant1(v, local_geometry),
+        arg,
+    )
+    WJv²[i,j] = local_geometry.WJ ⊠ RecursiveApply.rmap(
+        v -> Geometry.contravariant2(v, local_geometry),
+        arg,
+    )
+
+    CUDA.sync_threads()
+
+    DᵀWJv = D[1,i] ⊠ WJv¹[1,j]
+    for k = 2:Nq
+        DᵀWJv = DᵀWJv ⊞ D[k,i] ⊠ WJv¹[k,j]
+    end
+    for k = 1:Nq
+        DᵀWJv = DᵀWJv ⊞ D[k,j] ⊠ WJv²[i,k]
+    end
+    return ⊟(RecursiveApply.rdiv(DᵀWJv, local_geometry.WJ))
+end
+
+
 """
     grad = Gradient()
     grad.(f)
@@ -718,20 +798,19 @@ function apply_operator(op::Gradient{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
-
-
 function apply_operator_kernel(op::Gradient{(1, 2)}, space, ij, slabidx, arg)
     FT = Spaces.undertype(space)
     QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
     # allocate temp output
-    IT = eltype(arg)
+    IT = typeof(arg)
     Nf = DataLayouts.typesize(FT, IT)
     array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
     work = IJF{IT, Nq}(array)
     i,j = ij.I
-    work[i,j] = get_node(arg, ij, slabidx)
+    work[i,j] = arg
+
     CUDA.sync_threads()
 
     ∂f∂ξ₁ = D[i,1] * work[1,j]
@@ -743,16 +822,6 @@ function apply_operator_kernel(op::Gradient{(1, 2)}, space, ij, slabidx, arg)
     return Geometry.Covariant12Vector(∂f∂ξ₁,∂f∂ξ₂)
 end
 
-function allocate_shared(op::Gradient{(1,2)}, arg)
-    S = eltype(arg)
-    space = axes(arg)
-    FT = Spaces.undertype(space)
-    QS = Spaces.quadrature_style(space)
-    Nq = Quadratures.degrees_of_freedom(QS)
-    Nf = DataLayouts.typesize(FT, S)
-    array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
-    return IJF{S, Nq}(array)
-end
 
 """
     wgrad = WeakGradient()
@@ -856,6 +925,36 @@ function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
+function apply_operator_kernel(op::WeakGradient{(1, 2)}, space, ij, slabidx, arg)
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    D = Quadratures.differentiation_matrix(FT, QS)
+
+    local_geometry = get_local_geometry(space, ij, slabidx)
+    W = local_geometry.WJ / local_geometry.J
+
+    # allocate temp output
+    IT = typeof(arg)
+    Nf = DataLayouts.typesize(FT, IT)
+    work_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    Wf = IJF{IT, Nq}(work_array)
+    i,j = ij.I
+    Wf[i,j] = W ⊠ arg
+
+    CUDA.sync_threads()
+
+    Dᵀ₁Wf = D[1,i] * work[1,j]
+    Dᵀ₂Wf = D[1,j] * work[i,1]
+    for k = 2:Nq
+        Dᵀ₁Wf += D[k,i] * work[k,j]
+        Dᵀ₂Wf += D[k,j] * work[i,k]
+    end
+    return Geometry.Covariant12Vector(RecursiveApply.rdiv(Dᵀ₁Wf,W),RecursiveApply.rdiv(Dᵀ₂Wf,W))
+end
+
+
+
 abstract type CurlSpectralElementOperator <: SpectralElementOperator end
 
 """
diff --git a/src/Spaces/spectralelement.jl b/src/Spaces/spectralelement.jl
index afa1da45b8..df5160643f 100644
--- a/src/Spaces/spectralelement.jl
+++ b/src/Spaces/spectralelement.jl
@@ -178,15 +178,15 @@ end
 
 Adapt.adapt_structure(to, space::SpectralElementSpace2D) =
 SpectralElementSpace2D(
-    nothing, 
-    Adapt.adapt_structure(to, space.quadrature_style),
-    Adapt.adapt_structure(to, space.global_geometry),
-    Adapt.adapt_structure(to, space.local_geometry),
-    Adapt.adapt_structure(to, space.ghost_geometry),
-    Adapt.adapt_structure(to, space.local_dss_weights),
-    Adapt.adapt_structure(to, space.ghost_dss_weights),
-    Adapt.adapt_structure(to, space.internal_surface_geometry),
-    Adapt.adapt_structure(to, space.boundary_surface_geometries),
+    nothing, # drop topology
+    Adapt.adapt(to, space.quadrature_style),
+    Adapt.adapt(to, space.global_geometry),
+    Adapt.adapt(to, space.local_geometry),
+    Adapt.adapt(to, space.ghost_geometry),
+    Adapt.adapt(to, space.local_dss_weights),
+    Adapt.adapt(to, space.ghost_dss_weights),
+    Adapt.adapt(to, space.internal_surface_geometry),
+    Adapt.adapt(to, space.boundary_surface_geometries),
 )
 
 

From 53c04364ac6c9575de500ee23f7bb2e68c4f21c6 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 2 Feb 2023 13:10:58 -0800
Subject: [PATCH 07/16] more operators

---
 src/Operators/spectralelement.jl | 145 ++++++++++++++++++++++++++-----
 1 file changed, 122 insertions(+), 23 deletions(-)

diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index 2ab94d6847..e2eea83f1d 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -545,9 +545,8 @@ function apply_operator_kernel(op::Divergence{(1, 2)}, space, ij, slabidx, arg)
     work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
     work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
     Jv¹ = IJF{RT, Nq}(work1_array)
-    Jv² = IJF{RT, Nq}(work1_array)
+    Jv² = IJF{RT, Nq}(work2_array)
     i,j = ij.I
-
     
     Jv¹[i,j] = local_geometry.J ⊠ RecursiveApply.rmap(
         v -> Geometry.contravariant1(v, local_geometry),
@@ -560,14 +559,13 @@ function apply_operator_kernel(op::Divergence{(1, 2)}, space, ij, slabidx, arg)
 
     CUDA.sync_threads()
 
-    DJv = D[i,1] ⊠ Jv¹[1,j]
+    D₁Jv¹ = D[i,1] ⊠ Jv¹[1,j]
+    D₂Jv² = D[j,1] ⊠ Jv²[i,1]
     for k = 2:Nq
-        DJv = DJv ⊞ D[i,k] ⊠ Jv¹[k,j]
-    end
-    for k = 1:Nq
-        DJv = DJv ⊞ D[j,k] ⊠ Jv²[i,k]
+        D₁Jv¹ = D₁Jv¹ ⊞ D[i,k] ⊠ Jv¹[k,j]
+        D₂Jv² = D₂Jv² ⊞ D[j,k] ⊠ Jv²[i,k]
     end
-    return RecursiveApply.rdiv(DJv, local_geometry.J)
+    return RecursiveApply.rdiv(D₁Jv¹ ⊞ D₂Jv², local_geometry.J)
 end
 
 
@@ -683,7 +681,6 @@ function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
-
 function apply_operator_kernel(op::WeakDivergence{(1, 2)}, space, ij, slabidx, arg)
     FT = Spaces.undertype(space)
     QS = Spaces.quadrature_style(space)
@@ -698,9 +695,8 @@ function apply_operator_kernel(op::WeakDivergence{(1, 2)}, space, ij, slabidx, a
     work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
     work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
     WJv¹ = IJF{RT, Nq}(work1_array)
-    WJv² = IJF{RT, Nq}(work1_array)
+    WJv² = IJF{RT, Nq}(work2_array)
     i,j = ij.I
-
     
     WJv¹[i,j] = local_geometry.WJ ⊠ RecursiveApply.rmap(
         v -> Geometry.contravariant1(v, local_geometry),
@@ -713,17 +709,15 @@ function apply_operator_kernel(op::WeakDivergence{(1, 2)}, space, ij, slabidx, a
 
     CUDA.sync_threads()
 
-    DᵀWJv = D[1,i] ⊠ WJv¹[1,j]
+    Dᵀ₁WJv¹ = D[1,i] ⊠ WJv¹[1,j]
+    Dᵀ₂WJv² = D[1,j] ⊠ WJv²[i,1]
     for k = 2:Nq
-        DᵀWJv = DᵀWJv ⊞ D[k,i] ⊠ WJv¹[k,j]
+        Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k,i] ⊠ WJv¹[k,j]
+        Dᵀ₂WJv² = Dᵀ₂WJv² ⊞ D[k,j] ⊠ WJv²[i,k]
     end
-    for k = 1:Nq
-        DᵀWJv = DᵀWJv ⊞ D[k,j] ⊠ WJv²[i,k]
-    end
-    return ⊟(RecursiveApply.rdiv(DᵀWJv, local_geometry.WJ))
+    return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹ ⊞ Dᵀ₂WJv², local_geometry.WJ))
 end
 
-
 """
     grad = Gradient()
     grad.(f)
@@ -944,13 +938,13 @@ function apply_operator_kernel(op::WeakGradient{(1, 2)}, space, ij, slabidx, arg
 
     CUDA.sync_threads()
 
-    Dᵀ₁Wf = D[1,i] * work[1,j]
-    Dᵀ₂Wf = D[1,j] * work[i,1]
+    Dᵀ₁Wf = D[1,i] ⊠ Wf[1,j]
+    Dᵀ₂Wf = D[1,j] ⊠ Wf[i,1]
     for k = 2:Nq
-        Dᵀ₁Wf += D[k,i] * work[k,j]
-        Dᵀ₂Wf += D[k,j] * work[i,k]
+        Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k,i] ⊠ Wf[k,j]
+        Dᵀ₂Wf = Dᵀ₂Wf ⊞ D[k,j] ⊠ Wf[i,k]
     end
-    return Geometry.Covariant12Vector(RecursiveApply.rdiv(Dᵀ₁Wf,W),RecursiveApply.rdiv(Dᵀ₂Wf,W))
+    return Geometry.Covariant12Vector(⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W)),⊟(RecursiveApply.rdiv(Dᵀ₂Wf, W)))
 end
 
 
@@ -1094,6 +1088,58 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
+function apply_operator_kernel(op::Curl{(1, 2)}, space, ij, slabidx, arg)
+
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    D = Quadratures.differentiation_matrix(FT, QS)
+
+    RT = operator_return_eltype(op, typeof(arg))
+    IT = eltype(arg)
+    Nf = DataLayouts.typesize(FT, IT)
+    local_geometry = get_local_geometry(space, ij, slabidx)
+    i,j = ij.I
+
+    if RT <: Geometry.Contravariant3Vector
+
+        work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        v₁ = IJF{IT, Nq}(work1_array)
+        v₂ = IJF{IT, Nq}(work2_array)
+        v₁[i,j] = Geometry.covariant1(arg, local_geometry)
+        v₂[i,j] = Geometry.covariant2(arg, local_geometry)
+
+        CUDA.sync_threads()
+
+        D₁v₂ = D[i, 1] ⊠ v₂[1,j]
+        D₂v₁ = D[j, 1] ⊠ v₁[i,1]
+        for k in 2:Nq
+            D₁v₂ = D₁v₂ ⊠ D[i, k] ⊠ v₂[k,j]
+            D₂v₁ = D₂v₁ ⊠ D[j, k] ⊠ v₁[i,k]
+        end
+        return Geometry.Contravariant3Vector(RecursiveApply.rdiv(D₁v₂ ⊟ D₂v₁, local_geometry.J))
+    elseif RT <: Geometry.Contravariant12Vector
+        work_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        v₃ = IJF{IT, Nq}(work_array)
+        v₃[i,j] = Geometry.covariant3(arg, local_geometry)
+
+        CUDA.sync_threads()
+
+        D₁v₃ = D[i, 1] ⊠ v₃[1,j]
+        D₂v₃ = D[j, 1] ⊠ v₃[i,1]
+        for k in 2:Nq
+            D₁v₃ = D₁v₃ ⊠ D[i, k] ⊠ v₃[k,j]
+            D₂v₃ = D₂v₃ ⊠ D[j, k] ⊠ v₃[i,k]
+        end
+        return Geometry.Contravariant12Vector(RecursiveApply.rdiv(D₂v₃,local_geometry.J) ⊟(RecursiveApply.rdiv(D₁v₃,local_geometry.J)))
+    else
+        error("invalid return type")
+    end
+end
+
+
+
 """
     wcurl = WeakCurl()
     wcurl.(u)
@@ -1231,6 +1277,59 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
+
+function apply_operator_kernel(op::WeakCurl{(1, 2)}, space, ij, slabidx, arg)
+
+    FT = Spaces.undertype(space)
+    QS = Spaces.quadrature_style(space)
+    Nq = Quadratures.degrees_of_freedom(QS)
+    D = Quadratures.differentiation_matrix(FT, QS)
+
+    RT = operator_return_eltype(op, typeof(arg))
+    IT = eltype(arg)
+    Nf = DataLayouts.typesize(FT, IT)
+    local_geometry = get_local_geometry(space, ij, slabidx)
+    W = local_geometry.WJ / local_geometry.J
+    i,j = ij.I
+
+    if RT <: Geometry.Contravariant3Vector
+
+        work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        Wv₁ = IJF{IT, Nq}(work1_array)
+        Wv₂ = IJF{IT, Nq}(work2_array)
+        Wv₁[i,j] = W ⊠ Geometry.covariant1(arg, local_geometry)
+        Wv₂[i,j] = W ⊠ Geometry.covariant2(arg, local_geometry)
+
+        CUDA.sync_threads()
+
+        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1,j]
+        Dᵀ₂Wv₁ = D[1, j] ⊠ Wv₁[i,1]
+        for k in 2:Nq
+            Dᵀ₁v₂ = Dᵀ₁Wv₂ ⊠ D[k, i] ⊠ Wv₂[k,j]
+            Dᵀ₂v₁ = Dᵀ₂Wv₁ ⊠ D[k, j] ⊠ Wv₁[i,k]
+        end
+        return Geometry.Contravariant3Vector(RecursiveApply.rdiv(Dᵀ₂Wv₁ ⊟ Dᵀ₁Wv₂, local_geometry.WJ))
+    elseif RT <: Geometry.Contravariant12Vector
+        work_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        Wv₃ = IJF{IT, Nq}(work_array)
+        Wv₃[i,j] = W ⊠ Geometry.covariant3(arg, local_geometry)
+
+        CUDA.sync_threads()
+
+        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1,j]
+        Dᵀ₂Wv₃ = D[1, j] ⊠ Wv₃[i,1]
+        for k in 2:Nq
+            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊠ D[k, i] ⊠ Wv₃[k,j]
+            Dᵀ₂Wv₃ = Dᵀ₂Wv₃ ⊠ D[k, j] ⊠ Wv₃[i,k]
+        end
+        return Geometry.Contravariant12Vector(RecursiveApply.rdiv(Dᵀ₂Wv₃,local_geometry.WJ) ⊟(RecursiveApply.rdiv(Dᵀ₁Wv₃,local_geometry.WJ)))
+    else
+        error("invalid return type")
+    end
+end
+
+
 # interplation / restriction
 abstract type TensorOperator <: SpectralElementOperator end
 

From e76771c312a853848d4c666b0b6201ff78de4e99 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 2 Feb 2023 15:44:02 -0800
Subject: [PATCH 08/16] more operator updates

---
 examples/sphere/shallow_water_cuda.jl         |  68 +++++
 src/Fields/Fields.jl                          |   9 +-
 src/Fields/broadcast.jl                       |  10 +-
 src/Fields/indices.jl                         |   2 +-
 src/Operators/spectralelement.jl              | 262 +++++++++++-------
 src/Spaces/spectralelement.jl                 |  26 +-
 .../spectralelement/rectilinear_cuda.jl       |  16 +-
 7 files changed, 261 insertions(+), 132 deletions(-)

diff --git a/examples/sphere/shallow_water_cuda.jl b/examples/sphere/shallow_water_cuda.jl
index 16db587d0c..53e2ba7358 100644
--- a/examples/sphere/shallow_water_cuda.jl
+++ b/examples/sphere/shallow_water_cuda.jl
@@ -1,6 +1,10 @@
 using CUDA
 using ClimaComms
 using DocStringExtensions
+using LinearAlgebra
+using OrdinaryDiffEq
+
+CUDA.allowscalar(false)
 
 import ClimaCore:
     Device,
@@ -442,6 +446,47 @@ function set_initial_condition(
     return Y
 end
 
+function rhs!(dYdt, y, parameters, t)
+    #@nvtx "rhs!" color = colorant"red" begin
+        (; f, h_s, ghost_buffer, params) = parameters
+        (; D₄, g) = params
+
+        div = Operators.Divergence()
+        wdiv = Operators.WeakDivergence()
+        grad = Operators.Gradient()
+        wgrad = Operators.WeakGradient()
+        curl = Operators.Curl()
+        wcurl = Operators.WeakCurl()
+
+        # Compute hyperviscosity first
+        #@nvtx "Hyperviscosity (rhs!)" color = colorant"green" begin
+            @. dYdt.h = wdiv(grad(y.h))
+            @. dYdt.u =
+                wgrad(div(y.u)) - Geometry.Covariant12Vector(
+                    wcurl(Geometry.Covariant3Vector(curl(y.u))),
+                )
+
+            Spaces.weighted_dss2!(dYdt, ghost_buffer)
+
+            @. dYdt.h = -D₄ * wdiv(grad(dYdt.h))
+            # split to avoid "device kernel image is invalid (code 200, ERROR_INVALID_IMAGE)"
+            @. dYdt.u = wgrad(div(dYdt.u)) - Geometry.Covariant12Vector(wcurl(Geometry.Covariant3Vector(curl(dYdt.u))))
+            @. dYdt.u = -D₄ * dYdt.u
+        #end
+        #@nvtx "h and u (rhs!)" color = colorant"blue" begin
+            # Add in pieces
+            @. begin
+                dYdt.h += -wdiv(y.h * y.u)
+                dYdt.u +=
+                    -grad(g * (y.h + h_s) + norm(y.u)^2 / 2) #+
+                dYdt.u += y.u × (f + curl(y.u))
+            end
+            Spaces.weighted_dss2!(dYdt, ghost_buffer)
+        #end
+    #end
+    return dYdt
+end
+
 function shallow_water_driver_cuda(ARGS, ::Type{FT}) where {FT}
     device = Device.device()
     context = ClimaComms.SingletonCommsContext(device)
@@ -478,11 +523,34 @@ function shallow_water_driver_cuda(ARGS, ::Type{FT}) where {FT}
     f = set_coriolis_parameter(space, test)
     h_s = surface_topography(space, test)
     Y = set_initial_condition(space, test)
+    dYdt = similar(Y)
 
     ghost_buffer = Spaces.create_dss_buffer(Y)
     Spaces.weighted_dss_start2!(Y, ghost_buffer)
     Spaces.weighted_dss_internal2!(Y, ghost_buffer)
     Spaces.weighted_dss_ghost2!(Y, ghost_buffer)
+
+    parameters =
+    (; f = f, h_s = h_s, ghost_buffer = ghost_buffer, params = test.params)
+    rhs!(dYdt, Y, parameters, 0.0)
+
+    # Solve the ODE
+    dt = 9 * 60
+    T = 86400 * 2
+
+    prob = ODEProblem(rhs!, Y, (0.0, T), parameters)
+    integrator = OrdinaryDiffEq.init(
+        prob,
+        SSPRK33(),
+        dt = dt,
+        saveat = dt,
+        progress = true,
+        adaptive = false,
+        progress_message = (dt, u, p, t) -> t,
+    )
+
+
+    sol = @timev OrdinaryDiffEq.solve!(integrator)
     return nothing
 end
 
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index 11a22e0bd8..8872e952c1 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -49,11 +49,10 @@ comm_context(topology::Topologies.Topology2D) = topology.context
 comm_context(topology::T) where {T <: Topologies.AbstractTopology} =
     ClimaComms.SingletonCommsContext()
 
-Adapt.adapt_structure(to, field::Field) =
-    Field(
-        Adapt.adapt(to, Fields.field_values(field)),
-        Adapt.adapt(to, axes(field)),
-    )
+Adapt.adapt_structure(to, field::Field) = Field(
+    Adapt.adapt(to, Fields.field_values(field)),
+    Adapt.adapt(to, axes(field)),
+)
 
 
 # Point Field
diff --git a/src/Fields/broadcast.jl b/src/Fields/broadcast.jl
index 55c830ab50..ad770d52eb 100644
--- a/src/Fields/broadcast.jl
+++ b/src/Fields/broadcast.jl
@@ -30,7 +30,10 @@ Base.Broadcast.BroadcastStyle(
 
 Base.Broadcast.broadcastable(field::Field) = field
 
-function Adapt.adapt_structure(to, bc::Base.Broadcast.Broadcasted{Style}) where {Style<:AbstractFieldStyle}
+function Adapt.adapt_structure(
+    to,
+    bc::Base.Broadcast.Broadcasted{Style},
+) where {Style <: AbstractFieldStyle}
     Base.Broadcast.Broadcasted{Style}(
         Adapt.adapt(to, bc.f),
         Adapt.adapt(to, bc.args),
@@ -259,8 +262,13 @@ end
     return nothing
 end
 
+# types aren't isbits
+Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::Type{T}, args...) where {T} =
+    Base.Broadcast.broadcasted(fs, (x...) -> T(x...), args...)
+
 # Specialize handling of +, *, muladd, so that we can support broadcasting over NamedTuple element types
 # Required for ODE solvers
+
 Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(+), args...) =
     Base.Broadcast.broadcasted(fs, RecursiveApply.:⊞, args...)
 
diff --git a/src/Fields/indices.jl b/src/Fields/indices.jl
index a7c57ee003..865acae8b3 100644
--- a/src/Fields/indices.jl
+++ b/src/Fields/indices.jl
@@ -131,7 +131,7 @@ end
 
 
 
-struct SlabIndex{VIdx,HIdx}
+struct SlabIndex{VIdx, HIdx}
     v::VIdx
     h::HIdx
 end
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index e2eea83f1d..47183e52bb 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -80,7 +80,8 @@ This is similar to a `Base.Broadcast.Broadcasted` object, except it contains spa
 
 This is returned by `Base.Broadcast.broadcasted(op::SpectralElementOperator)`.
 """
-struct SpectralBroadcasted{Style, Op, Args, Axes, Work} <: Base.AbstractBroadcasted
+struct SpectralBroadcasted{Style, Op, Args, Axes, Work} <:
+       Base.AbstractBroadcasted
     op::Op
     args::Args
     axes::Axes
@@ -100,7 +101,7 @@ Adapt.adapt_structure(to, sbc::SpectralBroadcasted{Style}) where {Style} =
         Adapt.adapt(to, sbc.args),
         Adapt.adapt(to, sbc.axes),
     )
- 
+
 return_space(::SpectralElementOperator, space) = space
 
 Base.axes(sbc::SpectralBroadcasted) =
@@ -125,9 +126,7 @@ Base.eltype(sbc::SpectralBroadcasted) =
     (Base.Broadcast.instantiate(args[1]),)
 @inline instantiate_args(::Tuple{}) = ()
 
-function Base.Broadcast.instantiate(
-    sbc::SpectralBroadcasted,
-)
+function Base.Broadcast.instantiate(sbc::SpectralBroadcasted)
     op = sbc.op
     # recursively instantiate the arguments to allocate intermediate work arrays
     args = instantiate_args(sbc.args)
@@ -189,7 +188,13 @@ function Base.Broadcast.materialize!(dest, sbc::SpectralBroadcasted)
 end
 
 # Functions for SlabBlockSpectralStyle
-function Base.copyto!(out::Field, sbc::Union{SpectralBroadcasted{SlabBlockSpectralStyle}, Broadcasted{SlabBlockSpectralStyle}})
+function Base.copyto!(
+    out::Field,
+    sbc::Union{
+        SpectralBroadcasted{SlabBlockSpectralStyle},
+        Broadcasted{SlabBlockSpectralStyle},
+    },
+)
     Fields.byslab(axes(out)) do slabidx
         copyto_slab!(out, sbc, slabidx)
     end
@@ -226,7 +231,10 @@ Recursively evaluate any operators in `bc` at `slabidx`, replacing any
  `IF`/`IJF` data object.
 - if `bc` is a `Field`, return that
 """
-function resolve_operator(bc::SpectralBroadcasted{SlabBlockSpectralStyle}, slabidx)
+function resolve_operator(
+    bc::SpectralBroadcasted{SlabBlockSpectralStyle},
+    slabidx,
+)
     args = _resolve_operator_args(slabidx, bc.args...)
     apply_operator(bc.op, bc.axes, slabidx, args...)
 end
@@ -252,14 +260,20 @@ _resolve_operator_args(slabidx) = ()
 
 
 # Functions for CUDASpectralStyle
-function Base.copyto!(out::Field, sbc::Union{SpectralBroadcasted{CUDASpectralStyle}, Broadcasted{CUDASpectralStyle}})
+function Base.copyto!(
+    out::Field,
+    sbc::Union{
+        SpectralBroadcasted{CUDASpectralStyle},
+        Broadcasted{CUDASpectralStyle},
+    },
+)
     space = axes(out)
     QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     Nh = Topologies.nlocalelems(Spaces.topology(space))
     Nv = Spaces.nlevels(space)
 
-    @cuda threads=(Nq,Nq) blocks=(Nv,Nh) copyto_kernel!(out, sbc)
+    @cuda threads = (Nq, Nq) blocks = (Nv, Nh) copyto_kernel!(out, sbc)
     return out
 end
 
@@ -268,20 +282,26 @@ function copyto_kernel!(out::Fields.SpectralElementField2D, sbc)
     j = threadIdx().y
     v = nothing
     h = blockIdx().y
-    ij = CartesianIndex((i,j))
-    slabidx = Fields.SlabIndex(v,h)
+    ij = CartesianIndex((i, j))
+    slabidx = Fields.SlabIndex(v, h)
     result = get_node(sbc, ij, slabidx)
     set_node!(out, ij, slabidx, result)
     return nothing
 end
 
 function get_node(sbc::SpectralBroadcasted{CUDASpectralStyle}, ij, slabidx)
-    apply_operator_kernel(sbc.op, axes(sbc), ij, slabidx, _get_node(ij, slabidx, sbc.args...)...)
+    apply_operator_kernel(
+        sbc.op,
+        axes(sbc),
+        ij,
+        slabidx,
+        _get_node(ij, slabidx, sbc.args...)...,
+    )
 end
 
 
-_get_node(ij, slabidx) = ()
-_get_node(ij, slabidx, arg, xargs...) =
+@inline _get_node(ij, slabidx) = ()
+@inline _get_node(ij, slabidx, arg, xargs...) =
     (get_node(arg, ij, slabidx), _get_node(ij, slabidx, xargs...)...)
 
 Base.@propagate_inbounds function get_node(scalar, ij, slabidx)
@@ -414,8 +434,7 @@ end
 
 Base.Broadcast.BroadcastStyle(
     ::Type{<:SpectralBroadcasted{Style}},
-) where {Style} =
-    Style()
+) where {Style} = Style()
 
 Base.Broadcast.BroadcastStyle(
     style::AbstractSpectralStyle,
@@ -538,32 +557,34 @@ function apply_operator_kernel(op::Divergence{(1, 2)}, space, ij, slabidx, arg)
     D = Quadratures.differentiation_matrix(FT, QS)
 
     local_geometry = get_local_geometry(space, ij, slabidx)
-    
+
     # allocate temp output
     RT = operator_return_eltype(op, typeof(arg))
     Nf = DataLayouts.typesize(FT, RT)
-    work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
-    work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    work1_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
+    work2_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
     Jv¹ = IJF{RT, Nq}(work1_array)
     Jv² = IJF{RT, Nq}(work2_array)
-    i,j = ij.I
-    
-    Jv¹[i,j] = local_geometry.J ⊠ RecursiveApply.rmap(
-        v -> Geometry.contravariant1(v, local_geometry),
-        arg,
-    )
-    Jv²[i,j] = local_geometry.J ⊠ RecursiveApply.rmap(
-        v -> Geometry.contravariant2(v, local_geometry),
-        arg,
-    )
+    i, j = ij.I
+
+    Jv¹[i, j] =
+        local_geometry.J ⊠ RecursiveApply.rmap(
+            v -> Geometry.contravariant1(v, local_geometry),
+            arg,
+        )
+    Jv²[i, j] =
+        local_geometry.J ⊠ RecursiveApply.rmap(
+            v -> Geometry.contravariant2(v, local_geometry),
+            arg,
+        )
 
     CUDA.sync_threads()
 
-    D₁Jv¹ = D[i,1] ⊠ Jv¹[1,j]
-    D₂Jv² = D[j,1] ⊠ Jv²[i,1]
-    for k = 2:Nq
-        D₁Jv¹ = D₁Jv¹ ⊞ D[i,k] ⊠ Jv¹[k,j]
-        D₂Jv² = D₂Jv² ⊞ D[j,k] ⊠ Jv²[i,k]
+    D₁Jv¹ = D[i, 1] ⊠ Jv¹[1, j]
+    D₂Jv² = D[j, 1] ⊠ Jv²[i, 1]
+    for k in 2:Nq
+        D₁Jv¹ = D₁Jv¹ ⊞ D[i, k] ⊠ Jv¹[k, j]
+        D₂Jv² = D₂Jv² ⊞ D[j, k] ⊠ Jv²[i, k]
     end
     return RecursiveApply.rdiv(D₁Jv¹ ⊞ D₂Jv², local_geometry.J)
 end
@@ -681,39 +702,47 @@ function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
-function apply_operator_kernel(op::WeakDivergence{(1, 2)}, space, ij, slabidx, arg)
+function apply_operator_kernel(
+    op::WeakDivergence{(1, 2)},
+    space,
+    ij,
+    slabidx,
+    arg,
+)
     FT = Spaces.undertype(space)
     QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
 
     local_geometry = get_local_geometry(space, ij, slabidx)
-    
+
     # allocate temp output
     RT = operator_return_eltype(op, typeof(arg))
     Nf = DataLayouts.typesize(FT, RT)
-    work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
-    work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    work1_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
+    work2_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
     WJv¹ = IJF{RT, Nq}(work1_array)
     WJv² = IJF{RT, Nq}(work2_array)
-    i,j = ij.I
-    
-    WJv¹[i,j] = local_geometry.WJ ⊠ RecursiveApply.rmap(
-        v -> Geometry.contravariant1(v, local_geometry),
-        arg,
-    )
-    WJv²[i,j] = local_geometry.WJ ⊠ RecursiveApply.rmap(
-        v -> Geometry.contravariant2(v, local_geometry),
-        arg,
-    )
+    i, j = ij.I
+
+    WJv¹[i, j] =
+        local_geometry.WJ ⊠ RecursiveApply.rmap(
+            v -> Geometry.contravariant1(v, local_geometry),
+            arg,
+        )
+    WJv²[i, j] =
+        local_geometry.WJ ⊠ RecursiveApply.rmap(
+            v -> Geometry.contravariant2(v, local_geometry),
+            arg,
+        )
 
     CUDA.sync_threads()
 
-    Dᵀ₁WJv¹ = D[1,i] ⊠ WJv¹[1,j]
-    Dᵀ₂WJv² = D[1,j] ⊠ WJv²[i,1]
-    for k = 2:Nq
-        Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k,i] ⊠ WJv¹[k,j]
-        Dᵀ₂WJv² = Dᵀ₂WJv² ⊞ D[k,j] ⊠ WJv²[i,k]
+    Dᵀ₁WJv¹ = D[1, i] ⊠ WJv¹[1, j]
+    Dᵀ₂WJv² = D[1, j] ⊠ WJv²[i, 1]
+    for k in 2:Nq
+        Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k, i] ⊠ WJv¹[k, j]
+        Dᵀ₂WJv² = Dᵀ₂WJv² ⊞ D[k, j] ⊠ WJv²[i, k]
     end
     return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹ ⊞ Dᵀ₂WJv², local_geometry.WJ))
 end
@@ -800,20 +829,20 @@ function apply_operator_kernel(op::Gradient{(1, 2)}, space, ij, slabidx, arg)
     # allocate temp output
     IT = typeof(arg)
     Nf = DataLayouts.typesize(FT, IT)
-    array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
     work = IJF{IT, Nq}(array)
-    i,j = ij.I
-    work[i,j] = arg
+    i, j = ij.I
+    work[i, j] = arg
 
     CUDA.sync_threads()
 
-    ∂f∂ξ₁ = D[i,1] * work[1,j]
-    ∂f∂ξ₂ = D[j,1] * work[i,1]
-    for k = 2:Nq
-        ∂f∂ξ₁ += D[i,k] * work[k,j]
-        ∂f∂ξ₂ += D[j,k] * work[i,k]
+    ∂f∂ξ₁ = D[i, 1] * work[1, j]
+    ∂f∂ξ₂ = D[j, 1] * work[i, 1]
+    for k in 2:Nq
+        ∂f∂ξ₁ += D[i, k] * work[k, j]
+        ∂f∂ξ₂ += D[j, k] * work[i, k]
     end
-    return Geometry.Covariant12Vector(∂f∂ξ₁,∂f∂ξ₂)
+    return Geometry.Covariant12Vector(∂f∂ξ₁, ∂f∂ξ₂)
 end
 
 
@@ -919,7 +948,13 @@ function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
     return Field(SArray(out), space)
 end
 
-function apply_operator_kernel(op::WeakGradient{(1, 2)}, space, ij, slabidx, arg)
+function apply_operator_kernel(
+    op::WeakGradient{(1, 2)},
+    space,
+    ij,
+    slabidx,
+    arg,
+)
     FT = Spaces.undertype(space)
     QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
@@ -931,20 +966,23 @@ function apply_operator_kernel(op::WeakGradient{(1, 2)}, space, ij, slabidx, arg
     # allocate temp output
     IT = typeof(arg)
     Nf = DataLayouts.typesize(FT, IT)
-    work_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+    work_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
     Wf = IJF{IT, Nq}(work_array)
-    i,j = ij.I
-    Wf[i,j] = W ⊠ arg
+    i, j = ij.I
+    Wf[i, j] = W ⊠ arg
 
     CUDA.sync_threads()
 
-    Dᵀ₁Wf = D[1,i] ⊠ Wf[1,j]
-    Dᵀ₂Wf = D[1,j] ⊠ Wf[i,1]
-    for k = 2:Nq
-        Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k,i] ⊠ Wf[k,j]
-        Dᵀ₂Wf = Dᵀ₂Wf ⊞ D[k,j] ⊠ Wf[i,k]
+    Dᵀ₁Wf = D[1, i] ⊠ Wf[1, j]
+    Dᵀ₂Wf = D[1, j] ⊠ Wf[i, 1]
+    for k in 2:Nq
+        Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k, i] ⊠ Wf[k, j]
+        Dᵀ₂Wf = Dᵀ₂Wf ⊞ D[k, j] ⊠ Wf[i, k]
     end
-    return Geometry.Covariant12Vector(⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W)),⊟(RecursiveApply.rdiv(Dᵀ₂Wf, W)))
+    return Geometry.Covariant12Vector(
+        ⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W)),
+        ⊟(RecursiveApply.rdiv(Dᵀ₂Wf, W)),
+    )
 end
 
 
@@ -1099,40 +1137,45 @@ function apply_operator_kernel(op::Curl{(1, 2)}, space, ij, slabidx, arg)
     IT = eltype(arg)
     Nf = DataLayouts.typesize(FT, IT)
     local_geometry = get_local_geometry(space, ij, slabidx)
-    i,j = ij.I
+    i, j = ij.I
 
+    # input data is a Covariant12Vector field
     if RT <: Geometry.Contravariant3Vector
-
-        work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
-        work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        work1_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
+        work2_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
         v₁ = IJF{IT, Nq}(work1_array)
         v₂ = IJF{IT, Nq}(work2_array)
-        v₁[i,j] = Geometry.covariant1(arg, local_geometry)
-        v₂[i,j] = Geometry.covariant2(arg, local_geometry)
+        v₁[i, j] = Geometry.covariant1(arg, local_geometry)
+        v₂[i, j] = Geometry.covariant2(arg, local_geometry)
 
         CUDA.sync_threads()
 
-        D₁v₂ = D[i, 1] ⊠ v₂[1,j]
-        D₂v₁ = D[j, 1] ⊠ v₁[i,1]
+        D₁v₂ = D[i, 1] ⊠ v₂[1, j]
+        D₂v₁ = D[j, 1] ⊠ v₁[i, 1]
         for k in 2:Nq
-            D₁v₂ = D₁v₂ ⊠ D[i, k] ⊠ v₂[k,j]
-            D₂v₁ = D₂v₁ ⊠ D[j, k] ⊠ v₁[i,k]
+            D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, j]
+            D₂v₁ = D₂v₁ ⊞ D[j, k] ⊠ v₁[i, k]
         end
-        return Geometry.Contravariant3Vector(RecursiveApply.rdiv(D₁v₂ ⊟ D₂v₁, local_geometry.J))
+        return Geometry.Contravariant3Vector(
+            RecursiveApply.rdiv(D₁v₂ ⊟ D₂v₁, local_geometry.J),
+        )
     elseif RT <: Geometry.Contravariant12Vector
-        work_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        work_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
         v₃ = IJF{IT, Nq}(work_array)
-        v₃[i,j] = Geometry.covariant3(arg, local_geometry)
+        v₃[i, j] = Geometry.covariant3(arg, local_geometry)
 
         CUDA.sync_threads()
 
-        D₁v₃ = D[i, 1] ⊠ v₃[1,j]
-        D₂v₃ = D[j, 1] ⊠ v₃[i,1]
+        D₁v₃ = D[i, 1] ⊠ v₃[1, j]
+        D₂v₃ = D[j, 1] ⊠ v₃[i, 1]
         for k in 2:Nq
-            D₁v₃ = D₁v₃ ⊠ D[i, k] ⊠ v₃[k,j]
-            D₂v₃ = D₂v₃ ⊠ D[j, k] ⊠ v₃[i,k]
+            D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, j]
+            D₂v₃ = D₂v₃ ⊞ D[j, k] ⊠ v₃[i, k]
         end
-        return Geometry.Contravariant12Vector(RecursiveApply.rdiv(D₂v₃,local_geometry.J) ⊟(RecursiveApply.rdiv(D₁v₃,local_geometry.J)))
+        return Geometry.Contravariant12Vector(
+            RecursiveApply.rdiv(D₂v₃, local_geometry.J),
+            ⊟(RecursiveApply.rdiv(D₁v₃, local_geometry.J)),
+        )
     else
         error("invalid return type")
     end
@@ -1290,40 +1333,45 @@ function apply_operator_kernel(op::WeakCurl{(1, 2)}, space, ij, slabidx, arg)
     Nf = DataLayouts.typesize(FT, IT)
     local_geometry = get_local_geometry(space, ij, slabidx)
     W = local_geometry.WJ / local_geometry.J
-    i,j = ij.I
+    i, j = ij.I
 
     if RT <: Geometry.Contravariant3Vector
 
-        work1_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
-        work2_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        work1_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
+        work2_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
         Wv₁ = IJF{IT, Nq}(work1_array)
         Wv₂ = IJF{IT, Nq}(work2_array)
-        Wv₁[i,j] = W ⊠ Geometry.covariant1(arg, local_geometry)
-        Wv₂[i,j] = W ⊠ Geometry.covariant2(arg, local_geometry)
+        Wv₁[i, j] = W ⊠ Geometry.covariant1(arg, local_geometry)
+        Wv₂[i, j] = W ⊠ Geometry.covariant2(arg, local_geometry)
 
         CUDA.sync_threads()
 
-        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1,j]
-        Dᵀ₂Wv₁ = D[1, j] ⊠ Wv₁[i,1]
+        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, j]
+        Dᵀ₂Wv₁ = D[1, j] ⊠ Wv₁[i, 1]
         for k in 2:Nq
-            Dᵀ₁v₂ = Dᵀ₁Wv₂ ⊠ D[k, i] ⊠ Wv₂[k,j]
-            Dᵀ₂v₁ = Dᵀ₂Wv₁ ⊠ D[k, j] ⊠ Wv₁[i,k]
+            Dᵀ₁v₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, j]
+            Dᵀ₂v₁ = Dᵀ₂Wv₁ ⊞ D[k, j] ⊠ Wv₁[i, k]
         end
-        return Geometry.Contravariant3Vector(RecursiveApply.rdiv(Dᵀ₂Wv₁ ⊟ Dᵀ₁Wv₂, local_geometry.WJ))
+        return Geometry.Contravariant3Vector(
+            RecursiveApply.rdiv(Dᵀ₂Wv₁ ⊟ Dᵀ₁Wv₂, local_geometry.WJ),
+        )
     elseif RT <: Geometry.Contravariant12Vector
-        work_array = CUDA.CuStaticSharedArray(FT, (Nq,Nq,Nf))
+        work_array = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nf))
         Wv₃ = IJF{IT, Nq}(work_array)
-        Wv₃[i,j] = W ⊠ Geometry.covariant3(arg, local_geometry)
+        Wv₃[i, j] = W ⊠ Geometry.covariant3(arg, local_geometry)
 
         CUDA.sync_threads()
 
-        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1,j]
-        Dᵀ₂Wv₃ = D[1, j] ⊠ Wv₃[i,1]
+        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, j]
+        Dᵀ₂Wv₃ = D[1, j] ⊠ Wv₃[i, 1]
         for k in 2:Nq
-            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊠ D[k, i] ⊠ Wv₃[k,j]
-            Dᵀ₂Wv₃ = Dᵀ₂Wv₃ ⊠ D[k, j] ⊠ Wv₃[i,k]
+            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, j]
+            Dᵀ₂Wv₃ = Dᵀ₂Wv₃ ⊞ D[k, j] ⊠ Wv₃[i, k]
         end
-        return Geometry.Contravariant12Vector(RecursiveApply.rdiv(Dᵀ₂Wv₃,local_geometry.WJ) ⊟(RecursiveApply.rdiv(Dᵀ₁Wv₃,local_geometry.WJ)))
+        return Geometry.Contravariant12Vector(
+            RecursiveApply.rdiv(Dᵀ₂Wv₃, local_geometry.WJ),
+            ⊟(RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ)),
+        )
     else
         error("invalid return type")
     end
diff --git a/src/Spaces/spectralelement.jl b/src/Spaces/spectralelement.jl
index df5160643f..7c2f166a68 100644
--- a/src/Spaces/spectralelement.jl
+++ b/src/Spaces/spectralelement.jl
@@ -20,7 +20,8 @@ function Base.show(io::IO, space::AbstractSpectralElementSpace)
     print(iio, " "^(indent + 2), space.quadrature_style)
 end
 
-Device.device(space::AbstractSpectralElementSpace) = Device.device(topology(space))
+Device.device(space::AbstractSpectralElementSpace) =
+    Device.device(topology(space))
 topology(space::AbstractSpectralElementSpace) = space.topology
 quadrature_style(space::AbstractSpectralElementSpace) = space.quadrature_style
 
@@ -177,17 +178,17 @@ struct SpectralElementSpace2D{
 end
 
 Adapt.adapt_structure(to, space::SpectralElementSpace2D) =
-SpectralElementSpace2D(
-    nothing, # drop topology
-    Adapt.adapt(to, space.quadrature_style),
-    Adapt.adapt(to, space.global_geometry),
-    Adapt.adapt(to, space.local_geometry),
-    Adapt.adapt(to, space.ghost_geometry),
-    Adapt.adapt(to, space.local_dss_weights),
-    Adapt.adapt(to, space.ghost_dss_weights),
-    Adapt.adapt(to, space.internal_surface_geometry),
-    Adapt.adapt(to, space.boundary_surface_geometries),
-)
+    SpectralElementSpace2D(
+        nothing, # drop topology
+        Adapt.adapt(to, space.quadrature_style),
+        Adapt.adapt(to, space.global_geometry),
+        Adapt.adapt(to, space.local_geometry),
+        Adapt.adapt(to, space.ghost_geometry),
+        Adapt.adapt(to, space.local_dss_weights),
+        Adapt.adapt(to, space.ghost_dss_weights),
+        Adapt.adapt(to, space.internal_surface_geometry),
+        Adapt.adapt(to, space.boundary_surface_geometries),
+    )
 
 
 
@@ -788,4 +789,3 @@ function Base.iterate(
         return ((i, j), e), ((i, j), e)
     end
 end
-
diff --git a/test/Operators/spectralelement/rectilinear_cuda.jl b/test/Operators/spectralelement/rectilinear_cuda.jl
index bceec3c120..ef47f54556 100644
--- a/test/Operators/spectralelement/rectilinear_cuda.jl
+++ b/test/Operators/spectralelement/rectilinear_cuda.jl
@@ -20,8 +20,12 @@ quad = Spaces.Quadratures.GLL{Nq}()
 grid_mesh = Meshes.RectilinearMesh(domain, 17, 16)
 
 
-grid_topology_cpu =
-    Topologies.Topology2D(ClimaComms.SingletonCommsContext(ClimaCore.Device.device(;disablegpu=true)), grid_mesh)
+grid_topology_cpu = Topologies.Topology2D(
+    ClimaComms.SingletonCommsContext(
+        ClimaCore.Device.device(; disablegpu = true),
+    ),
+    grid_mesh,
+)
 grid_space_cpu = Spaces.SpectralElementSpace2D(grid_topology_cpu, quad)
 coords_cpu = Fields.coordinate_field(grid_space_cpu)
 
@@ -31,8 +35,10 @@ grad = Operators.Gradient()
 gradf_cpu = grad.(f_cpu)
 
 
-grid_topology =
-    Topologies.Topology2D(ClimaComms.SingletonCommsContext(ClimaCore.Device.device()), grid_mesh)
+grid_topology = Topologies.Topology2D(
+    ClimaComms.SingletonCommsContext(ClimaCore.Device.device()),
+    grid_mesh,
+)
 grid_space = Spaces.SpectralElementSpace2D(grid_topology, quad)
 coords = Fields.coordinate_field(grid_space)
 
@@ -366,4 +372,4 @@ end
         @test ∇⁴y_ref ≈ ∇⁴y rtol = 2e-2
     end
 end
-==#
\ No newline at end of file
+==#

From 10ab6f8f47b15cda3171f8e9a3f7a7239ec2a99c Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 2 Feb 2023 17:44:44 -0800
Subject: [PATCH 09/16] add CUDA reduce

---
 src/DataLayouts/cuda.jl | 55 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/DataLayouts/cuda.jl b/src/DataLayouts/cuda.jl
index 6400acfd36..6d189d5a01 100644
--- a/src/DataLayouts/cuda.jl
+++ b/src/DataLayouts/cuda.jl
@@ -76,3 +76,58 @@ function Base.copyto!(
     CUDA.@cuda threads = (Nij, Nij) blocks = (Nh,) knl_copyto!(dest, bc)
     return dest
 end
+
+function knl_mapreduce!(dest, fn, op, src)
+
+    #=
+    nij, nh = size(dest)
+
+    thread_idx = CUDA.threadIdx().x
+    block_idx = CUDA.blockIdx().x
+    block_dim = CUDA.blockDim().x
+
+    # mapping to global idx to make insensitive
+    # to number of blocks / threads per device
+    global_idx = thread_idx + (block_idx - 1) * block_dim
+
+    nx, ny = nij, nij
+    i = global_idx % nx == 0 ? nx : global_idx % nx
+    j = cld(global_idx, nx)
+    h = ((global_idx-1) % (nx*nx)) + 1
+    =#
+
+    i = CUDA.threadIdx().x
+    j = CUDA.threadIdx().y
+
+    h = CUDA.blockIdx().x
+
+    p_dest = slab(dest, h)
+    p_src = slab(src, h)
+
+    if h == 1
+        @inbounds p_dest[i, j] = fn(p_src[i, j])
+    else
+        @inbounds p_dest[i, j] = op(p_dest[i, j], fn(p_src[i, j]))
+    end
+        
+    return nothing
+end
+
+function Base.reduce(
+    op::Op,
+    bc::Union{
+        IJFH{<:Any, Nij, A},
+        Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}},
+    },
+) where {Op, Nij, A <: CUDA.CuArray}
+    # mapreduce across DataSlab2D
+    _, _, _, _, Nh = size(bc)
+    S = eltype(bc)
+    T = eltype(A)
+    Nf = typesize(T, S)
+    # dest array
+    dest = IJF{S,Nij}(CuArray{T}(undef,Nij, Nij, Nf))
+    CUDA.@cuda threads = (Nij, Nij) blocks = (Nh,) knl_reduce!(dest, op, bc)
+    # copy to CPU, and final reduce
+    reduce(op, IJF{S,Nij}(Array(parent(dest))))
+end

From 4de5c8c8b590e5eb5d6a0f5e8bf0691e9998c036 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Mon, 6 Feb 2023 16:02:07 -0800
Subject: [PATCH 10/16] apply formatter

---
 examples/sphere/shallow_water_cuda.jl | 73 ++++++++++++++-------------
 src/DataLayouts/cuda.jl               |  6 +--
 src/Fields/broadcast.jl               |  7 ++-
 3 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/examples/sphere/shallow_water_cuda.jl b/examples/sphere/shallow_water_cuda.jl
index 53e2ba7358..12f4d2767b 100644
--- a/examples/sphere/shallow_water_cuda.jl
+++ b/examples/sphere/shallow_water_cuda.jl
@@ -448,41 +448,42 @@ end
 
 function rhs!(dYdt, y, parameters, t)
     #@nvtx "rhs!" color = colorant"red" begin
-        (; f, h_s, ghost_buffer, params) = parameters
-        (; D₄, g) = params
-
-        div = Operators.Divergence()
-        wdiv = Operators.WeakDivergence()
-        grad = Operators.Gradient()
-        wgrad = Operators.WeakGradient()
-        curl = Operators.Curl()
-        wcurl = Operators.WeakCurl()
-
-        # Compute hyperviscosity first
-        #@nvtx "Hyperviscosity (rhs!)" color = colorant"green" begin
-            @. dYdt.h = wdiv(grad(y.h))
-            @. dYdt.u =
-                wgrad(div(y.u)) - Geometry.Covariant12Vector(
-                    wcurl(Geometry.Covariant3Vector(curl(y.u))),
-                )
-
-            Spaces.weighted_dss2!(dYdt, ghost_buffer)
-
-            @. dYdt.h = -D₄ * wdiv(grad(dYdt.h))
-            # split to avoid "device kernel image is invalid (code 200, ERROR_INVALID_IMAGE)"
-            @. dYdt.u = wgrad(div(dYdt.u)) - Geometry.Covariant12Vector(wcurl(Geometry.Covariant3Vector(curl(dYdt.u))))
-            @. dYdt.u = -D₄ * dYdt.u
-        #end
-        #@nvtx "h and u (rhs!)" color = colorant"blue" begin
-            # Add in pieces
-            @. begin
-                dYdt.h += -wdiv(y.h * y.u)
-                dYdt.u +=
-                    -grad(g * (y.h + h_s) + norm(y.u)^2 / 2) #+
-                dYdt.u += y.u × (f + curl(y.u))
-            end
-            Spaces.weighted_dss2!(dYdt, ghost_buffer)
-        #end
+    (; f, h_s, ghost_buffer, params) = parameters
+    (; D₄, g) = params
+
+    div = Operators.Divergence()
+    wdiv = Operators.WeakDivergence()
+    grad = Operators.Gradient()
+    wgrad = Operators.WeakGradient()
+    curl = Operators.Curl()
+    wcurl = Operators.WeakCurl()
+
+    # Compute hyperviscosity first
+    #@nvtx "Hyperviscosity (rhs!)" color = colorant"green" begin
+    @. dYdt.h = wdiv(grad(y.h))
+    @. dYdt.u =
+        wgrad(div(y.u)) -
+        Geometry.Covariant12Vector(wcurl(Geometry.Covariant3Vector(curl(y.u))))
+
+    Spaces.weighted_dss2!(dYdt, ghost_buffer)
+
+    @. dYdt.h = -D₄ * wdiv(grad(dYdt.h))
+    # split to avoid "device kernel image is invalid (code 200, ERROR_INVALID_IMAGE)"
+    @. dYdt.u =
+        wgrad(div(dYdt.u)) - Geometry.Covariant12Vector(
+            wcurl(Geometry.Covariant3Vector(curl(dYdt.u))),
+        )
+    @. dYdt.u = -D₄ * dYdt.u
+    #end
+    #@nvtx "h and u (rhs!)" color = colorant"blue" begin
+    # Add in pieces
+    @. begin
+        dYdt.h += -wdiv(y.h * y.u)
+        dYdt.u += -grad(g * (y.h + h_s) + norm(y.u)^2 / 2) #+
+        dYdt.u += y.u × (f + curl(y.u))
+    end
+    Spaces.weighted_dss2!(dYdt, ghost_buffer)
+    #end
     #end
     return dYdt
 end
@@ -531,7 +532,7 @@ function shallow_water_driver_cuda(ARGS, ::Type{FT}) where {FT}
     Spaces.weighted_dss_ghost2!(Y, ghost_buffer)
 
     parameters =
-    (; f = f, h_s = h_s, ghost_buffer = ghost_buffer, params = test.params)
+        (; f = f, h_s = h_s, ghost_buffer = ghost_buffer, params = test.params)
     rhs!(dYdt, Y, parameters, 0.0)
 
     # Solve the ODE
diff --git a/src/DataLayouts/cuda.jl b/src/DataLayouts/cuda.jl
index 6d189d5a01..4dd729f1fa 100644
--- a/src/DataLayouts/cuda.jl
+++ b/src/DataLayouts/cuda.jl
@@ -109,7 +109,7 @@ function knl_mapreduce!(dest, fn, op, src)
     else
         @inbounds p_dest[i, j] = op(p_dest[i, j], fn(p_src[i, j]))
     end
-        
+
     return nothing
 end
 
@@ -126,8 +126,8 @@ function Base.reduce(
     T = eltype(A)
     Nf = typesize(T, S)
     # dest array
-    dest = IJF{S,Nij}(CuArray{T}(undef,Nij, Nij, Nf))
+    dest = IJF{S, Nij}(CuArray{T}(undef, Nij, Nij, Nf))
     CUDA.@cuda threads = (Nij, Nij) blocks = (Nh,) knl_reduce!(dest, op, bc)
     # copy to CPU, and final reduce
-    reduce(op, IJF{S,Nij}(Array(parent(dest))))
+    reduce(op, IJF{S, Nij}(Array(parent(dest))))
 end
diff --git a/src/Fields/broadcast.jl b/src/Fields/broadcast.jl
index ad770d52eb..e4bc9dd7b0 100644
--- a/src/Fields/broadcast.jl
+++ b/src/Fields/broadcast.jl
@@ -263,8 +263,11 @@ end
 end
 
 # types aren't isbits
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::Type{T}, args...) where {T} =
-    Base.Broadcast.broadcasted(fs, (x...) -> T(x...), args...)
+Base.Broadcast.broadcasted(
+    fs::AbstractFieldStyle,
+    ::Type{T},
+    args...,
+) where {T} = Base.Broadcast.broadcasted(fs, (x...) -> T(x...), args...)
 
 # Specialize handling of +, *, muladd, so that we can support broadcasting over NamedTuple element types
 # Required for ODE solvers

From 45d5a9297e02736ea6c1aabd6cb0d6da3ec8ee8a Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 8 Feb 2023 16:15:01 -0800
Subject: [PATCH 11/16] define device for extruded spaces

---
 src/Spaces/extruded.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Spaces/extruded.jl b/src/Spaces/extruded.jl
index ba37367287..3f4f39aee9 100644
--- a/src/Spaces/extruded.jl
+++ b/src/Spaces/extruded.jl
@@ -140,6 +140,8 @@ quadrature_style(space::ExtrudedFiniteDifferenceSpace) =
     space.horizontal_space.quadrature_style
 
 topology(space::ExtrudedFiniteDifferenceSpace) = space.horizontal_space.topology
+Device.device(space::ExtrudedFiniteDifferenceSpace) =
+    Device.device(topology(space))
 vertical_topology(space::ExtrudedFiniteDifferenceSpace) =
     space.vertical_topology
 

From e0277f5a4e602b94eca72f907d59b8f1591002c7 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 9 Feb 2023 13:21:54 -0800
Subject: [PATCH 12/16] define device for IntervalTopology

---
 src/Topologies/Topologies.jl | 2 ++
 src/Topologies/interval.jl   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/Topologies/Topologies.jl b/src/Topologies/Topologies.jl
index 6dda097e41..319306df09 100644
--- a/src/Topologies/Topologies.jl
+++ b/src/Topologies/Topologies.jl
@@ -2,6 +2,8 @@ module Topologies
 
 using DocStringExtensions
 
+import ClimaComms
+
 import ..Device
 import ..Geometry
 import ..Domains: Domains, coordinate_type
diff --git a/src/Topologies/interval.jl b/src/Topologies/interval.jl
index c89ef76191..5d0ce30ed1 100644
--- a/src/Topologies/interval.jl
+++ b/src/Topologies/interval.jl
@@ -10,6 +10,8 @@ struct IntervalTopology{M <: Meshes.IntervalMesh, B} <: AbstractIntervalTopology
     boundaries::B
 end
 
+Device.device(::IntervalTopology) = ClimaComms.CPU()
+
 function IntervalTopology(mesh::Meshes.IntervalMesh)
     if Domains.isperiodic(mesh.domain)
         boundaries = NamedTuple()

From 1910ed9350f176c21811ee4f1223078811ad4ab2 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 9 Feb 2023 17:30:49 -0800
Subject: [PATCH 13/16] add operator unit tests

---
 .../spectralelement/rectilinear_cuda.jl       | 29 +++++++++++++++----
 test/runtests.jl                              |  1 +
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/test/Operators/spectralelement/rectilinear_cuda.jl b/test/Operators/spectralelement/rectilinear_cuda.jl
index ef47f54556..52c4e6a158 100644
--- a/test/Operators/spectralelement/rectilinear_cuda.jl
+++ b/test/Operators/spectralelement/rectilinear_cuda.jl
@@ -30,10 +30,7 @@ grid_space_cpu = Spaces.SpectralElementSpace2D(grid_topology_cpu, quad)
 coords_cpu = Fields.coordinate_field(grid_space_cpu)
 
 f_cpu = sin.(coords_cpu.x .+ 2 .* coords_cpu.y)
-
-grad = Operators.Gradient()
-gradf_cpu = grad.(f_cpu)
-
+g_cpu = Geometry.UVVector.(sin.(coords_cpu.x), 2 .* cos.(coords_cpu.y))
 
 grid_topology = Topologies.Topology2D(
     ClimaComms.SingletonCommsContext(ClimaCore.Device.device()),
@@ -44,10 +41,32 @@ coords = Fields.coordinate_field(grid_space)
 
 CUDA.allowscalar(false)
 f = sin.(coords.x .+ 2 .* coords.y)
+g = Geometry.UVVector.(sin.(coords.x), 2 .* cos.(coords.y))
 
 grad = Operators.Gradient()
-gradf = grad.(f)
+wgrad = Operators.WeakGradient()
+
+@test Array(parent(grad.(f))) ≈ parent(grad.(f_cpu))
+@test Array(parent(wgrad.(f))) ≈ parent(wgrad.(f_cpu))
+
+div = Operators.Divergence()
+wdiv = Operators.WeakDivergence()
+
+@test Array(parent(div.(g))) ≈ parent(div.(g_cpu))
+@test Array(parent(wdiv.(g))) ≈ parent(wdiv.(g_cpu))
+@test Array(parent(div.(grad.(f)))) ≈ parent(div.(grad.(f_cpu))) # composite
+
+curl = Operators.Curl()
+wcurl = Operators.WeakCurl()
 
+@test Array(parent(curl.(Geometry.Covariant12Vector.(g)))) ≈
+      parent(curl.(Geometry.Covariant12Vector.(g_cpu)))
+@test Array(parent(curl.(Geometry.Covariant3Vector.(f)))) ≈
+      parent(curl.(Geometry.Covariant3Vector.(f_cpu)))
+@test Array(parent(wcurl.(Geometry.Covariant12Vector.(g)))) ≈
+      parent(wcurl.(Geometry.Covariant12Vector.(g)))
+@test Array(parent(wcurl.(Geometry.Covariant3Vector.(f)))) ≈
+      parent(wcurl.(Geometry.Covariant3Vector.(f_cpu)))
 
 
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 9348bcd03c..0faf444497 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -99,6 +99,7 @@ if "CUDA" in ARGS
     @safetestset "GPU - data" begin @time include("gpu/data.jl") end
     @safetestset "GPU - device" begin @time include("gpu/device.jl") end
     @safetestset "Spaces - serial CUDA DSS" begin @time include("Spaces/ddss1.jl") end
+    @safetestset "Operators - CUDA" begin @time include("Operators/spectralelement/rectilinear_cuda.jl") end
 end
 
 #! format: on

From 943d4cd6a2905ba8b649d55d40c01723bc1e7ebf Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Fri, 10 Feb 2023 10:37:22 -0800
Subject: [PATCH 14/16] fix WeakCurl, update tests

---
 src/Operators/spectralelement.jl              |   8 +-
 .../spectralelement/rectilinear_cuda.jl       | 335 +-----------------
 2 files changed, 11 insertions(+), 332 deletions(-)

diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index 47183e52bb..64f6a744ec 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -1349,8 +1349,8 @@ function apply_operator_kernel(op::WeakCurl{(1, 2)}, space, ij, slabidx, arg)
         Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, j]
         Dᵀ₂Wv₁ = D[1, j] ⊠ Wv₁[i, 1]
         for k in 2:Nq
-            Dᵀ₁v₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, j]
-            Dᵀ₂v₁ = Dᵀ₂Wv₁ ⊞ D[k, j] ⊠ Wv₁[i, k]
+            Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, j]
+            Dᵀ₂Wv₁ = Dᵀ₂Wv₁ ⊞ D[k, j] ⊠ Wv₁[i, k]
         end
         return Geometry.Contravariant3Vector(
             RecursiveApply.rdiv(Dᵀ₂Wv₁ ⊟ Dᵀ₁Wv₂, local_geometry.WJ),
@@ -1369,8 +1369,8 @@ function apply_operator_kernel(op::WeakCurl{(1, 2)}, space, ij, slabidx, arg)
             Dᵀ₂Wv₃ = Dᵀ₂Wv₃ ⊞ D[k, j] ⊠ Wv₃[i, k]
         end
         return Geometry.Contravariant12Vector(
-            RecursiveApply.rdiv(Dᵀ₂Wv₃, local_geometry.WJ),
-            ⊟(RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ)),
+            ⊟(RecursiveApply.rdiv(Dᵀ₂Wv₃, local_geometry.WJ)),
+            RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ),
         )
     else
         error("invalid return type")
diff --git a/test/Operators/spectralelement/rectilinear_cuda.jl b/test/Operators/spectralelement/rectilinear_cuda.jl
index 52c4e6a158..18cafae580 100644
--- a/test/Operators/spectralelement/rectilinear_cuda.jl
+++ b/test/Operators/spectralelement/rectilinear_cuda.jl
@@ -30,7 +30,11 @@ grid_space_cpu = Spaces.SpectralElementSpace2D(grid_topology_cpu, quad)
 coords_cpu = Fields.coordinate_field(grid_space_cpu)
 
 f_cpu = sin.(coords_cpu.x .+ 2 .* coords_cpu.y)
-g_cpu = Geometry.UVVector.(sin.(coords_cpu.x), 2 .* cos.(coords_cpu.y))
+g_cpu =
+    Geometry.UVVector.(
+        sin.(coords_cpu.x),
+        2 .* cos.(coords_cpu.y .+ coords_cpu.x),
+    )
 
 grid_topology = Topologies.Topology2D(
     ClimaComms.SingletonCommsContext(ClimaCore.Device.device()),
@@ -41,7 +45,7 @@ coords = Fields.coordinate_field(grid_space)
 
 CUDA.allowscalar(false)
 f = sin.(coords.x .+ 2 .* coords.y)
-g = Geometry.UVVector.(sin.(coords.x), 2 .* cos.(coords.y))
+g = Geometry.UVVector.(sin.(coords.x), 2 .* cos.(coords.y .+ coords.x))
 
 grad = Operators.Gradient()
 wgrad = Operators.WeakGradient()
@@ -64,331 +68,6 @@ wcurl = Operators.WeakCurl()
 @test Array(parent(curl.(Geometry.Covariant3Vector.(f)))) ≈
       parent(curl.(Geometry.Covariant3Vector.(f_cpu)))
 @test Array(parent(wcurl.(Geometry.Covariant12Vector.(g)))) ≈
-      parent(wcurl.(Geometry.Covariant12Vector.(g)))
+      parent(wcurl.(Geometry.Covariant12Vector.(g_cpu)))
 @test Array(parent(wcurl.(Geometry.Covariant3Vector.(f)))) ≈
       parent(wcurl.(Geometry.Covariant3Vector.(f_cpu)))
-
-
-
-
-
-
-#==
-
-ts_mesh = Meshes.RectilinearMesh(domain, 17, 16)
-ts_topology = Topologies.Topology2D(ClimaComms.SingletonCommsContext(), ts_mesh)
-ts_space = Spaces.SpectralElementSpace2D(ts_topology, quad)
-ts_coords = Fields.coordinate_field(ts_space)
-
-grid_test_setup = (grid_topology, grid_space, grid_coords)
-ts_test_setup = (ts_topology, ts_space, ts_coords)
-
-@testset "interpolate / restrict" begin
-
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        INq = 9
-        Iquad = Spaces.Quadratures.GLL{INq}()
-        Ispace = Spaces.SpectralElementSpace2D(topology, Iquad)
-
-        I = Operators.Interpolate(Ispace)
-        R = Operators.Restrict(space)
-
-        f = sin.(coords.x .+ 2 .* coords.y)
-
-        interpolated_field = I.(f)
-        Spaces.weighted_dss2!(interpolated_field)
-
-        @test axes(interpolated_field).quadrature_style == Iquad
-        @test axes(interpolated_field).topology == topology
-
-        restrict_field = R.(f)
-        Spaces.weighted_dss2!(restrict_field)
-
-        @test axes(restrict_field).quadrature_style == quad
-        @test axes(restrict_field).topology == topology
-
-        interp_restrict_field = R.(I.(f))
-        Spaces.weighted_dss2!(interp_restrict_field)
-
-        @test axes(interp_restrict_field).quadrature_style == quad
-        @test axes(interp_restrict_field).topology == topology
-
-        @test norm(interp_restrict_field .- f) ≤ 3.0e-4
-    end
-end
-
-@testset "gradient" begin
-
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        f = sin.(coords.x .+ 2 .* coords.y)
-
-        grad = Operators.Gradient()
-        gradf = grad.(f)
-        Spaces.weighted_dss2!(gradf)
-
-        @test gradf ≈
-              Geometry.Covariant12Vector.(
-            Geometry.UVVector.(
-                cos.(coords.x .+ 2 .* coords.y),
-                2 .* cos.(coords.x .+ 2 .* coords.y),
-            ),
-        ) rtol = 1e-2
-
-        fv =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(coords.x .+ 2 .* coords.y),
-            )
-        gradfv = Geometry.transform.(Ref(Geometry.UVAxis()), grad.(fv))
-        Spaces.weighted_dss2!(gradfv)
-        @test eltype(gradfv) <: Geometry.Axis2Tensor
-    end
-end
-
-
-@testset "weak gradient" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        f = sin.(coords.x .+ 2 .* coords.y)
-
-        wgrad = Operators.WeakGradient()
-        gradf = wgrad.(f)
-        Spaces.weighted_dss2!(gradf)
-
-        @test Geometry.UVVector.(gradf) ≈
-              Geometry.UVVector.(
-            cos.(coords.x .+ 2 .* coords.y),
-            2 .* cos.(coords.x .+ 2 .* coords.y),
-        ) rtol = 1e-2
-    end
-end
-
-@testset "curl" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(3 .* coords.x .+ 4 .* coords.y),
-            )
-
-        curl = Operators.Curl()
-        curlv = curl.(Geometry.Covariant12Vector.(v))
-        Spaces.weighted_dss2!(curlv)
-        curlv_ref =
-            Geometry.Contravariant3Vector.(
-                .-3 .* sin.(3 .* coords.x .+ 4 .* coords.y) .-
-                2 .* cos.(coords.x .+ 2 .* coords.y),
-            )
-
-        @test curlv ≈ curlv_ref rtol = 1e-2
-    end
-end
-
-@testset "curl-curl" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(3 .* coords.x .+ 2 .* coords.y),
-            )
-        curlv_ref =
-            .-3 .* sin.(3 .* coords.x .+ 2 .* coords.y) .-
-            2 .* cos.(coords.x .+ 2 .* coords.y)
-        curlcurlv_ref1 =
-            .-6 .* cos.(3 .* coords.x .+ 2 .* coords.y) .+
-            4 .* sin.(coords.x .+ 2 .* coords.y)
-        curlcurlv_ref2 =
-            9 .* cos.(3 .* coords.x .+ 2 .* coords.y) .-
-            2 .* sin.(coords.x .+ 2 .* coords.y)
-
-        curl = Operators.Curl()
-        curlcurlv =
-            curl.(
-                Geometry.Covariant3Vector.(
-                    curl.(Geometry.Covariant12Vector.(v)),
-                ),
-            )
-        Spaces.weighted_dss2!(curlcurlv)
-
-        @test Geometry.UVVector.(curlcurlv) ≈
-              Geometry.UVVector.(curlcurlv_ref1, curlcurlv_ref2) rtol = 4e-2
-    end
-end
-
-@testset "weak curl-strong curl" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(3 .* coords.x .+ 2 .* coords.y),
-            )
-        curlv_ref =
-            .-3 .* sin.(3 .* coords.x .+ 2 .* coords.y) .-
-            2 .* cos.(coords.x .+ 2 .* coords.y)
-        curlcurlv_ref1 =
-            .-6 .* cos.(3 .* coords.x .+ 2 .* coords.y) .+
-            4 .* sin.(coords.x .+ 2 .* coords.y)
-        curlcurlv_ref2 =
-            9 .* cos.(3 .* coords.x .+ 2 .* coords.y) .-
-            2 .* sin.(coords.x .+ 2 .* coords.y)
-
-        curl = Operators.Curl()
-        wcurl = Operators.WeakCurl()
-        curlcurlv =
-            Geometry.UVVector.(
-                wcurl.(
-                    Geometry.Covariant3Vector.(
-                        curl.(Geometry.Covariant12Vector.(v)),
-                    ),
-                ),
-            )
-        Spaces.weighted_dss2!(curlcurlv)
-
-        @test curlcurlv ≈ Geometry.UVVector.(curlcurlv_ref1, curlcurlv_ref2) rtol =
-            4e-2
-    end
-end
-
-@testset "weak curl" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(3 .* coords.x .+ 4 .* coords.y),
-            )
-
-        wcurl = Operators.WeakCurl()
-        curlv = wcurl.(Geometry.Covariant12Vector.(v))
-        Spaces.weighted_dss2!(curlv)
-        curlv_ref =
-            .-3 .* sin.(3 .* coords.x .+ 4 .* coords.y) .-
-            2 .* cos.(coords.x .+ 2 .* coords.y)
-
-        @test curlv ≈ Geometry.Contravariant3Vector.(curlv_ref) rtol = 1e-2
-    end
-end
-
-@testset "div" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(3 .* coords.x .+ 2 .* coords.y),
-            )
-
-        div = Operators.Divergence()
-        divv = div.(v)
-        Spaces.weighted_dss2!(divv)
-        divv_ref =
-            cos.(coords.x .+ 2 .* coords.y) .-
-            2 .* sin.(3 .* coords.x .+ 2 .* coords.y)
-
-        @test divv ≈ divv_ref rtol = 1e-2
-    end
-end
-
-
-@testset "weak div" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v =
-            Geometry.UVVector.(
-                sin.(coords.x .+ 2 .* coords.y),
-                cos.(3 .* coords.x .+ 2 .* coords.y),
-            )
-
-        wdiv = Operators.WeakDivergence()
-        divv = wdiv.(v)
-        Spaces.weighted_dss2!(divv)
-        divv_ref =
-            cos.(coords.x .+ 2 .* coords.y) .-
-            2 .* sin.(3 .* coords.x .+ 2 .* coords.y)
-
-        @test divv ≈ divv_ref rtol = 1e-2
-    end
-end
-
-
-@testset "annhilator property: curl-grad" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        f = sin.(coords.x .+ 2 .* coords.y)
-
-        grad = Operators.Gradient()
-        gradf = grad.(f)
-        Spaces.weighted_dss2!(gradf)
-
-        curl = Operators.Curl()
-        curlgradf = curl.(gradf)
-        Spaces.weighted_dss2!(curlgradf)
-
-        @test norm(curlgradf) < 1e-12
-    end
-end
-
-@testset "annhilator property: div-curl" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        v = Geometry.Covariant3Vector.(sin.(coords.x .+ 2 .* coords.y))
-        curl = Operators.Curl()
-        curlv = curl.(v)
-        Spaces.weighted_dss2!(curlv)
-
-        div = Operators.Divergence()
-        divcurlv = div.(curlv)
-        Spaces.weighted_dss2!(divcurlv)
-
-        @test norm(divcurlv) < 1e-12
-    end
-end
-
-@testset "scalar hyperdiffusion" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        k = 2
-        l = 3
-        y = @. sin(k * coords.x + l * coords.y)
-        ∇⁴y_ref = @. (k^2 + l^2)^2 * sin(k * coords.x + l * coords.y)
-
-        wdiv = Operators.WeakDivergence()
-        grad = Operators.Gradient()
-        χ = Spaces.weighted_dss2!(@. wdiv(grad(y)))
-        ∇⁴y = Spaces.weighted_dss2!(@. wdiv(grad(χ)))
-
-        @test ∇⁴y_ref ≈ ∇⁴y rtol = 2e-2
-    end
-end
-
-@testset "vector hyperdiffusion" begin
-    for (topology, space, coords) in (grid_test_setup, ts_test_setup)
-        k = 2
-        l = 3
-        y = @. Geometry.UVVector(sin(k * coords.x + l * coords.y), 0.0)
-        ∇⁴y_ref = @. Geometry.UVVector(
-            (k^2 + l^2)^2 * sin(k * coords.x + l * coords.y),
-            0.0,
-        )
-        curl = Operators.Curl()
-        wcurl = Operators.WeakCurl()
-
-        sdiv = Operators.Divergence()
-        wgrad = Operators.WeakGradient()
-
-        χ = Spaces.weighted_dss2!(
-            @. Geometry.UVVector(wgrad(sdiv(y))) - Geometry.UVVector(
-                wcurl(
-                    Geometry.Covariant3Vector(
-                        curl(Geometry.Covariant12Vector(y)),
-                    ),
-                ),
-            )
-        )
-        ∇⁴y = Spaces.weighted_dss2!(
-            @. Geometry.UVVector(wgrad(sdiv(χ))) - Geometry.UVVector(
-                wcurl(
-                    Geometry.Covariant3Vector(
-                        curl(Geometry.Covariant12Vector(χ)),
-                    ),
-                ),
-            )
-        )
-
-        @test ∇⁴y_ref ≈ ∇⁴y rtol = 2e-2
-    end
-end
-==#

From 5a964387e45c0eacee5cd215a6af88671019a2b9 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 23 Feb 2023 14:23:39 -0800
Subject: [PATCH 15/16] add CUDA shallowwater example to buildkite

---
 .buildkite/pipeline.yml               | 9 +++++++++
 examples/sphere/shallow_water_cuda.jl | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index dcc7522643..713a4b8822 100755
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -544,6 +544,15 @@ steps:
         artifact_paths:
           - "examples/sphere/output/cg_sphere_shallowwater_rossby_haurwitz_alpha0/*"
 
+      - label: ":flower_playing_cards: CUDA Shallow-water 2D sphere"
+        key: "cuda_shallowwater_2d_cg_sphere"
+        command:
+          - "julia --color=yes --project=examples examples/sphere/shallow_water_cuda.jl"
+        artifact_paths:
+          - "examples/sphere/output/cuda_shallowwater_2d_cg_sphere/*"
+        agent:
+          slurm_gpu: 1
+
   - group: "Examples hybrid sphere"
     steps:
 
diff --git a/examples/sphere/shallow_water_cuda.jl b/examples/sphere/shallow_water_cuda.jl
index 0263547d1e..77bfe90338 100644
--- a/examples/sphere/shallow_water_cuda.jl
+++ b/examples/sphere/shallow_water_cuda.jl
@@ -550,9 +550,9 @@ function shallow_water_driver_cuda(ARGS, ::Type{FT}) where {FT}
         adaptive = false,
         progress_message = (dt, u, p, t) -> t,
     )
-    =#
 
     sol = @timev OrdinaryDiffEq.solve!(integrator)
+    =#
     return nothing
 end
 

From f9ab346a0e6a3b1db5fcbfb6231806d82aaacbf1 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Thu, 23 Feb 2023 14:25:26 -0800
Subject: [PATCH 16/16] remove broken mapreduce

---
 .buildkite/pipeline.yml |  4 +--
 src/DataLayouts/cuda.jl | 55 -----------------------------------------
 2 files changed, 2 insertions(+), 57 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 713a4b8822..7e1fc963da 100755
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -550,8 +550,8 @@ steps:
           - "julia --color=yes --project=examples examples/sphere/shallow_water_cuda.jl"
         artifact_paths:
           - "examples/sphere/output/cuda_shallowwater_2d_cg_sphere/*"
-        agent:
-          slurm_gpu: 1
+        agents:
+          slurm_gpus: 1
 
   - group: "Examples hybrid sphere"
     steps:
diff --git a/src/DataLayouts/cuda.jl b/src/DataLayouts/cuda.jl
index 4dd729f1fa..6400acfd36 100644
--- a/src/DataLayouts/cuda.jl
+++ b/src/DataLayouts/cuda.jl
@@ -76,58 +76,3 @@ function Base.copyto!(
     CUDA.@cuda threads = (Nij, Nij) blocks = (Nh,) knl_copyto!(dest, bc)
     return dest
 end
-
-function knl_mapreduce!(dest, fn, op, src)
-
-    #=
-    nij, nh = size(dest)
-
-    thread_idx = CUDA.threadIdx().x
-    block_idx = CUDA.blockIdx().x
-    block_dim = CUDA.blockDim().x
-
-    # mapping to global idx to make insensitive
-    # to number of blocks / threads per device
-    global_idx = thread_idx + (block_idx - 1) * block_dim
-
-    nx, ny = nij, nij
-    i = global_idx % nx == 0 ? nx : global_idx % nx
-    j = cld(global_idx, nx)
-    h = ((global_idx-1) % (nx*nx)) + 1
-    =#
-
-    i = CUDA.threadIdx().x
-    j = CUDA.threadIdx().y
-
-    h = CUDA.blockIdx().x
-
-    p_dest = slab(dest, h)
-    p_src = slab(src, h)
-
-    if h == 1
-        @inbounds p_dest[i, j] = fn(p_src[i, j])
-    else
-        @inbounds p_dest[i, j] = op(p_dest[i, j], fn(p_src[i, j]))
-    end
-
-    return nothing
-end
-
-function Base.reduce(
-    op::Op,
-    bc::Union{
-        IJFH{<:Any, Nij, A},
-        Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}},
-    },
-) where {Op, Nij, A <: CUDA.CuArray}
-    # mapreduce across DataSlab2D
-    _, _, _, _, Nh = size(bc)
-    S = eltype(bc)
-    T = eltype(A)
-    Nf = typesize(T, S)
-    # dest array
-    dest = IJF{S, Nij}(CuArray{T}(undef, Nij, Nij, Nf))
-    CUDA.@cuda threads = (Nij, Nij) blocks = (Nh,) knl_reduce!(dest, op, bc)
-    # copy to CPU, and final reduce
-    reduce(op, IJF{S, Nij}(Array(parent(dest))))
-end