diff --git a/examples/madness/mra-device/functionnode.h b/examples/madness/mra-device/functionnode.h
index b64e513ea..b2514f4bc 100644
--- a/examples/madness/mra-device/functionnode.h
+++ b/examples/madness/mra-device/functionnode.h
@@ -52,11 +52,11 @@ namespace mra {
     }
 
     template <typename T>
-    void distancesq(const Coordinate<T,3>& p, const TensorView<T,2>& q, T* rsq, std::size_t N) {
+    void distancesq(const Coordinate<T,3>& p, const TensorView<T,1>& q, T* rsq, std::size_t N) {
         const T x = p(0);
 #ifdef __CUDA_ARCH__
-        int tid = threadIdx.x*blockDim.x + threadIdx.y;
-        for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) {
+        int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x;
+        for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) {
             T xx = q(0,i) - x;
             rsq[i] = xx*xx;
         }
@@ -73,8 +73,8 @@ namespace mra {
         const T x = p(0);
         const T y = p(1);
 #ifdef __CUDA_ARCH__
-        int tid = threadIdx.x*blockDim.x + threadIdx.y;
-        for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) {
+        int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x;
+        for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) {
             T xx = q(0,i) - x;
             T yy = q(1,i) - y;
             rsq[i] = xx*xx + yy*yy;
@@ -89,13 +89,13 @@ namespace mra {
     }
 
     template <typename T>
-    void distancesq(const Coordinate<T,3>& p, const TensorView<T,2>& q, T* rsq, std::size_t N) {
+    void distancesq(const Coordinate<T,3>& p, const TensorView<T,3>& q, T* rsq, std::size_t N) {
         const T x = p(0);
         const T y = p(1);
         const T z = p(2);
 #ifdef __CUDA_ARCH__
-        int tid = threadIdx.x*blockDim.x + threadIdx.y;
-        for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) {
+        int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x;
+        for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) {
             T xx = q(0,i) - x;
             T yy = q(1,i) - y;
             T zz = q(2,i) - z;
diff --git a/examples/madness/mra-device/kernels.cu b/examples/madness/mra-device/kernels.cu
index 3bf66278b..e5e265904 100644
--- a/examples/madness/mra-device/kernels.cu
+++ b/examples/madness/mra-device/kernels.cu
@@ -8,7 +8,7 @@
 
 /// Make outer product of quadrature points for vectorized algorithms
 template<typename T>
-__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::integral_constant<1>) {
+__device__ void make_xvec(const mra::TensorView<T,2>& x, mra::TensorView<T,2>& xvec, std::integral_constant<1>) {
   /* uses threads in 3 dimensions */
   xvec = x;
   /* TensorView assignment synchronizes */
@@ -16,7 +16,7 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::
 
 /// Make outer product of quadrature points for vectorized algorithms
 template<typename T>
-__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::integral_constant<2>) {
+__device__ void make_xvec(const mra::TensorView<T,2>& x, mra::TensorView<T,2>& xvec, std::integral_constant<2>) {
   const std::size_t K = x.dim(1);
   if (threadId.z == 0) {
     for (size_t i=blockIdx.y; i<K; i += blockDim.y) {
@@ -32,7 +32,7 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::
 
 /// Make outer product of quadrature points for vectorized algorithms
 template<typename T>
-__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::integral_constant<3>) {
+__device__ void make_xvec(const mra::TensorView<T,2>& x, mra::TensorView<T,2>& xvec, std::integral_constant<3>) {
   const std::size_t K = x.dim(1);
   for (size_t i=threadIdx.z; i<K; i += blockDim.z) {
     for (size_t j=blockIdx.y; j<K; j += blockDim.y) {
@@ -48,17 +48,17 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::
 }
 
 
-template <typename functorT, typename T, Dimension NDIM>
+template <typename functorT, typename T, mra::Dimension NDIM>
 __device__
 void fcube(const functorT& f,
-           const Key<NDIM>& key,
+           const mra::Key<NDIM>& key,
            const T thresh,
            // output
-           TensorView<T,2>& values,
+           mra::TensorView<T,2>& values,
            std::size_t K,
            // temporaries
-           TensorView<T, NDIM> x,
-           TensorView<T, 2> xvec) {
+           mra::TensorView<T, NDIM> x,
+           mra::TensorView<T, 2> xvec) {
   if (is_negligible(f,Domain<NDIM>:: template bounding_box<T>(key),truncate_tol(key,thresh))) {
       values = 0.0;
       /* TensorView assigment synchronizes */
@@ -149,7 +149,7 @@ std::array<mra::Slice, NDIM> get_child_slice(mra::Key<NDIM> key, std::size_t K,
   return slices;
 }
 
-template<typename Fn, typename T, std::size_t NDIM>
+template<typename Fn, typename T, mra::Dimension NDIM>
 __global__ fcoeffs_kernel1(
   const Fn f,
   mra::Key<NDIM> key,
@@ -180,7 +180,7 @@ __global__ fcoeffs_kernel1(
   }
 }
 
-template<typename T, std::size_t NDIM>
+template<typename T, mra::Dimension NDIM>
 __global__ fcoeffs_kernel2(
   mra::Key<NDIM> key,
   T* coeffs_ptr,
@@ -217,7 +217,7 @@ __global__ fcoeffs_kernel2(
   }
 }
 
-template<typename Fn, typename T, std::size_t NDIM>
+template<typename Fn, typename T, mra::Dimension NDIM>
 void submit_fcoeffs_kernel(
   const Fn* fn,
   const mra::Key<NDIM>& key,
diff --git a/examples/madness/mra-device/kernels.h b/examples/madness/mra-device/kernels.h
index e26c27038..51f38e6ca 100644
--- a/examples/madness/mra-device/kernels.h
+++ b/examples/madness/mra-device/kernels.h
@@ -12,7 +12,7 @@ typedef int cudaStream;
 
 /* Returns the total size of temporary memory needed for
  * the project() kernel. */
-template<std::size_t NDIM>
+template<mra::Dimension NDIM>
 std::size_t project_tmp_size(std::size_t K) {
   const size_t K2NDIM = std::pow(K,NDIM);
   return (NDIM*K2NDIM) // xvec in fcube
@@ -20,7 +20,7 @@ std::size_t project_tmp_size(std::size_t K) {
 }
 
 /* Explicitly instantiated for 1, 2, 3 dimensional Gaussians */
-template<typename Fn, typename T, std::size_t NDIM>
+template<typename Fn, typename T, mra::Dimension NDIM>
 void submit_fcoeffs_kernel(
   const Fn* fn,
   const mra::Key<NDIM>& key,
diff --git a/examples/madness/mra-device/mrattg-device.cc b/examples/madness/mra-device/mrattg-device.cc
index be5072eaa..477a4f8e8 100644
--- a/examples/madness/mra-device/mrattg-device.cc
+++ b/examples/madness/mra-device/mrattg-device.cc
@@ -8,6 +8,7 @@
 #include "functionfunctor.h"
 #include "../../mrakey.h"
 
+#if 0
 /// Project the scaling coefficients using screening and test norm of difference coeffs.  Return true if difference coeffs negligible.
 template <typename FnT, typename T, mra::Dimension NDIM>
 static bool fcoeffs(
@@ -60,6 +61,7 @@ static bool fcoeffs(
   }
   co_return status;
 }
+#endif // 0
 
 template<typename FnT, typename T, mra::Dimension NDIM>
 auto make_project(
@@ -93,7 +95,49 @@ auto make_project(
     }
     else {
       /* here we actually compute: first select a device */
-      result.is_leaf = fcoeffs(f, functiondata, key, thresh, coeffs);
+      //result.is_leaf = fcoeffs(f, functiondata, key, thresh, coeffs);
+      /**
+       * BEGIN FCOEFFS HERE
+       * TODO: figure out a way to outline this into a function or coroutine
+       */
+
+      /* global function data */
+      // TODO: need to make our own FunctionData with dynamic K
+      const auto& phibar = functiondata.get_phibar();
+      const auto& hgT = functiondata.get_hgT();
+
+      const std::size_t K = coeffs.dim(0);
+
+      /* temporaries */
+      bool is_leaf;
+      auto is_leaf_scratch = ttg::make_scratch(&is_leaf, ttg::scope::Allocate);
+      const std::size_t tmp_size = project_tmp_size<NDIM>(K);
+      T* tmp = new T[tmp_size]; // TODO: move this into make_scratch()
+      auto tmp_scratch = ttg::make_scratch(tmp, ttg::scope::Allocate, tmp_size);
+
+      /* TODO: cannot do this from a function, need to move it into the main task */
+      co_await ttg::device::select(f, coeffs.buffer(), phibar.buffer(),
+                                   hgT.buffer(), tmp_scratch, is_leaf_scratch);
+      auto coeffs_view = coeffs.current_view();
+      auto phibar_view = phibar.current_view();
+      auto hgT_view    = hgT.current_view();
+      T* tmp_device = tmp_scratch.device_ptr();
+      bool *is_leaf_device = is_leaf_scratch.device_ptr();
+      FnT* f_ptr = f.current_device_ptr();
+
+      /* submit the kernel */
+      submit_fcoeffs_kernel(f_ptr, key, coeffs_view, phibar_view, hgT_view, tmp_device,
+                            is_leaf_device, ttg::device::current_stream());
+
+      /* wait and get is_leaf back */
+      co_await ttg::device::wait(is_leaf_scratch);
+      result.is_leaf = is_leaf;
+      /* todo: is this safe? */
+      delete[] tmp;
+      /**
+       * END FCOEFFS HERE
+       */
+
       if (!result.is_leaf) {
         std::vector<mra::Key<NDIM>> bcast_keys;
         for (auto child : children(key)) bcast_keys.push_back(child);
diff --git a/examples/madness/mra-device/tensor.h b/examples/madness/mra-device/tensor.h
index 3b4887a4c..77a7ceaef 100644
--- a/examples/madness/mra-device/tensor.h
+++ b/examples/madness/mra-device/tensor.h
@@ -17,7 +17,8 @@ namespace mra {
     using value_type = std::decay_t<T>;
     using size_type = std::size_t;
     using allocator_type = Allocator;
-    using view_type = TensorView<T, NDIM>;
+    using view_type = TensorView<value_type, NDIM>;
+    using const_view_type = TensorView<std::add_const_t<value_type>, NDIM>;
     using buffer_type = ttg::Buffer<value_type, allocator_type>;
 
     static constexpr Dimension ndim() { return NDIM; }
@@ -75,11 +76,15 @@ namespace mra {
       return std::reduce(&m_dims[0], &m_dims[ndim()], 1, std::multiplies<size_type>{});
     }
 
-    auto get_buffer() {
+    size_type dim(Dimension dim) const {
+      return m_dims[dim];
+    }
+
+    auto& buffer() {
       return m_buffer;
     }
 
-    const auto get_buffer() const {
+    const auto& buffer() const {
       return m_buffer;
     }
 
@@ -91,10 +96,17 @@ namespace mra {
       return m_buffer.host_ptr();
     }
 
-    /* returns a view for the current memory space */
+    /* returns a view for the current memory space
+     * TODO: handle const correctness (const Tensor should return a const TensorView)*/
     view_type current_view() {
       return view_type(m_buffer.current_device_ptr(), m_dims);
     }
+
+    /* returns a view for the current memory space
+     * TODO: handle const correctness (const Tensor should return a const TensorView)*/
+    const_view_type current_view() const {
+      return const_view_type(m_buffer.current_device_ptr(), m_dims);
+    }
   };
 
 } // namespace mra
diff --git a/examples/madness/mra-device/tensorview.h b/examples/madness/mra-device/tensorview.h
index 41398e820..1c66d767d 100644
--- a/examples/madness/mra-device/tensorview.h
+++ b/examples/madness/mra-device/tensorview.h
@@ -66,8 +66,9 @@ namespace mra {
   class TensorView {
   public:
     using value_type = std::decay_t<T>;
+    using const_value_type = std::add_const_t<value_type>;
     using size_type = std::size_t;
-    static constexpr int ndim() { return NDIM; }
+    static constexpr Dimension ndim() { return NDIM; }
     using dims_array_t = std::array<size_type, ndim()>;
     static constexpr bool is_tensor() { return true; }
 
@@ -157,7 +158,7 @@ namespace mra {
 
   public:
     template<typename... Dims>
-    TensorView(value_type *ptr, Dims... dims)
+    TensorView(T *ptr, Dims... dims)
     : m_dims({dims...})
     , m_ptr(ptr)
     {
@@ -169,7 +170,7 @@ namespace mra {
       }
     }
 
-    TensorView(value_type *ptr, const dims_array_t& dims)
+    TensorView(T *ptr, const dims_array_t& dims)
     : m_dims(dims)
     , m_ptr(ptr)
     { }
@@ -197,7 +198,7 @@ namespace mra {
       return std::reduce(&m_dims[0], &m_dims[ndim()-1], 1, std::multiplies<size_type>{});
     }
 
-    size_type dim(size_type d) const {
+    size_type dim(Dimension d) const {
       return m_dims[d];
     }
 
@@ -273,7 +274,7 @@ namespace mra {
 
   private:
     dims_array_t m_dims;
-    value_type *m_ptr;
+    T *m_ptr; // may be const or non-const
   };