diff --git a/examples/madness/mra-device/functionnode.h b/examples/madness/mra-device/functionnode.h index b64e513ea..b2514f4bc 100644 --- a/examples/madness/mra-device/functionnode.h +++ b/examples/madness/mra-device/functionnode.h @@ -52,11 +52,11 @@ namespace mra { } template - void distancesq(const Coordinate& p, const TensorView& q, T* rsq, std::size_t N) { + void distancesq(const Coordinate& p, const TensorView& q, T* rsq, std::size_t N) { const T x = p(0); #ifdef __CUDA_ARCH__ - int tid = threadIdx.x*blockDim.x + threadIdx.y; - for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) { + int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x; + for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) { T xx = q(0,i) - x; rsq[i] = xx*xx; } @@ -73,8 +73,8 @@ namespace mra { const T x = p(0); const T y = p(1); #ifdef __CUDA_ARCH__ - int tid = threadIdx.x*blockDim.x + threadIdx.y; - for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) { + int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x; + for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) { T xx = q(0,i) - x; T yy = q(1,i) - y; rsq[i] = xx*xx + yy*yy; @@ -89,13 +89,13 @@ namespace mra { } template - void distancesq(const Coordinate& p, const TensorView& q, T* rsq, std::size_t N) { + void distancesq(const Coordinate& p, const TensorView& q, T* rsq, std::size_t N) { const T x = p(0); const T y = p(1); const T z = p(2); #ifdef __CUDA_ARCH__ - int tid = threadIdx.x*blockDim.x + threadIdx.y; - for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) { + int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x; + for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) { T xx = q(0,i) - x; T yy = q(1,i) - y; T zz = q(2,i) - z; diff --git a/examples/madness/mra-device/kernels.cu b/examples/madness/mra-device/kernels.cu index 3bf66278b..e5e265904 100644 --- a/examples/madness/mra-device/kernels.cu +++ b/examples/madness/mra-device/kernels.cu @@ -8,7 +8,7 @@ /// Make outer product of quadrature points for vectorized algorithms template -__device__ void make_xvec(const TensorView& x, TensorView& xvec, std::integral_constant<1>) { +__device__ void make_xvec(const mra::TensorView& x, mra::TensorView& xvec, std::integral_constant<1>) { /* uses threads in 3 dimensions */ xvec = x; /* TensorView assignment synchronizes */ @@ -16,7 +16,7 @@ __device__ void make_xvec(const TensorView& x, TensorView& xvec, std:: /// Make outer product of quadrature points for vectorized algorithms template -__device__ void make_xvec(const TensorView& x, TensorView& xvec, std::integral_constant<2>) { +__device__ void make_xvec(const mra::TensorView& x, mra::TensorView& xvec, std::integral_constant<2>) { const std::size_t K = x.dim(1); if (threadId.z == 0) { for (size_t i=blockIdx.y; i& x, TensorView& xvec, std:: /// Make outer product of quadrature points for vectorized algorithms template -__device__ void make_xvec(const TensorView& x, TensorView& xvec, std::integral_constant<3>) { +__device__ void make_xvec(const mra::TensorView& x, mra::TensorView& xvec, std::integral_constant<3>) { const std::size_t K = x.dim(1); for (size_t i=threadIdx.z; i& x, TensorView& xvec, std:: } -template +template __device__ void fcube(const functorT& f, - const Key& key, + const mra::Key& key, const T thresh, // output - TensorView& values, + mra::TensorView& values, std::size_t K, // temporaries - TensorView x, - TensorView xvec) { + mra::TensorView x, + mra::TensorView xvec) { if (is_negligible(f,Domain:: template bounding_box(key),truncate_tol(key,thresh))) { values = 0.0; /* TensorView assigment synchronizes */ @@ -149,7 +149,7 @@ std::array get_child_slice(mra::Key key, std::size_t K, return slices; } -template +template __global__ fcoeffs_kernel1( const Fn f, mra::Key key, @@ -180,7 +180,7 @@ __global__ fcoeffs_kernel1( } } -template +template __global__ fcoeffs_kernel2( mra::Key key, T* coeffs_ptr, @@ -217,7 +217,7 @@ __global__ fcoeffs_kernel2( } } -template +template void submit_fcoeffs_kernel( const Fn* fn, const mra::Key& key, diff --git a/examples/madness/mra-device/kernels.h b/examples/madness/mra-device/kernels.h index e26c27038..51f38e6ca 100644 --- a/examples/madness/mra-device/kernels.h +++ b/examples/madness/mra-device/kernels.h @@ -12,7 +12,7 @@ typedef int cudaStream; /* Returns the total size of temporary memory needed for * the project() kernel. */ -template +template std::size_t project_tmp_size(std::size_t K) { const size_t K2NDIM = std::pow(K,NDIM); return (NDIM*K2NDIM) // xvec in fcube @@ -20,7 +20,7 @@ std::size_t project_tmp_size(std::size_t K) { } /* Explicitly instantiated for 1, 2, 3 dimensional Gaussians */ -template +template void submit_fcoeffs_kernel( const Fn* fn, const mra::Key& key, diff --git a/examples/madness/mra-device/mrattg-device.cc b/examples/madness/mra-device/mrattg-device.cc index be5072eaa..477a4f8e8 100644 --- a/examples/madness/mra-device/mrattg-device.cc +++ b/examples/madness/mra-device/mrattg-device.cc @@ -8,6 +8,7 @@ #include "functionfunctor.h" #include "../../mrakey.h" +#if 0 /// Project the scaling coefficients using screening and test norm of difference coeffs. Return true if difference coeffs negligible. template static bool fcoeffs( @@ -60,6 +61,7 @@ static bool fcoeffs( } co_return status; } +#endif // 0 template auto make_project( @@ -93,7 +95,49 @@ auto make_project( } else { /* here we actually compute: first select a device */ - result.is_leaf = fcoeffs(f, functiondata, key, thresh, coeffs); + //result.is_leaf = fcoeffs(f, functiondata, key, thresh, coeffs); + /** + * BEGIN FCOEFFS HERE + * TODO: figure out a way to outline this into a function or coroutine + */ + + /* global function data */ + // TODO: need to make our own FunctionData with dynamic K + const auto& phibar = functiondata.get_phibar(); + const auto& hgT = functiondata.get_hgT(); + + const std::size_t K = coeffs.dim(0); + + /* temporaries */ + bool is_leaf; + auto is_leaf_scratch = ttg::make_scratch(&is_leaf, ttg::scope::Allocate); + const std::size_t tmp_size = project_tmp_size(K); + T* tmp = new T[tmp_size]; // TODO: move this into make_scratch() + auto tmp_scratch = ttg::make_scratch(tmp, ttg::scope::Allocate, tmp_size); + + /* TODO: cannot do this from a function, need to move it into the main task */ + co_await ttg::device::select(f, coeffs.buffer(), phibar.buffer(), + hgT.buffer(), tmp_scratch, is_leaf_scratch); + auto coeffs_view = coeffs.current_view(); + auto phibar_view = phibar.current_view(); + auto hgT_view = hgT.current_view(); + T* tmp_device = tmp_scratch.device_ptr(); + bool *is_leaf_device = is_leaf_scratch.device_ptr(); + FnT* f_ptr = f.current_device_ptr(); + + /* submit the kernel */ + submit_fcoeffs_kernel(f_ptr, key, coeffs_view, phibar_view, hgT_view, tmp_device, + is_leaf_device, ttg::device::current_stream()); + + /* wait and get is_leaf back */ + co_await ttg::device::wait(is_leaf_scratch); + result.is_leaf = is_leaf; + /* todo: is this safe? */ + delete[] tmp; + /** + * END FCOEFFS HERE + */ + if (!result.is_leaf) { std::vector> bcast_keys; for (auto child : children(key)) bcast_keys.push_back(child); diff --git a/examples/madness/mra-device/tensor.h b/examples/madness/mra-device/tensor.h index 3b4887a4c..77a7ceaef 100644 --- a/examples/madness/mra-device/tensor.h +++ b/examples/madness/mra-device/tensor.h @@ -17,7 +17,8 @@ namespace mra { using value_type = std::decay_t; using size_type = std::size_t; using allocator_type = Allocator; - using view_type = TensorView; + using view_type = TensorView; + using const_view_type = TensorView, NDIM>; using buffer_type = ttg::Buffer; static constexpr Dimension ndim() { return NDIM; } @@ -75,11 +76,15 @@ namespace mra { return std::reduce(&m_dims[0], &m_dims[ndim()], 1, std::multiplies{}); } - auto get_buffer() { + size_type dim(Dimension dim) const { + return m_dims[dim]; + } + + auto& buffer() { return m_buffer; } - const auto get_buffer() const { + const auto& buffer() const { return m_buffer; } @@ -91,10 +96,17 @@ namespace mra { return m_buffer.host_ptr(); } - /* returns a view for the current memory space */ + /* returns a view for the current memory space + * TODO: handle const correctness (const Tensor should return a const TensorView)*/ view_type current_view() { return view_type(m_buffer.current_device_ptr(), m_dims); } + + /* returns a view for the current memory space + * TODO: handle const correctness (const Tensor should return a const TensorView)*/ + const_view_type current_view() const { + return const_view_type(m_buffer.current_device_ptr(), m_dims); + } }; } // namespace mra diff --git a/examples/madness/mra-device/tensorview.h b/examples/madness/mra-device/tensorview.h index 41398e820..1c66d767d 100644 --- a/examples/madness/mra-device/tensorview.h +++ b/examples/madness/mra-device/tensorview.h @@ -66,8 +66,9 @@ namespace mra { class TensorView { public: using value_type = std::decay_t; + using const_value_type = std::add_const_t; using size_type = std::size_t; - static constexpr int ndim() { return NDIM; } + static constexpr Dimension ndim() { return NDIM; } using dims_array_t = std::array; static constexpr bool is_tensor() { return true; } @@ -157,7 +158,7 @@ namespace mra { public: template - TensorView(value_type *ptr, Dims... dims) + TensorView(T *ptr, Dims... dims) : m_dims({dims...}) , m_ptr(ptr) { @@ -169,7 +170,7 @@ namespace mra { } } - TensorView(value_type *ptr, const dims_array_t& dims) + TensorView(T *ptr, const dims_array_t& dims) : m_dims(dims) , m_ptr(ptr) { } @@ -197,7 +198,7 @@ namespace mra { return std::reduce(&m_dims[0], &m_dims[ndim()-1], 1, std::multiplies{}); } - size_type dim(size_type d) const { + size_type dim(Dimension d) const { return m_dims[d]; } @@ -273,7 +274,7 @@ namespace mra { private: dims_array_t m_dims; - value_type *m_ptr; + T *m_ptr; // may be const or non-const };