// Reviewer summary of this patch (annotation only; the diff text that follows is unchanged).
// NOTE(review): the text below looks whitespace-collapsed and template argument lists appear
// to have been stripped (a bare template keyword before declarations, const_cast with no
// target type), so it probably cannot be applied or compiled as-is - confirm against the
// original commit.
//
// ttg/ttg/device/task.h
//   - adds a bool is_scratch member right after is_const.
//   - extract_buffer_data() takes an additional std::index_sequence argument and now fills
//     four values per entry: buffer_data(...), scope(), a std::is_const_v value and a
//     ttg::meta::is_devicescratch_v value (previously three values).
//   - the variadic Input constructor and Input::add() pass the same two trait values.
//   - await_transform() passes std::make_index_sequence{} to extract_buffer_data().
//   NOTE(review): add() introduces an alias of std::decay_t just before the trait arguments.
//   std::decay drops top-level const, so if std::is_const_v is applied to that alias the
//   is_const flag is always false - verify which type the traits are evaluated on.
//
// ttg/ttg/parsec/devicefunc.h
//   - removes the TODO about checking whether the device is current, moves the is_const
//     read after the scope read, and reads span[i].is_scratch.
//   - when is_scratch is set, ORs TTG_PARSEC_FLOW_ACCESS_TMP into access before the flow
//     is built.
//
// ttg/ttg/parsec/parsec-ext.h
//   - defines TTG_PARSEC_FLOW_ACCESS_TMP as (1<<7).
//
// ttg/ttg/parsec/ttg.h
//   - the HIP and Level Zero stream setup blocks become #elif branches instead of separate
//     #if blocks, so at most one branch of that chain is compiled; the closing #endif
//     comment now names the Level Zero condition.
//     NOTE(review): presumably the chain is opened by the CUDA #if above the hunk; confirm
//     that enabling only one backend per build is intended.
//   - adds a discard_tmp_flows lambda: for i from 0 up to MAX_PARAM_COUNT it tests
//     gpu_task->flow[i]->flow_flags against TTG_PARSEC_FLOW_ACCESS_TMP and, on a match,
//     overwrites flow_flags with PARSEC_FLOW_ACCESS_READ through a const_cast.
//   - calls discard_tmp_flows() in the two branches whose comments say the task will not
//     come back into this function.
//   NOTE(review): flow[i] is dereferenced without a null check for every index - confirm
//   that all MAX_PARAM_COUNT entries are populated before this runs.
diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h index 5a5023bd9..0c79dec7f 100644 --- a/ttg/ttg/device/task.h +++ b/ttg/ttg/device/task.h @@ -22,6 +22,7 @@ namespace ttg::device { impl_data_t impl_data; ttg::scope scope; bool is_const; + bool is_scratch; }; template @@ -31,10 +32,11 @@ namespace ttg::device { /* extract buffer information from to_device_t */ template - auto extract_buffer_data(detail::to_device_t& a) { - return std::array{ + auto extract_buffer_data(detail::to_device_t& a, std::index_sequence) { + return std::array{ {TTG_IMPL_NS::buffer_data(std::get(a.ties)), - std::get(a.ties).scope(), std::get(a.ties)}...}; + std::get(a.ties).scope(), std::is_const_v>, + ttg::meta::is_devicescratch_v>}...}; } } // namespace detail @@ -46,12 +48,16 @@ namespace ttg::device { Input() { } template Input(Args&&... args) - : m_data{{TTG_IMPL_NS::buffer_data(args), args.scope(), std::is_const_v}...} + : m_data{{TTG_IMPL_NS::buffer_data(args), args.scope(), + std::is_const_v>, + ttg::meta::is_devicescratch_v}...} { } template void add(T&& v) { - m_data.emplace_back(TTG_IMPL_NS::buffer_data(v), v.scope(), std::is_const_v); + using type = std::decay_t; + m_data.emplace_back(TTG_IMPL_NS::buffer_data(v), v.scope(), std::is_const_v, + ttg::meta::is_devicescratch_v); } ttg::span span() { @@ -610,7 +616,7 @@ namespace ttg::device { template ttg::suspend_always await_transform(detail::to_device_t&& a) { - auto arr = detail::extract_buffer_data(a); + auto arr = detail::extract_buffer_data(a, std::make_index_sequence{}); bool need_transfer = !(TTG_IMPL_NS::register_device_memory(ttg::span(arr))); /* TODO: are we allowed to not suspend here and launch the kernel directly? 
*/ m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER; diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 08d27952c..f3f8efe47 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -127,9 +127,9 @@ namespace ttg_parsec { for (i = 0; i < span.size(); ++i) { /* get_parsec_data is overloaded for buffer and devicescratch */ parsec_data_t* data = span[i].impl_data; - /* TODO: check whether the device is current */ - bool is_const = span[i].is_const; ttg::scope scope = span[i].scope; + bool is_const = span[i].is_const; + bool is_scratch = span[i].is_scratch; if (nullptr != data) { auto access = PARSEC_FLOW_ACCESS_RW; @@ -139,6 +139,11 @@ namespace ttg_parsec { access = PARSEC_FLOW_ACCESS_READ; } + if (is_scratch) { + /* mark the flow as temporary so we can discard it easily */ + access |= TTG_PARSEC_FLOW_ACCESS_TMP; + } + /* build the flow */ /* TODO: reuse the flows of the task class? How can we control the sync direction then? */ flows[i] = parsec_flow_t{.name = nullptr, diff --git a/ttg/ttg/parsec/parsec-ext.h b/ttg/ttg/parsec/parsec-ext.h index a7e5e5222..b5293e035 100644 --- a/ttg/ttg/parsec/parsec-ext.h +++ b/ttg/ttg/parsec/parsec-ext.h @@ -4,4 +4,7 @@ /* HACK: we need this flag on a data copy to indicate whether it has been registered */ #define TTG_PARSEC_DATA_FLAG_REGISTERED ((parsec_data_flag_t)1<<2) +/* HACK: mark the flows of device scratch as temporary so that we can easily discard it */ +#define TTG_PARSEC_FLOW_ACCESS_TMP (1<<7) + #endif // TTG_PARSEC_EXT_H \ No newline at end of file diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index cd1d673da..2fa00f9df 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1419,30 +1419,35 @@ namespace ttg_parsec { int device = detail::parsec_device_to_ttg_device(gpu_device->super.device_index); ttg::device::detail::set_current(device, cuda_stream->cuda_stream); } -#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) - 
-#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) +#elif defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; int device = detail::parsec_device_to_ttg_device(gpu_device->super.device_index); ttg::device::detail::set_current(device, hip_stream->hip_stream); } -#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) - -#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO) +#elif defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO) { parsec_level_zero_exec_stream_t *stream; stream = (parsec_level_zero_exec_stream_t *)gpu_stream; int device = detail::parsec_device_to_ttg_device(gpu_device->super.device_index); ttg::device::detail::set_current(device, stream->swq->queue); } -#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) +#endif // defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO) /* Here we call back into the coroutine again after the transfers have completed */ static_op(&task->parsec_task); ttg::device::detail::reset_current(); + auto discard_tmp_flows = [&](){ + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + if (gpu_task->flow[i]->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) { + /* temporary flow, discard by setting it to read-only to avoid evictions */ + const_cast(gpu_task->flow[i])->flow_flags = PARSEC_FLOW_ACCESS_READ; + } + } + }; + /* we will come back into this function once the kernel and transfers are done */ int rc = PARSEC_HOOK_RETURN_DONE; if (nullptr != task->suspended_task_address) { @@ -1458,6 +1463,7 @@ namespace ttg_parsec { ttg::device::detail::TTG_DEVICE_CORO_COMPLETE == dev_data.state()) { /* the task started sending so we won't come back here */ //std::cout << "device_static_submit task " << task << " complete" << std::endl; + discard_tmp_flows(); } else { //std::cout << "device_static_submit task " << task << " return-again" 
<< std::endl; rc = PARSEC_HOOK_RETURN_AGAIN; @@ -1465,6 +1471,7 @@ namespace ttg_parsec { } else { /* the task is done so we won't come back here */ //std::cout << "device_static_submit task " << task << " complete" << std::endl; + discard_tmp_flows(); } return rc; }