From 55c7b143559e1b417973b5c190e5e9ead53a28ee Mon Sep 17 00:00:00 2001 From: Oleksandr Yermoshenko Date: Wed, 10 Mar 2021 15:18:50 +0200 Subject: [PATCH 01/21] Added concurrent mode to utilize power of all CPUs --- .gitmodules | 3 ++ CHANGELOG.md | 4 +++ Leanify.vcxproj | 6 ++-- Makefile | 2 +- README.md | 1 + build_gcc.bat | 2 +- main.cpp | 85 ++++++++++++++++++++++++++++++++++++++++++------- main.h | 2 ++ taskflow | 1 + 9 files changed, 89 insertions(+), 17 deletions(-) create mode 100644 .gitmodules create mode 160000 taskflow diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4b62775 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "taskflow"] + path = taskflow + url = https://github.com/taskflow/taskflow diff --git a/CHANGELOG.md b/CHANGELOG.md index 43e35ce..0221236 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 0.4.4 (2021-03-10) + ++ Added concurrent mode to utilize power of all CPUs + # 0.4.3 (2015-11-25) + Added support for Data URI #14. diff --git a/Leanify.vcxproj b/Leanify.vcxproj index 0519441..09892bd 100644 --- a/Leanify.vcxproj +++ b/Leanify.vcxproj @@ -147,7 +147,7 @@ WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) $(ProjectDir)lib;%(AdditionalIncludeDirectories) false - stdcpp14 + stdcpp17 @@ -259,7 +259,7 @@ None true false - stdcpp14 + stdcpp17 @@ -295,7 +295,7 @@ true $(ProjectDir)lib;%(AdditionalIncludeDirectories) Default - stdcpp14 + stdcpp17 None true false diff --git a/Makefile b/Makefile index 020f5c1..7ecaf55 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ ZOPFLIPNG_OBJ := lib/zopflipng/lodepng/lodepng.o lib/zopflipng/lodepng/lodepng CFLAGS += -Wall -Wextra -Wno-unused-parameter -Werror -O3 -msse2 -mfpmath=sse -fno-exceptions -flto CPPFLAGS += -I./lib -CXXFLAGS += $(CFLAGS) -std=c++14 -fno-rtti +CXXFLAGS += $(CFLAGS) -std=c++17 -fno-rtti LDFLAGS += -flto ifeq ($(OS), Windows_NT) diff --git a/README.md b/README.md index 90233b0..e0396c0 100644 --- a/README.md +++ b/README.md @@ -231,6 +231,7 @@ Usage: leanify [options] paths -f, --fastmode Fast mode, no recompression. -q, --quiet No output to stdout. -v, --verbose Verbose output. + -c, --concurrent Distribute all tasks to all CPUs. --keep-exif Do not remove Exif. ``` diff --git a/build_gcc.bat b/build_gcc.bat index e83f9ac..145cf92 100644 --- a/build_gcc.bat +++ b/build_gcc.bat @@ -1,7 +1,7 @@ @Echo off Pushd %~dp0 SetLocal EnableDelayedExpansion -Set Args=-std=c++14 -O3 -msse2 -mfpmath=sse -fno-exceptions -fno-rtti -flto -I./lib -s -o "Leanify" Leanify.res +Set Args=-std=c++17 -O3 -msse2 -mfpmath=sse -fno-exceptions -fno-rtti -flto -I./lib -s -o "Leanify" Leanify.res For /r "%~dp0" %%i In (*.cpp *.c *.cc) Do (Set t=%%i && Set Args=!Args!!t:%~dp0=!) For %%i In (fileio_linux.cpp lib\mozjpeg\jstdhuff.c) Do Set Args=!Args:%%i =! 
windres --output-format=coff Leanify.rc Leanify.res diff --git a/main.cpp b/main.cpp index d0baa9a..8d1ab39 100644 --- a/main.cpp +++ b/main.cpp @@ -16,20 +16,32 @@ #include "leanify.h" #include "version.h" +#include "taskflow/taskflow/taskflow.hpp" + using std::cerr; using std::cout; using std::endl; using std::string; -void PrintSize(size_t size) { +template +std::string ToString(const T a_value, const int n = 2) { + std::ostringstream out; + out.precision(n); + out << std::fixed << a_value; + return out.str(); +} + + +const std::string BuildSize(size_t size) { if (size < 1024) - cout << size << " B"; + return std::to_string(size) + " B"; else if (size < 1024 * 1024) - cout << size / 1024.0 << " KB"; + return ToString(size / 1024.0) + " KB"; else - cout << size / 1024.0 / 1024.0 << " MB"; + return ToString(size / 1024.0 / 1024.0) + " MB"; } + #ifdef _WIN32 int ProcessFile(const wchar_t* file_path) { char mbs[MAX_PATH] = { 0 }; @@ -43,7 +55,8 @@ int ProcessFile(const char* file_path, const struct stat* sb = nullptr, int type string filename(file_path); #endif // _WIN32 - cout << "Processing: " << filename << endl; + if (!concurrent_processing) + cout << "Processing: " << filename << endl; File input_file(file_path); if (input_file.IsOK()) { @@ -51,13 +64,21 @@ int ProcessFile(const char* file_path, const struct stat* sb = nullptr, int type size_t new_size = LeanifyFile(input_file.GetFilePionter(), original_size, 0, filename); - PrintSize(original_size); - cout << " -> "; - PrintSize(new_size); - cout << "\tLeanified: "; - PrintSize(original_size - new_size); + std::string log; + if (concurrent_processing) + log = "Processing: " + filename + "\n"; + + log += + BuildSize(original_size) + + " -> " + + BuildSize(new_size) + + "\tLeanified: " + + BuildSize(original_size - new_size) + + " (" + + ToString(100 - 100.0 * new_size / original_size) + + "%)"; - cout << " (" << 100 - 100.0 * new_size / original_size << "%)" << endl; + cout << log << endl; input_file.UnMapFile(new_size); } @@ -65,6 +86,8 @@ int ProcessFile(const char* file_path, const struct stat* sb = nullptr, int type return 0; } + + void PauseIfNotTerminal() { // pause if Leanify is not started in terminal // so that user can see the output instead of just a flash of a black box @@ -74,6 +97,7 @@ void PauseIfNotTerminal() { #endif // _WIN32 } + void PrintInfo() { cerr << "Leanify\t" << VERSION_STR << endl << endl; cerr << "Usage: leanify [options] paths\n" @@ -84,6 +108,7 @@ void PrintInfo() { " -f, --fastmode Fast mode, no recompression.\n" " -q, --quiet No output to stdout.\n" " -v, --verbose Verbose output.\n" + " -c, --concurrent Distribute all tasks to all CPUs.\n" " --keep-exif Do not remove Exif.\n" " --keep-icc Do not remove ICC profile.\n" "\n" @@ -97,6 +122,28 @@ void PrintInfo() { PauseIfNotTerminal(); } +tf::Taskflow taskflow; + +#ifdef _WIN32 +int EnqueueProcessFileTask(const wchar_t* file_path) { + std::wstring* filePath = new std::wstring(file_path); +#else +// written like this in order to be callback function of ftw() +int EnqueueProcessFileTask(const char* file_path, const struct stat* sb = nullptr, int typeflag = FTW_F) { + if (typeflag != FTW_F) + return 0; + std::string* filePath = new std::string(file_path); +#endif // _WIN32 + + auto task = [filePath]() { + ProcessFile(filePath->c_str()); + delete filePath; + }; + taskflow.emplace(task); + return 0; +} + + #ifdef _WIN32 int main() { int argc; @@ -157,6 +204,9 @@ int main(int argc, char** argv) { cout.clear(); is_verbose = true; break; + case 'c': + 
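+          // '-c' / '--concurrent': wrap every input path into a Taskflow task
+          // (see EnqueueProcessFileTask above) and let the tf::Executor created
+          // at the end of main() run them on all hardware threads,
+          // e.g. "leanify -c <paths>".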
concurrent_processing = true; + break; case '-': if (STRCMP(argv[i] + j + 1, "fastmode") == 0) { j += 7; @@ -173,6 +223,9 @@ int main(int argc, char** argv) { } else if (STRCMP(argv[i] + j + 1, "verbose") == 0) { j += 6; argv[i][j + 1] = 'v'; + } else if (STRCMP(argv[i] + j + 1, "concurrent") == 0) { + j += 9; + argv[i][j + 1] = 'c'; } else if (STRCMP(argv[i] + j + 1, "keep-exif") == 0) { j += 9; Jpeg::keep_exif_ = true; @@ -221,9 +274,17 @@ int main(int argc, char** argv) { // support multiple input file do { - TraversePath(argv[i], ProcessFile); + TraversePath(argv[i], EnqueueProcessFileTask); } while (++i < argc); + size_t concurrent_tasks = std::thread::hardware_concurrency(); + + if (!concurrent_processing) + concurrent_tasks = 1; + + tf::Executor executor(concurrent_tasks); + executor.run(taskflow).wait(); + PauseIfNotTerminal(); return 0; diff --git a/main.h b/main.h index b4e671f..de1429b 100644 --- a/main.h +++ b/main.h @@ -22,5 +22,7 @@ int iterations; // file inside zip that is inside another zip: depth 3 int depth; int max_depth; +bool concurrent_processing = false; + #endif // MAIN_H_ diff --git a/taskflow b/taskflow new file mode 160000 index 0000000..1ec4e09 --- /dev/null +++ b/taskflow @@ -0,0 +1 @@ +Subproject commit 1ec4e0914af54ffd9796a36b18683b3879d0bce7 From 75baa8a224399ebc3baa9317880cf1fe2a94b8eb Mon Sep 17 00:00:00 2001 From: Oleksandr Yermoshenko Date: Wed, 10 Mar 2021 16:18:13 +0200 Subject: [PATCH 02/21] Removed taskflow submodule. Taskflow added from sources. --- .gitmodules | 3 - main.cpp | 2 +- taskflow | 1 - taskflow/core/algorithm/critical.hpp | 78 + taskflow/core/algorithm/for_each.hpp | 772 ++++++++++ taskflow/core/algorithm/reduce.hpp | 866 +++++++++++ taskflow/core/algorithm/sort.hpp | 482 ++++++ taskflow/core/declarations.hpp | 44 + taskflow/core/environment.hpp | 8 + taskflow/core/error.hpp | 26 + taskflow/core/executor.hpp | 1249 +++++++++++++++ taskflow/core/flow_builder.hpp | 1001 ++++++++++++ taskflow/core/graph.hpp | 572 +++++++ taskflow/core/notifier.hpp | 267 ++++ taskflow/core/observer.hpp | 735 +++++++++ taskflow/core/semaphore.hpp | 125 ++ taskflow/core/task.hpp | 664 ++++++++ taskflow/core/taskflow.hpp | 478 ++++++ taskflow/core/topology.hpp | 61 + taskflow/core/tsq.hpp | 247 +++ taskflow/core/worker.hpp | 103 ++ taskflow/cublasflow.hpp | 24 + taskflow/cuda/cublas/cublas_error.hpp | 59 + taskflow/cuda/cublas/cublas_flow.hpp | 1361 +++++++++++++++++ taskflow/cuda/cublas/cublas_handle.hpp | 156 ++ taskflow/cuda/cublas/cublas_helper.hpp | 73 + taskflow/cuda/cublas/cublas_level1.hpp | 200 +++ taskflow/cuda/cublas/cublas_level2.hpp | 286 ++++ taskflow/cuda/cublas/cublas_level3.hpp | 489 ++++++ taskflow/cuda/cuda_algorithm/cuda_blaf.hpp | 148 ++ .../cuda/cuda_algorithm/cuda_for_each.hpp | 50 + taskflow/cuda/cuda_algorithm/cuda_matmul.hpp | 57 + taskflow/cuda/cuda_algorithm/cuda_reduce.hpp | 114 ++ .../cuda/cuda_algorithm/cuda_transform.hpp | 27 + .../cuda/cuda_algorithm/cuda_transpose.hpp | 41 + taskflow/cuda/cuda_capturer.hpp | 844 ++++++++++ taskflow/cuda/cuda_device.hpp | 342 +++++ taskflow/cuda/cuda_error.hpp | 26 + taskflow/cuda/cuda_flow.hpp | 1219 +++++++++++++++ taskflow/cuda/cuda_graph.hpp | 725 +++++++++ taskflow/cuda/cuda_memory.hpp | 376 +++++ taskflow/cuda/cuda_optimizer.hpp | 638 ++++++++ taskflow/cuda/cuda_pool.hpp | 182 +++ taskflow/cuda/cuda_stream.hpp | 286 ++++ taskflow/cuda/cuda_task.hpp | 227 +++ taskflow/cudaflow.hpp | 14 + taskflow/dsl/connection.hpp | 53 + taskflow/dsl/dsl.hpp | 13 + taskflow/dsl/meta_macro.hpp | 72 + 
taskflow/dsl/task_analyzer.hpp | 40 + taskflow/dsl/task_dsl.hpp | 104 ++ taskflow/dsl/task_trait.hpp | 46 + taskflow/dsl/tuple_utils.hpp | 43 + taskflow/dsl/type_list.hpp | 136 ++ taskflow/taskflow.hpp | 60 + taskflow/tensorframe/tensor.hpp | 268 ++++ taskflow/tensorframe/tensor_expr.hpp | 163 ++ taskflow/tensorframe/tensor_graph.hpp | 115 ++ taskflow/tensorframe/tensor_ops.hpp | 28 + taskflow/tensorframe/tensorframe.hpp | 12 + taskflow/utility/iterator.hpp | 22 + taskflow/utility/math.hpp | 127 ++ taskflow/utility/object_pool.hpp | 775 ++++++++++ taskflow/utility/os.hpp | 146 ++ taskflow/utility/passive_vector.hpp | 212 +++ taskflow/utility/serializer.hpp | 1108 ++++++++++++++ taskflow/utility/singleton.hpp | 33 + taskflow/utility/stream.hpp | 31 + taskflow/utility/traits.hpp | 340 ++++ taskflow/utility/uuid.hpp | 237 +++ 70 files changed, 19927 insertions(+), 5 deletions(-) delete mode 100644 .gitmodules delete mode 160000 taskflow create mode 100644 taskflow/core/algorithm/critical.hpp create mode 100644 taskflow/core/algorithm/for_each.hpp create mode 100644 taskflow/core/algorithm/reduce.hpp create mode 100644 taskflow/core/algorithm/sort.hpp create mode 100644 taskflow/core/declarations.hpp create mode 100644 taskflow/core/environment.hpp create mode 100644 taskflow/core/error.hpp create mode 100644 taskflow/core/executor.hpp create mode 100644 taskflow/core/flow_builder.hpp create mode 100644 taskflow/core/graph.hpp create mode 100644 taskflow/core/notifier.hpp create mode 100644 taskflow/core/observer.hpp create mode 100644 taskflow/core/semaphore.hpp create mode 100644 taskflow/core/task.hpp create mode 100644 taskflow/core/taskflow.hpp create mode 100644 taskflow/core/topology.hpp create mode 100644 taskflow/core/tsq.hpp create mode 100644 taskflow/core/worker.hpp create mode 100644 taskflow/cublasflow.hpp create mode 100644 taskflow/cuda/cublas/cublas_error.hpp create mode 100644 taskflow/cuda/cublas/cublas_flow.hpp create mode 100644 taskflow/cuda/cublas/cublas_handle.hpp create mode 100644 taskflow/cuda/cublas/cublas_helper.hpp create mode 100644 taskflow/cuda/cublas/cublas_level1.hpp create mode 100644 taskflow/cuda/cublas/cublas_level2.hpp create mode 100644 taskflow/cuda/cublas/cublas_level3.hpp create mode 100644 taskflow/cuda/cuda_algorithm/cuda_blaf.hpp create mode 100644 taskflow/cuda/cuda_algorithm/cuda_for_each.hpp create mode 100644 taskflow/cuda/cuda_algorithm/cuda_matmul.hpp create mode 100644 taskflow/cuda/cuda_algorithm/cuda_reduce.hpp create mode 100644 taskflow/cuda/cuda_algorithm/cuda_transform.hpp create mode 100644 taskflow/cuda/cuda_algorithm/cuda_transpose.hpp create mode 100644 taskflow/cuda/cuda_capturer.hpp create mode 100644 taskflow/cuda/cuda_device.hpp create mode 100644 taskflow/cuda/cuda_error.hpp create mode 100644 taskflow/cuda/cuda_flow.hpp create mode 100644 taskflow/cuda/cuda_graph.hpp create mode 100644 taskflow/cuda/cuda_memory.hpp create mode 100644 taskflow/cuda/cuda_optimizer.hpp create mode 100644 taskflow/cuda/cuda_pool.hpp create mode 100644 taskflow/cuda/cuda_stream.hpp create mode 100644 taskflow/cuda/cuda_task.hpp create mode 100644 taskflow/cudaflow.hpp create mode 100644 taskflow/dsl/connection.hpp create mode 100644 taskflow/dsl/dsl.hpp create mode 100644 taskflow/dsl/meta_macro.hpp create mode 100644 taskflow/dsl/task_analyzer.hpp create mode 100644 taskflow/dsl/task_dsl.hpp create mode 100644 taskflow/dsl/task_trait.hpp create mode 100644 taskflow/dsl/tuple_utils.hpp create mode 100644 taskflow/dsl/type_list.hpp create mode 
100644 taskflow/taskflow.hpp create mode 100644 taskflow/tensorframe/tensor.hpp create mode 100644 taskflow/tensorframe/tensor_expr.hpp create mode 100644 taskflow/tensorframe/tensor_graph.hpp create mode 100644 taskflow/tensorframe/tensor_ops.hpp create mode 100644 taskflow/tensorframe/tensorframe.hpp create mode 100644 taskflow/utility/iterator.hpp create mode 100644 taskflow/utility/math.hpp create mode 100644 taskflow/utility/object_pool.hpp create mode 100644 taskflow/utility/os.hpp create mode 100644 taskflow/utility/passive_vector.hpp create mode 100644 taskflow/utility/serializer.hpp create mode 100644 taskflow/utility/singleton.hpp create mode 100644 taskflow/utility/stream.hpp create mode 100644 taskflow/utility/traits.hpp create mode 100644 taskflow/utility/uuid.hpp diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 4b62775..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "taskflow"] - path = taskflow - url = https://github.com/taskflow/taskflow diff --git a/main.cpp b/main.cpp index 8d1ab39..1d1b730 100644 --- a/main.cpp +++ b/main.cpp @@ -16,7 +16,7 @@ #include "leanify.h" #include "version.h" -#include "taskflow/taskflow/taskflow.hpp" +#include "taskflow/taskflow.hpp" using std::cerr; using std::cout; diff --git a/taskflow b/taskflow deleted file mode 160000 index 1ec4e09..0000000 --- a/taskflow +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1ec4e0914af54ffd9796a36b18683b3879d0bce7 diff --git a/taskflow/core/algorithm/critical.hpp b/taskflow/core/algorithm/critical.hpp new file mode 100644 index 0000000..46e82fa --- /dev/null +++ b/taskflow/core/algorithm/critical.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "../task.hpp" + +/** +@file critical.hpp +@brief critical include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// CriticalSection +// ---------------------------------------------------------------------------- + +/** +@class CriticalSection + +@brief class to create a critical region of limited workers to run tasks + +tf::CriticalSection is a warpper over tf::Semaphore and is specialized for +limiting the maximum concurrency over a set of tasks. +A critical section starts with an initial count representing that limit. +When a task is added to the critical section, +the task acquires and releases the semaphore internal to the critical section. +This design avoids explicit call of tf::Task::acquire and tf::Task::release. +The following example creates a critical section of one worker and adds +the five tasks to the critical section. 
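+Because the critical section below is constructed with an initial count of 1,
+at most one of the five tasks can run at any given time, even though the
+executor owns eight workers.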
+ +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; + +// create a critical section of 1 worker +tf::CriticalSection critical_section(1); + +tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); +tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); +tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); +tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; }); +tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; }); + +critical_section.add(A, B, C, D, E); + +executor.run(taskflow).wait(); +@endcode + +*/ +class CriticalSection : public Semaphore { + + public: + + /** + @brief constructs a critical region of a limited number of workers + */ + explicit CriticalSection(int max_workers = 1); + + /** + @brief adds a task into the critical region + */ + template + void add(Tasks...tasks); +}; + +inline CriticalSection::CriticalSection(int max_workers) : + Semaphore {max_workers} { +} + +template +void CriticalSection::add(Tasks... tasks) { + (tasks.acquire(*this), ...); + (tasks.release(*this), ...); +} + + +} // end of namespace tf. --------------------------------------------------- + + diff --git a/taskflow/core/algorithm/for_each.hpp b/taskflow/core/algorithm/for_each.hpp new file mode 100644 index 0000000..e04ed90 --- /dev/null +++ b/taskflow/core/algorithm/for_each.hpp @@ -0,0 +1,772 @@ +// reference: +// - gomp: https://github.com/gcc-mirror/gcc/blob/master/libgomp/iter.c +// - komp: https://github.com/llvm-mirror/openmp/blob/master/runtime/src/kmp_dispatch.cpp + + +#pragma once + +#include "../executor.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// default parallel for +// ---------------------------------------------------------------------------- + +// Function: for_each +template +Task FlowBuilder::for_each(B&& beg, E&& end, C&& c) { + return for_each_guided( + std::forward(beg), std::forward(end), std::forward(c), 1 + ); +} + +// Function: for_each_index +template +Task FlowBuilder::for_each_index(B&& beg, E&& end, S&& inc, C&& c){ + return for_each_index_guided( + std::forward(beg), + std::forward(end), + std::forward(inc), + std::forward(c), + 1 + ); +} + +// ---------------------------------------------------------------------------- +// parallel for using the guided partition algorithm +// - Polychronopoulos, C. D. and Kuck, D. J. +// "Guided Self-Scheduling: A Practical Scheduling Scheme +// for Parallel Supercomputers," +// IEEE Transactions on Computers, C-36(12):1425–1439 (1987). +// ---------------------------------------------------------------------------- + +// Function: for_each_guided +template +Task FlowBuilder::for_each_guided(B&& beg, E&& end, C&& c, H&& chunk_size){ + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + c=std::forward(c), + h=std::forward(chunk_size)] (Subflow& sf) mutable { + + // fetch the stateful values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t chunk_size = (h == 0) ? 
1 : h; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= chunk_size) { + std::for_each(beg, end, c); + return; + } + + if(N < W) { + W = N; + } + + std::atomic next(0); + + for(size_t w=0; w(W); + size_t s0 = next.load(std::memory_order_relaxed); + + while(s0 < N) { + + size_t r = N - s0; + + // fine-grained + if(r < p1) { + while(1) { + s0 = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(s0 >= N) { + return; + } + size_t e0 = (chunk_size <= (N - s0)) ? s0 + chunk_size : N; + std::advance(beg, s0-z); + for(size_t x=s0; x(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + size_t e0 = (q <= r) ? s0 + q : N; + if(next.compare_exchange_strong(s0, e0, std::memory_order_acquire, + std::memory_order_relaxed)) { + std::advance(beg, s0-z); + for(size_t x = s0; x< e0; x++) { + c(*beg++); + } + z = e0; + s0 = next.load(std::memory_order_relaxed); + } + } + } + //}).name("pfg_"s + std::to_string(w)); + }); + } + + sf.join(); + }); + + return task; +} + +// Function: for_each_index_guided +template +Task FlowBuilder::for_each_index_guided( + B&& beg, E&& end, S&& inc, C&& c, H&& chunk_size +){ + + using I = stateful_index_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + a=std::forward(inc), + c=std::forward(c), + h=std::forward(chunk_size)] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + I inc = a; + + if(is_range_invalid(beg, end, inc)) { + TF_THROW("invalid range [", beg, ", ", end, ") with step size ", inc); + } + + size_t chunk_size = (h == 0) ? 1 : h; + size_t W = sf._executor.num_workers(); + size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= chunk_size) { + for(size_t x=0; x next(0); + + for(size_t w=0; w(W); + size_t s0 = next.load(std::memory_order_relaxed); + + while(s0 < N) { + + size_t r = N - s0; + + // find-grained + if(r < p1) { + while(1) { + s0 = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(s0 >= N) { + return; + } + size_t e0 = (chunk_size <= (N - s0)) ? s0 + chunk_size : N; + auto s = static_cast(s0) * inc + beg; + for(size_t x=s0; x(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + size_t e0 = (q <= r) ? s0 + q : N; + if(next.compare_exchange_strong(s0, e0, std::memory_order_acquire, + std::memory_order_relaxed)) { + auto s = static_cast(s0) * inc + beg; + for(size_t x=s0; x +Task FlowBuilder::for_each_factoring(B&& beg, E&& end, C&& c){ + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + c=std::forward(c)] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 1) { + std::for_each(beg, end, c); + return; + } + + if(N < W) { + W = N; + } + + std::atomic batch(0); + std::atomic next(0); + + for(size_t w=0; w((N >> b0) / (double)W); + + if(ck == 0) { + ck = 1; + } + + size_t s0 = next.fetch_add(ck, std::memory_order_relaxed); + if(s0 >= N) { + return; + } + size_t e0 = (ck <= (N - s0)) ? 
s0 + ck : N; + std::advance(beg, s0-z); + for(size_t x=s0; x +Task FlowBuilder::for_each_factoring( + B&& beg, E&& end, S&& inc, C&& c +){ + + using I = stateful_index_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + i=std::forward(inc), + c=std::forward(c)] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + I inc = i; + + if(is_range_invalid(beg, end, inc)) { + TF_THROW("invalid range [", beg, ", ", end, ") with step size ", inc); + } + + size_t W = sf._executor.num_workers(); + size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 1) { + for(size_t x=0; x batch(0); + std::atomic next(0); + + for(size_t w=0; w((N >> b0) / (double)(W)); + + if(ck == 0) { + ck = 1; + } + + size_t s0 = next.fetch_add(ck, std::memory_order_relaxed); + if(s0 >= N) { + return; + } + size_t e0 = (ck <= (N - s0)) ? s0 + ck : N; + auto s = static_cast(s0) * inc + beg; + for(size_t x=s0; x +Task FlowBuilder::for_each_dynamic( + B&& beg, E&& end, C&& c, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + c=std::forward(c), + h=std::forward(chunk_size)] (Subflow& sf) mutable { + + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t chunk_size = (h == 0) ? 1 : h; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= chunk_size) { + std::for_each(beg, end, c); + return; + } + + if(N < W) { + W = N; + } + + std::atomic next(0); + + for(size_t w=0; w= N) { + break; + } + + size_t e0 = (chunk_size <= (N - s0)) ? s0 + chunk_size : N; + std::advance(beg, s0-z); + for(size_t x=s0; x +Task FlowBuilder::for_each_index_dynamic( + B&& beg, E&& end, S&& inc, C&& c, H&& chunk_size +){ + + using I = stateful_index_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + a=std::forward(inc), + c=std::forward(c), + h=std::forward(chunk_size)] (Subflow& sf) mutable { + + I beg = b; + I end = e; + I inc = a; + + if(is_range_invalid(beg, end, inc)) { + TF_THROW("invalid range [", beg, ", ", end, ") with step size ", inc); + } + + size_t chunk_size = (h == 0) ? 1 : h; + size_t W = sf._executor.num_workers(); + size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= chunk_size) { + for(size_t x=0; x next(0); + + for(size_t w=0; w= N) { + break; + } + + size_t e0 = (chunk_size <= (N - s0)) ? 
s0 + chunk_size : N; + I s = static_cast(s0) * inc + beg; + for(size_t x=s0; x +Task FlowBuilder::for_each_static( + B&& beg, E&& end, C&& c, H&& chunk_size +){ + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + c=std::forward(c), + h=std::forward(chunk_size)] (Subflow& sf) mutable { + + // fetch the iterator + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t chunk_size = h; + const size_t W = sf._executor.num_workers(); + const size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= chunk_size) { + std::for_each(beg, end, c); + return; + } + + std::atomic next(0); + + // even partition + if(chunk_size == 0){ + + // zero-based start and end points + const size_t q0 = N / W; + const size_t t0 = N % W; + + for(size_t i=0; i= N) { + break; + } + + //sf.emplace([&next, beg, end, chunk_size, N, W, &c] () mutable { + sf.silent_async([&next, beg, end, chunk_size, N, W, &c] () mutable { + + size_t trip = W*chunk_size; + size_t s0 = next.fetch_add(chunk_size, std::memory_order_relaxed); + + std::advance(beg, s0); + + while(1) { + + size_t items; + + I e = beg; + + for(items=0; items= N) { + break; + } + + std::advance(beg, trip); + } + //}).name("pfs_"s + std::to_string(i)); + }); + } + } + + sf.join(); + }); + + return task; +} + +// Function: for_each_index_static +// static scheduling with chunk size +template +Task FlowBuilder::for_each_index_static( + B&& beg, E&& end, S&& inc, C&& c, H&& chunk_size +){ + + using I = stateful_index_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + a=std::forward(inc), + c=std::forward(c), + h=std::forward(chunk_size)] (Subflow& sf) mutable { + + // fetch the indices + I beg = b; + I end = e; + I inc = a; + + if(is_range_invalid(beg, end, inc)) { + TF_THROW("invalid range [", beg, ", ", end, ") with step size ", inc); + } + + size_t chunk_size = h; + const size_t W = sf._executor.num_workers(); + const size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= chunk_size) { + for(size_t x=0; x next(0); + + if(chunk_size == 0) { + // zero-based start and end points + const size_t q0 = N / W; + const size_t t0 = N % W; + for(size_t i=0; i(s0) * inc + beg; + + for(size_t x=0; x= N) { + break; + } + + //sf.emplace([&next, beg, inc, chunk_size, N, W, &c] () mutable { + sf.silent_async([&next, beg, inc, chunk_size, N, W, &c] () mutable { + + size_t trip = W * chunk_size; + size_t s0 = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(1) { + + size_t e0 = s0 + chunk_size; + + if(e0 > N) { + e0 = N; + } + + I s = static_cast(s0) * inc + beg; + + for(size_t x=s0; x= N) { + break; + } + } + //}).name("pfs_"s + std::to_string(i)); + }); + } + } + + sf.join(); + + }); + + return task; +} + + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/taskflow/core/algorithm/reduce.hpp b/taskflow/core/algorithm/reduce.hpp new file mode 100644 index 0000000..0c8fa34 --- /dev/null +++ b/taskflow/core/algorithm/reduce.hpp @@ -0,0 +1,866 @@ +#pragma once + +#include "../executor.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// default reduction +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::reduce( + B&& beg, + E&& 
end, + T& init, + O&& bop +) { + return reduce_guided( + std::forward(beg), + std::forward(end), + init, + std::forward(bop), + 1 + ); +} + +// ---------------------------------------------------------------------------- +// guided partition +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::reduce_guided( + B&& beg, E&& end, T& init, O&& bop, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + &r=init, + o=std::forward(bop), + c=std::forward(chunk_size) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t C = (c == 0) ? 1 : c; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= C) { + for(; beg!=end; r = o(r, *beg++)); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + std::atomic next(0); + + for(size_t w=0; w= N) { + break; + } + + //sf.emplace([&mutex, &next, &r, beg, N, W, &o, C] () mutable { + sf.silent_async([&mutex, &next, &r, beg, N, W, &o, C] () mutable { + + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + r = o(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = o(*beg1, *beg2); + + size_t z = s0 + 2; + size_t p1 = 2 * W * (C + 1); + double p2 = 0.5 / static_cast(W); + s0 = next.load(std::memory_order_relaxed); + + while(s0 < N) { + + size_t r = N - s0; + + // fine-grained + if(r < p1) { + while(1) { + s0 = next.fetch_add(C, std::memory_order_relaxed); + if(s0 >= N) { + break; + } + size_t e0 = (C <= (N - s0)) ? s0 + C : N; + std::advance(beg, s0-z); + for(size_t x=s0; x(p2 * r); + if(q < C) { + q = C; + } + size_t e0 = (q <= r) ? s0 + q : N; + if(next.compare_exchange_strong(s0, e0, std::memory_order_acquire, + std::memory_order_relaxed)) { + std::advance(beg, s0-z); + for(size_t x = s0; x lock(mutex); + r = o(r, sum); + //}).name("prg_"s + std::to_string(w)); + }); + } + + sf.join(); + }); + + return task; +} + +// ---------------------------------------------------------------------------- +// reduce_dynamic +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::reduce_dynamic( + B&& beg, E&& end, T& init, O&& bop, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + &r=init, + o=std::forward(bop), + c=std::forward(chunk_size) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t C = (c == 0) ? 
1 : c; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= C) { + for(; beg!=end; r = o(r, *beg++)); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + std::atomic next(0); + + for(size_t w=0; w= N) { + break; + } + + //sf.emplace([&mutex, &next, &r, beg, N, &o, C] () mutable { + sf.silent_async([&mutex, &next, &r, beg, N, &o, C] () mutable { + + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + r = o(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = o(*beg1, *beg2); + + size_t z = s0 + 2; + + while(1) { + s0 = next.fetch_add(C, std::memory_order_relaxed); + if(s0 >= N) { + break; + } + size_t e0 = (C <= (N - s0)) ? s0 + C : N; + std::advance(beg, s0-z); + for(size_t x=s0; x lock(mutex); + r = o(r, sum); + //}).name("prd_"s + std::to_string(w)); + }); + } + + sf.join(); + }); + + return task; +} + +// ---------------------------------------------------------------------------- +// reduce_static +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::reduce_static( + B&& beg, E&& end, T& init, O&& bop, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + &r=init, + o=std::forward(bop), + c=std::forward(chunk_size) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t C = c; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= C) { + for(; beg!=end; r = o(r, *beg++)); + return; + } + + std::mutex mutex; + std::atomic next(0); + + // even partition + if(C == 0) { + + const size_t q0 = N / W; + const size_t t0 = N % W; + + for(size_t i=0; i lock(mutex); + r = o(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = o(*beg1, *beg2); + + for(size_t i=2; i lock(mutex); + r = o(r, sum); + + //}).name("prs_"s + std::to_string(i)); + }); + } + } + // chunk-by-chunk partition + else { + for(size_t w=0; w= N) { + break; + } + + //sf.emplace([&mutex, &next, &r, beg, end, C, N, W, &o] () mutable { + sf.silent_async([&mutex, &next, &r, beg, end, C, N, W, &o] () mutable { + + size_t trip = W*C; + size_t s0 = next.fetch_add(C, std::memory_order_relaxed); + + std::advance(beg, s0); + + T sum; + + if(C == 1) { + if(s0 + trip >= N) { // last trip + std::lock_guard lock(mutex); + r = o(r, *beg); + return; + } + else { // one more trip + auto beg1 = beg; + auto beg2 = std::next(beg, trip); + sum = o(*beg1, *beg2); + s0 += trip*2; + if(s0 >= N) { + goto end_reduce; + } + beg = std::next(beg2, trip); + } + } + else { + if(N - s0 == 1) { + std::lock_guard lock(mutex); + r = o(r, *beg); + return; + } + auto beg1 = beg++; + auto beg2 = beg++; + sum = o(*beg1, *beg2); + I e = beg; + size_t i; + for(i=2; i= N) { + goto end_reduce; + } + std::advance(beg, trip-2); + } + + while(1) { + + size_t i; + + I e = beg; + + for(i=0; i= N) { + break; + } + + std::advance(beg, trip); + } + + end_reduce: + + std::lock_guard lock(mutex); + r = o(r, sum); + + //}).name("prs_"s + std::to_string(w)); + }); + } + } + + sf.join(); + }); + + return task; +} + +// 
---------------------------------------------------------------------------- +// default transform and reduction +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::transform_reduce( + B&& beg, + E&& end, + T& init, + BOP&& bop, + UOP&& uop +) { + return transform_reduce_guided( + std::forward(beg), + std::forward(end), + init, + std::forward(bop), + std::forward(uop), + 1 + ); +} + +// ---------------------------------------------------------------------------- +// guided partition +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::transform_reduce_guided( + B&& beg, E&& end, T& init, BOP&& bop, UOP&& uop, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + &r=init, + bop=std::forward(bop), + uop=std::forward(uop), + c=std::forward(chunk_size) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t C = (c == 0) ? 1 : c; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= C) { + for(; beg!=end; r = bop(r, uop(*beg++))); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + std::atomic next(0); + + for(size_t w=0; w= N) { + break; + } + + //sf.emplace([&mutex, &next, &r, beg, N, W, &bop, &uop, C] () mutable { + sf.silent_async([&mutex, &next, &r, beg, N, W, &bop, &uop, C] () mutable { + + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + r = bop(r, uop(*beg)); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(uop(*beg1), uop(*beg2)); + + size_t z = s0 + 2; + size_t p1 = 2 * W * (C + 1); + double p2 = 0.5 / static_cast(W); + s0 = next.load(std::memory_order_relaxed); + + while(s0 < N) { + + size_t r = N - s0; + + // fine-grained + if(r < p1) { + while(1) { + s0 = next.fetch_add(C, std::memory_order_relaxed); + if(s0 >= N) { + break; + } + size_t e0 = (C <= (N - s0)) ? s0 + C : N; + std::advance(beg, s0-z); + for(size_t x=s0; x(p2 * r); + if(q < C) { + q = C; + } + size_t e0 = (q <= r) ? s0 + q : N; + if(next.compare_exchange_strong(s0, e0, std::memory_order_acquire, + std::memory_order_relaxed)) { + std::advance(beg, s0-z); + for(size_t x = s0; x lock(mutex); + r = bop(r, sum); + + //}).name("prg_"s + std::to_string(w)); + }); + } + + sf.join(); + }); + + return task; +} + +// ---------------------------------------------------------------------------- +// transform_reduce_dynamic +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::transform_reduce_dynamic( + B&& beg, E&& end, T& init, BOP&& bop, UOP&& uop, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + &r=init, + bop=std::forward(bop), + uop=std::forward(uop), + c=std::forward(chunk_size) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t C = (c == 0) ? 
1 : c; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= C) { + for(; beg!=end; r = bop(r, uop(*beg++))); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + std::atomic next(0); + + for(size_t w=0; w= N) { + break; + } + + //sf.emplace([&mutex, &next, &r, beg, N, &bop, &uop, C] () mutable { + sf.silent_async([&mutex, &next, &r, beg, N, &bop, &uop, C] () mutable { + + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + r = bop(r, uop(*beg)); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(uop(*beg1), uop(*beg2)); + + size_t z = s0 + 2; + + while(1) { + s0 = next.fetch_add(C, std::memory_order_relaxed); + if(s0 >= N) { + break; + } + size_t e0 = (C <= (N - s0)) ? s0 + C : N; + std::advance(beg, s0-z); + for(size_t x=s0; x lock(mutex); + r = bop(r, sum); + + //}).name("prd_"s + std::to_string(w)); + }); + } + + sf.join(); + }); + + return task; +} + +// ---------------------------------------------------------------------------- +// transform_reduce_static +// ---------------------------------------------------------------------------- + +template +Task FlowBuilder::transform_reduce_static( + B&& beg, E&& end, T& init, BOP&& bop, UOP&& uop, H&& chunk_size +) { + + using I = stateful_iterator_t; + using namespace std::string_literals; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + &r=init, + bop=std::forward(bop), + uop=std::forward(uop), + c=std::forward(chunk_size) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = b; + I end = e; + + if(beg == end) { + return; + } + + size_t C = c; + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= C) { + for(; beg!=end; r = bop(r, uop(*beg++))); + return; + } + + std::mutex mutex; + std::atomic next(0); + + // even partition + if(C == 0) { + + const size_t q0 = N / W; + const size_t t0 = N % W; + + for(size_t i=0; i lock(mutex); + r = bop(r, uop(*beg)); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(uop(*beg1), uop(*beg2)); + + for(size_t i=2; i lock(mutex); + r = bop(r, sum); + + //}).name("prs_"s + std::to_string(i)); + }); + } + } + // chunk-by-chunk partition + else { + for(size_t w=0; w= N) { + break; + } + + //sf.emplace([&mutex, &next, &r, beg, end, C, N, W, &bop, &uop] () mutable { + sf.silent_async([&mutex, &next, &r, beg, end, C, N, W, &bop, &uop] () mutable { + + size_t trip = W*C; + size_t s0 = next.fetch_add(C, std::memory_order_relaxed); + + std::advance(beg, s0); + + T sum; + + if(C == 1) { + if(s0 + trip >= N) { // last trip + std::lock_guard lock(mutex); + r = bop(r, uop(*beg)); + return; + } + else { // one more trip + auto beg1 = beg; + auto beg2 = std::next(beg, trip); + sum = bop(uop(*beg1), uop(*beg2)); + s0 += trip*2; + if(s0 >= N) { + goto end_transform_reduce; + } + beg = std::next(beg2, trip); + } + } + else { + if(N - s0 == 1) { + std::lock_guard lock(mutex); + r = bop(r, uop(*beg)); + return; + } + auto beg1 = beg++; + auto beg2 = beg++; + sum = bop(uop(*beg1), uop(*beg2)); + I e = beg; + size_t i; + for(i=2; i= N) { + goto end_transform_reduce; + } + std::advance(beg, trip-2); + } + + while(1) { + + size_t i; + + I e = beg; + + for(i=0; i= N) { + break; + } + + std::advance(beg, trip); + } + + 
end_transform_reduce: + + std::lock_guard lock(mutex); + r = bop(r, sum); + + //}).name("prs_"s + std::to_string(w)); + }); + } + } + + sf.join(); + }); + + return task; +} + +} // end of namespace tf ----------------------------------------------------- + + + + diff --git a/taskflow/core/algorithm/sort.hpp b/taskflow/core/algorithm/sort.hpp new file mode 100644 index 0000000..5f5a0db --- /dev/null +++ b/taskflow/core/algorithm/sort.hpp @@ -0,0 +1,482 @@ +#pragma once + +#include "../executor.hpp" + +namespace tf { + +// threshold whether or not to perform parallel sort +template +constexpr size_t parallel_sort_cutoff() { + + //using value_type = std::decay_t())>; + using value_type = typename std::iterator_traits::value_type; + + constexpr size_t object_size = sizeof(value_type); + + if constexpr(std::is_same_v) { + return 128; + } + else { + if(object_size < 16) return 4096; + else if(object_size < 32) return 2048; + else if(object_size < 64) return 1024; + else if(object_size < 128) return 768; + else if(object_size < 256) return 512; + else if(object_size < 512) return 256; + else return 128; + } +} + +// ---------------------------------------------------------------------------- +// pattern-defeating quick sort (pdqsort) +// ---------------------------------------------------------------------------- + +// Sorts [begin, end) using insertion sort with the given comparison function. +template +void insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first to avoid 2 moves for an element + // already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + *shift = std::move(tmp); + } + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. +// Assumes *(begin - 1) is an element smaller than or equal to any element +// in [begin, end). +template +void unguarded_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + } + } +} + +// Attempts to use insertion sort on [begin, end). +// Will return false if more than +// partial_insertion_sort_limit elements were moved, +// and abort sorting. Otherwise it will successfully sort and return true. +template +bool partial_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + using D = typename std::iterator_traits::difference_type; + + // When we detect an already sorted partition, attempt an insertion sort + // that allows this amount of element moves before giving up. 
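+  // (Clarifying note: the limit of 8 below caps how much shifting work is
+  // spent on a partition that merely looks nearly sorted before giving up.)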
+ constexpr auto partial_insertion_sort_limit = D{8}; + + if (begin == end) return true; + + auto limit = D{0}; + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + if (limit > partial_insertion_sort_limit) { + return false; + } + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + limit += cur - shift; + } + } + + return true; +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. +// Elements equal to the pivot are put in the right-hand partition. +// Returns the position of the pivot after partitioning and whether the passed +// sequence already was correctly partitioned. +// Assumes the pivot is a median of at least 3 elements and that [begin, end) +// is at least insertion_sort_threshold long. +template +std::pair partition_right(Iter begin, Iter end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot + // (the median of 3 guarantees/ this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. + // We have to guard this search if there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while (!comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition + // are the same element, the passed in sequence already was correctly + // partitioned. + bool already_partitioned = first >= last; + + // Keep swapping pairs of elements that are on the wrong side of the pivot. + // Previously swapped pairs guard the searches, + // which is why the first iteration is special-cased above. + while (first < last) { + std::iter_swap(first, last); + while (comp(*++first, pivot)); + while (!comp(*--last, pivot)); + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Similar function to the one above, except elements equal to the pivot +// are put to the left of the pivot and it doesn't check or return +// if the passed sequence already was partitioned. +// Since this is rarely used (the many equal case), +// and in that case pdqsort already has O(n) performance, +// no block quicksort is applied here for simplicity. 
+template +RandItr partition_left(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + T pivot(std::move(*begin)); + + RandItr first = begin; + RandItr last = end; + + while (comp(pivot, *--last)); + + if (last + 1 == end) { + while (first < last && !comp(pivot, *++first)); + } + else { + while (!comp(pivot, *++first)); + } + + while (first < last) { + std::iter_swap(first, last); + while (comp(pivot, *--last)); + while (!comp(pivot, *++first)); + } + + RandItr pivot_pos = last; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return pivot_pos; +} + +template +void parallel_pdqsort( + tf::Subflow& sf, + Iter begin, Iter end, Compare comp, + int bad_allowed, bool leftmost = true +) { + + // Partitions below this size are sorted sequentially + constexpr auto cutoff = parallel_sort_cutoff(); + + // Partitions below this size are sorted using insertion sort + constexpr auto insertion_sort_threshold = 24; + + // Partitions above this size use Tukey's ninther to select the pivot. + constexpr auto ninther_threshold = 128; + + //using diff_t = typename std::iterator_traits::difference_type; + + // Use a while loop for tail recursion elimination. + while (true) { + + //diff_t size = end - begin; + size_t size = end - begin; + + if(size <= cutoff) { + std::sort(begin, end, comp); + return; + } + //// Insertion sort is faster for small arrays. + //if (size < insertion_sort_threshold) { + // if (leftmost) { + // insertion_sort(begin, end, comp); + // } + // else { + // unguarded_insertion_sort(begin, end, comp); + // } + // return; + //} + + // Choose pivot as median of 3 or pseudomedian of 9. + //diff_t s2 = size / 2; + size_t s2 = size >> 1; + if (size > ninther_threshold) { + sort3(begin, begin + s2, end - 1, comp); + sort3(begin + 1, begin + (s2 - 1), end - 2, comp); + sort3(begin + 2, begin + (s2 + 1), end - 3, comp); + sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp); + std::iter_swap(begin, begin + s2); + } + else { + sort3(begin + s2, begin, end - 1, comp); + } + + // If *(begin - 1) is the end of the right partition + // of a previous partition operation, there is no element in [begin, end) + // that is smaller than *(begin - 1). + // Then if our pivot compares equal to *(begin - 1) we change strategy, + // putting equal elements in the left partition, + // greater elements in the right partition. + // We do not have to recurse on the left partition, + // since it's sorted (all equal). + if (!leftmost && !comp(*(begin - 1), *begin)) { + begin = partition_left(begin, end, comp) + 1; + continue; + } + + // Partition and get results. + auto pair = partition_right(begin, end, comp); + auto pivot_pos = pair.first; + auto already_partitioned = pair.second; + + // Check for a highly unbalanced partition. + //diff_t l_size = pivot_pos - begin; + //diff_t r_size = end - (pivot_pos + 1); + size_t l_size = pivot_pos - begin; + size_t r_size = end - (pivot_pos + 1); + bool highly_unbalanced = l_size < size / 8 || r_size < size / 8; + + // If we got a highly unbalanced partition we shuffle elements + // to break many patterns. + if (highly_unbalanced) { + // If we had too many bad partitions, switch to heapsort + // to guarantee O(n log n). 
+ if (--bad_allowed == 0) { + std::make_heap(begin, end, comp); + std::sort_heap(begin, end, comp); + return; + } + + if (l_size >= insertion_sort_threshold) { + std::iter_swap(begin, begin + l_size / 4); + std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4); + if (l_size > ninther_threshold) { + std::iter_swap(begin + 1, begin + (l_size / 4 + 1)); + std::iter_swap(begin + 2, begin + (l_size / 4 + 2)); + std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1)); + std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2)); + } + } + + if (r_size >= insertion_sort_threshold) { + std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4)); + std::iter_swap(end - 1, end - r_size / 4); + if (r_size > ninther_threshold) { + std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4)); + std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4)); + std::iter_swap(end - 2, end - (1 + r_size / 4)); + std::iter_swap(end - 3, end - (2 + r_size / 4)); + } + } + } + // decently balanced + else { + // sequence try to use insertion sort. + if (already_partitioned && + partial_insertion_sort(begin, pivot_pos, comp) && + partial_insertion_sort(pivot_pos + 1, end, comp) + ) { + return; + } + } + + // Sort the left partition first using recursion and + // do tail recursion elimination for the right-hand partition. + sf.silent_async( + [&sf, begin, pivot_pos, &comp, bad_allowed, leftmost] () mutable { + parallel_pdqsort(sf, begin, pivot_pos, comp, bad_allowed, leftmost); + } + ); + begin = pivot_pos + 1; + leftmost = false; + } +} + +// ---------------------------------------------------------------------------- +// 3-way quick sort +// ---------------------------------------------------------------------------- + +// 3-way quick sort +template +void parallel_3wqsort(tf::Subflow& sf, RandItr first, RandItr last, C compare) { + + using namespace std::string_literals; + + constexpr auto cutoff = parallel_sort_cutoff(); + + sort_partition: + + if(static_cast(last - first) < cutoff) { + std::sort(first, last+1, compare); + return; + } + + auto m = pseudo_median_of_nine(first, last, compare); + + if(m != first) { + std::iter_swap(first, m); + } + + auto l = first; + auto r = last; + auto f = std::next(first, 1); + bool is_swapped_l = false; + bool is_swapped_r = false; + + while(f <= r) { + if(compare(*f, *l)) { + is_swapped_l = true; + std::iter_swap(l, f); + l++; + f++; + } + else if(compare(*l, *f)) { + is_swapped_r = true; + std::iter_swap(r, f); + r--; + } + else { + f++; + } + } + + if(l - first > 1 && is_swapped_l) { + //sf.emplace([&](tf::Subflow& sfl) mutable { + // parallel_3wqsort(sfl, first, l-1, compare); + //}); + sf.silent_async([&sf, first, l, &compare] () mutable { + parallel_3wqsort(sf, first, l-1, compare); + }); + } + + if(last - r > 1 && is_swapped_r) { + //sf.emplace([&](tf::Subflow& sfr) mutable { + // parallel_3wqsort(sfr, r+1, last, compare); + //}); + //sf.silent_async([&sf, r, last, &compare] () mutable { + // parallel_3wqsort(sf, r+1, last, compare); + //}); + first = r+1; + goto sort_partition; + } + + //sf.join(); +} + +// ---------------------------------------------------------------------------- +// tf::Taskflow::sort +// ---------------------------------------------------------------------------- + +// Function: sort +template +Task FlowBuilder::sort(B&& beg, E&& end, C&& cmp) { + + using I = stateful_iterator_t; + + Task task = emplace( + [b=std::forward(beg), + e=std::forward(end), + c=std::forward(cmp) + ] (Subflow& sf) mutable { + + // fetch the iterator values + I beg = 
b; + I end = e; + + if(beg == end) { + return; + } + + size_t W = sf._executor.num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= parallel_sort_cutoff()) { + std::sort(beg, end, c); + return; + } + + //parallel_3wqsort(sf, beg, end-1, c); + parallel_pdqsort(sf, beg, end, c, log2(end - beg)); + + sf.join(); + }); + + return task; +} + +// Function: sort +template +Task FlowBuilder::sort(B&& beg, E&& end) { + + using I = stateful_iterator_t; + //using value_type = std::decay_t())>; + using value_type = typename std::iterator_traits::value_type; + + return sort( + std::forward(beg), std::forward(end), std::less{} + ); +} + +} // namespace tf ------------------------------------------------------------ + diff --git a/taskflow/core/declarations.hpp b/taskflow/core/declarations.hpp new file mode 100644 index 0000000..b7f1b24 --- /dev/null +++ b/taskflow/core/declarations.hpp @@ -0,0 +1,44 @@ +#pragma once + +namespace tf { + +// taskflow +class AsyncTopology; +class Node; +class Graph; +class FlowBuilder; +class Semaphore; +class Subflow; +class Task; +class TaskView; +class Taskflow; +class Topology; +class TopologyBase; +class Executor; +class WorkerView; +class ObserverInterface; +class ChromeTracingObserver; +class TFProfObserver; +class TFProfManager; + +template +class Future; + +// cudaFlow +class cudaNode; +class cudaGraph; +class cudaTask; +class cudaFlow; +class cudaFlowCapturer; +class cudaFlowCapturerBase; +class cudaCapturingBase; +class cudaSequentialCapturing; +class cudaRoundRobinCapturing; +class cublasFlowCapturer; + + +} // end of namespace tf ----------------------------------------------------- + + + + diff --git a/taskflow/core/environment.hpp b/taskflow/core/environment.hpp new file mode 100644 index 0000000..f9013b6 --- /dev/null +++ b/taskflow/core/environment.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define TF_ENABLE_PROFILER "TF_ENABLE_PROFILER" + +namespace tf { + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/core/error.hpp b/taskflow/core/error.hpp new file mode 100644 index 0000000..6a68bea --- /dev/null +++ b/taskflow/core/error.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +#include "../utility/stream.hpp" + +namespace tf { + +// Procedure: throw_se +// Throws the system error under a given error code. +template +//void throw_se(const char* fname, const size_t line, Error::Code c, ArgsT&&... args) { +void throw_re(const char* fname, const size_t line, ArgsT&&... args) { + std::ostringstream oss; + oss << "[" << fname << ":" << line << "] "; + //ostreamize(oss, std::forward(args)...); + (oss << ... << args); + throw std::runtime_error(oss.str()); +} + +} // ------------------------------------------------------------------------ + +#define TF_THROW(...) 
tf::throw_re(__FILE__, __LINE__, __VA_ARGS__); + diff --git a/taskflow/core/executor.hpp b/taskflow/core/executor.hpp new file mode 100644 index 0000000..c5047ef --- /dev/null +++ b/taskflow/core/executor.hpp @@ -0,0 +1,1249 @@ +#pragma once + +#include "observer.hpp" +#include "taskflow.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + + +/** @class Executor + +@brief execution interface for running a taskflow graph + +An executor object manages a set of worker threads to run taskflow(s) +using an efficient work-stealing scheduling algorithm. + +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class cudaFlow; + + struct PerThread { + Worker* worker; + PerThread() : worker {nullptr} { } + }; + + public: + + /** + @brief constructs the executor with N worker threads + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + */ + ~Executor(); + + /** + @brief runs the taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that will holds the result of the execution + */ + tf::Future run(Taskflow& taskflow); + + /** + @brief runs the taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that will holds the result of the execution + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); + + /** + @brief runs the taskflow for N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that will holds the result of the execution + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs the taskflow for N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that will holds the result of the execution + */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs the taskflow multiple times until the predicate becomes true and + then invokes a callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return true for stop + + @return a tf::Future that will holds the result of the execution + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs the taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return true for stop + @param callable a callable object to be invoked after this run + + @return a tf::Future that will holds the result of the execution + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief wait for all pending graphs to complete + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads (can be zero) + */ + size_t num_workers() const; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. 
+ */ + size_t num_topologies() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id from 0 to N-1 exclusive to the associated executor. + If the caller thread does not belong to the executor, -1 is returned. + */ + int this_worker_id() const; + + /** + @brief runs a given function asynchronously + + @tparam F callable type + @tparam ArgsT parameter types + + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + This method is thread-safe. Multiple threads can launch asynchronous tasks + at the same time. + */ + template + auto async(F&& f, ArgsT&&... args); + + /** + @brief similar to tf::Executor::async but does not return a future object + */ + template + void silent_async(F&& f, ArgsT&&... args); + + /** + @brief constructs an observer to inspect the activities of worker threads + + Each executor manage a list of observers in shared ownership with callers. + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes the associated observer + */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const; + + private: + + inline static thread_local PerThread _per_thread; + + const size_t _VICTIM_BEG; + const size_t _VICTIM_END; + const size_t _MAX_STEALS; + const size_t _MAX_YIELDS; + + std::condition_variable _topology_cv; + std::mutex _topology_mutex; + std::mutex _wsq_mutex; + + size_t _num_topologies {0}; + + std::vector _workers; + std::vector _threads; + + Notifier _notifier; + + TaskQueue _wsq; + + std::atomic _num_actives {0}; + std::atomic _num_thieves {0}; + std::atomic _done {0}; + + std::unordered_set> _observers; + + bool _wait_for_task(Worker&, Node*&); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _worker_loop(Worker&); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _schedule(Node*); + void _schedule(const std::vector&); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _invoke_dynamic_task_internal(Worker&, Node*, Graph&, bool); + void _invoke_dynamic_task_external(Node*, Graph&, bool); + void _invoke_condition_task(Worker&, Node*, int&); + void _invoke_module_task(Worker&, Node*); + void _invoke_async_task(Worker&, Node*); + void _invoke_silent_async_task(Worker&, Node*); + void _set_up_topology(Topology*); + void _tear_down_topology(Topology*); + void _tear_down_async(Node*); + void _tear_down_invoke(Node*, bool); + void _increment_topology(); + void _decrement_topology(); + void _decrement_topology_and_notify(); + void _invoke_cudaflow_task(Worker&, Node*); + + template , void>* = nullptr + > + void _invoke_cudaflow_task_entry(C&&, Node*); + + template , void>* = nullptr + > + void _invoke_cudaflow_task_entry(C&&, Node*); + + //template + //void _invoke_cudaflow_task_internal(cudaFlow&, P&&, bool); + + //template + //void _invoke_cudaflow_task_external(cudaFlow&, P&&, bool); +}; + +// Constructor +inline Executor::Executor(size_t N) : + _VICTIM_BEG {0}, + _VICTIM_END {N - 1}, + 
_MAX_STEALS {(N + 1) << 1}, + _MAX_YIELDS {100}, + _workers {N}, + _notifier {N} { + + if(N == 0) { + TF_THROW("no cpu workers to execute taskflows"); + } + + _spawn(N); + + // instantite the default observer if requested + if(has_env(TF_ENABLE_PROFILER)) { + TFProfManager::get()._manage(make_observer()); + } +} + +// Destructor +inline Executor::~Executor() { + + // wait for all topologies to complete + wait_for_all(); + + // shut down the scheduler + _done = true; + + _notifier.notify(true); + + for(auto& t : _threads){ + t.join(); + } + + // flush the default observer + //_flush_tfprof(); +} + +// Function: num_workers +inline size_t Executor::num_workers() const { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { + return _num_topologies; +} + +// Function: async +template +auto Executor::async(F&& f, ArgsT&&... args) { + + _increment_topology(); + + using T = std::invoke_result_t; + using R = std::conditional_t, void, std::optional>; + + std::promise p; + + auto tpg = std::make_shared(); + + Future fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [p=make_moc(std::move(p)), f=std::forward(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + _schedule(node); + + return fu; +} + +// Function: silent_async +template +void Executor::silent_async(F&& f, ArgsT&&... args) { + + _increment_topology(); + + Node* node = node_pool.animate( + std::in_place_type_t{}, + [f=std::forward(f), args...] () mutable { + f(args...); + } + ); + + _schedule(node); +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto worker = _per_thread.worker; + return worker ? static_cast(worker->_id) : -1; +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + for(size_t id=0; id void { + + _per_thread.worker = &w; + + Node* t = nullptr; + + // must use 1 as condition instead of !done + while(1) { + + // execute the tasks. + _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + + }, std::ref(_workers[id])); + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + + std::uniform_int_distribution rdvtm(_VICTIM_BEG, _VICTIM_END); + + //while(!_done) { + // + // size_t vtm = rdvtm(w._rdgen); + // + // t = (vtm == w._id) ? _wsq[d].steal() : _workers[vtm].wsq[d].steal(); + + // if(t) { + // break; + // } + + // if(num_steal++ > _MAX_STEALS) { + // std::this_thread::yield(); + // if(num_yields++ > _MAX_YIELDS) { + // break; + // } + // } + //} + + do { + t = (w._id == w._vtm) ? 
_wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); + if(num_yields++ > _MAX_YIELDS) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } while(!_done); + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + + if(t) { + + if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) { + _notifier.notify(false); + } + + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } + + --_num_actives; + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + wait_for_task: + + assert(!t); + + ++_num_thieves; + + explore_task: + + _explore_task(worker, t); + + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + + _notifier.prepare_wait(worker._waiter); + + //if(auto vtm = _find_vtm(me); vtm != _workers.size()) { + if(!_wsq.empty()) { + + _notifier.cancel_wait(worker._waiter); + //t = (vtm == me) ? _wsq.steal() : _workers[vtm].wsq.steal(); + + t = _wsq.steal(); // must steal here + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + else { + worker._vtm = worker._id; + goto explore_task; + } + } + + if(_done) { + _notifier.cancel_wait(worker._waiter); + _notifier.notify(true); + --_num_thieves; + return false; + } + + if(_num_thieves.fetch_sub(1) == 1) { + if(_num_actives) { + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + // check all queues again + for(auto& w : _workers) { + if(!w._wsq.empty()) { + worker._vtm = w._id; + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + } + } + + // Now I really need to relinguish my self to others + _notifier.commit_wait(worker._waiter); + + return true; +} + +// Function: make_observer +template +std::shared_ptr Executor::make_observer(ArgsT&&... args) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + // use a local variable to mimic the constructor + auto ptr = std::make_shared(std::forward(args)...); + + ptr->set_up(_workers.size()); + + _observers.emplace(std::static_pointer_cast(ptr)); + + return ptr; +} + +// Procedure: remove_observer +template +void Executor::remove_observer(std::shared_ptr ptr) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + _observers.erase(std::static_pointer_cast(ptr)); +} + +// Function: num_observers +inline size_t Executor::num_observers() const { + return _observers.size(); +} + +// Procedure: _schedule +// The main procedure to schedule a give task node. +// Each task node has two types of tasks - regular and subflow. +inline void Executor::_schedule(Node* node) { + + //assert(_workers.size() != 0); + + // caller is a worker to this pool + auto worker = _per_thread.worker; + + if(worker != nullptr && worker->_executor == this) { + worker->_wsq.push(node); + return; + } + + // other threads + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +// The main procedure to schedule a set of task nodes. +// Each task node has two types of tasks - regular and subflow. +inline void Executor::_schedule(const std::vector& nodes) { + + //assert(_workers.size() != 0); + + // We need to cacth the node count to avoid accessing the nodes + // vector while the parent topology is removed! 
+ const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // worker thread + auto worker = _per_thread.worker; + + if(worker != nullptr && worker->_executor == this) { + for(size_t i=0; i_wsq.push(nodes[i]); + } + return; + } + + // other threads + { + std::lock_guard lock(_wsq_mutex); + for(size_t k=0; k_topology && node->_topology->_is_cancelled) { + if(node->_is_cancelled()) { + _tear_down_invoke(node, true); + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + std::vector nodes; + if(!node->_acquire_all(nodes)) { + _schedule(nodes); + return; + } + node->_set_state(Node::ACQUIRED); + } + + // Here we need to fetch the num_successors first to avoid the invalid memory + // access caused by topology clear. + const auto num_successors = node->num_successors(); + + // condition task + int cond = -1; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // dynamic task + case Node::DYNAMIC: { + _invoke_dynamic_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, cond); + } + break; + + // module task + case Node::MODULE: { + _invoke_module_task(worker, node); + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_invoke(node, false); + return ; + } + break; + + // silent async task + case Node::SILENT_ASYNC: { + _invoke_silent_async_task(worker, node); + _tear_down_invoke(node, false); + return ; + } + break; + + // cudaflow task + case Node::CUDAFLOW: { + _invoke_cudaflow_task(worker, node); + } + break; + + // monostate + default: + break; + } + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(node->_release_all()); + } + + // We MUST recover the dependency since the graph may have cycles. + // This must be done before scheduling the successors, otherwise this might cause + // race condition on the _dependents + if(node->_has_state(Node::BRANCHED)) { + node->_join_counter = node->num_strong_dependents(); + } + else { + node->_join_counter = node->num_dependents(); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + + // At this point, the node storage might be destructed (to be verified) + // case 1: non-condition task + if(node->_handle.index() != Node::CONDITION) { + for(size_t i=0; i_successors[i]->_join_counter) == 0) { + j.fetch_add(1); + _schedule(node->_successors[i]); + } + } + } + // case 2: condition task + else { + if(cond >= 0 && static_cast(cond) < num_successors) { + auto s = node->_successors[cond]; + s->_join_counter.store(0); // seems redundant but just for invariant + j.fetch_add(1); + _schedule(s); + } + } + + // tear_down the invoke + _tear_down_invoke(node, false); +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1); + } + else { + _decrement_topology_and_notify(); + } + node_pool.recycle(node); +} + +// Procedure: _tear_down_invoke +inline void Executor::_tear_down_invoke(Node* node, bool cancel) { + + switch(node->_handle.index()) { + // async task needs to carry out the promise + case Node::ASYNC: + if(cancel) { + std::get(node->_handle).work(true); + } + _tear_down_async(node); + break; + + // silent async doesn't need to carry out the promise + case Node::SILENT_ASYNC: + _tear_down_async(node); + break; + + // tear down topology if the node is the last leaf + default: { + if(node->_parent == nullptr) { + if(node->_topology->_join_counter.fetch_sub(1) == 1) { + _tear_down_topology(node->_topology); + } + } + else { // joined subflow + node->_parent->_join_counter.fetch_sub(1); + } + } + break; + } +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get(node->_handle).work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dynamic_task +inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { + + _observer_prologue(w, node); + + auto& handle = std::get(node->_handle); + + handle.subgraph.clear(); + + Subflow sf(*this, node, handle.subgraph); + + handle.work(sf); + + if(sf._joinable) { + _invoke_dynamic_task_internal(w, node, handle.subgraph, false); + } + + _observer_epilogue(w, node); +} + +// Procedure: _invoke_dynamic_task_external +inline void Executor::_invoke_dynamic_task_external(Node*p, Graph& g, bool detach) { + + auto worker = _per_thread.worker; + + assert(worker && worker->_executor == this); + + _invoke_dynamic_task_internal(*worker, p, g, detach); +} + +// Procedure: _invoke_dynamic_task_internal +inline void Executor::_invoke_dynamic_task_internal( + Worker& w, Node* p, Graph& g, bool detach +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + std::vector src; + + for(auto n : g._nodes) { + + n->_topology = p->_topology; + n->_set_up_join_counter(); + + if(detach) { + n->_parent = nullptr; + n->_set_state(Node::DETACHED); + } + else { + n->_parent = p; + } + + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + + // detach here + if(detach) { + + { + std::lock_guard 
lock(p->_topology->_taskflow._mtx); + p->_topology->_taskflow._graph.merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size()); + _schedule(src); + } + // join here + else { + p->_join_counter.fetch_add(src.size()); + _schedule(src); + Node* t = nullptr; + + std::uniform_int_distribution rdvtm(_VICTIM_BEG, _VICTIM_END); + + while(p->_join_counter != 0) { + + t = w._wsq.pop(); + + exploit: + + if(t) { + _invoke(w, t); + } + else { + explore: + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + if(t) { + goto exploit; + } + else if(p->_join_counter != 0){ + std::this_thread::yield(); + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } + } +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, int& cond +) { + _observer_prologue(worker, node); + cond = std::get(node->_handle).work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_cudaflow_task +inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get(node->_handle).work(*this, node); + _observer_epilogue(worker, node); +} + + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node) { + _observer_prologue(w, node); + auto module = std::get(node->_handle).module; + _invoke_dynamic_task_internal(w, node, module->_graph, false); + _observer_epilogue(w, node); +} + +// Procedure: _invoke_async_task +inline void Executor::_invoke_async_task(Worker& w, Node* node) { + _observer_prologue(w, node); + std::get(node->_handle).work(false); + _observer_epilogue(w, node); +} + +// Procedure: _invoke_silent_async_task +inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) { + _observer_prologue(w, node); + std::get(node->_handle).work(); + _observer_epilogue(w, node); +} + +// Function: run +inline tf::Future Executor::run(Taskflow& f) { + return run_n(f, 1, [](){}); +} + +// Function: run +template +tf::Future Executor::run(Taskflow& f, C&& c) { + return run_n(f, 1, std::forward(c)); +} + +// Function: run_n +inline tf::Future Executor::run_n(Taskflow& f, size_t repeat) { + return run_n(f, repeat, [](){}); +} + +// Function: run_n +template +tf::Future Executor::run_n(Taskflow& f, size_t repeat, C&& c) { + return run_until( + f, [repeat]() mutable { return repeat-- == 0; }, std::forward(c) + ); +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow& f, P&& pred) { + return run_until(f, std::forward
<P>
(pred), [](){}); +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Topology* tpg) { + + if(tpg->_is_cancelled) { + _tear_down_topology(tpg); + return; + } + + tpg->_sources.clear(); + tpg->_taskflow._graph.clear_detached(); + + // scan each node in the graph and build up the links + for(auto node : tpg->_taskflow._graph._nodes) { + + node->_topology = tpg; + node->_clear_state(); + + if(node->num_dependents() == 0) { + tpg->_sources.push_back(node); + } + + node->_set_up_join_counter(); + } + + tpg->_join_counter = tpg->_sources.size(); + _schedule(tpg->_sources); +} + +// Function: _tear_down_topology +inline void Executor::_tear_down_topology(Topology* tpg) { + + auto &f = tpg->_taskflow; + + //assert(&tpg == &(f._topologies.front())); + + // case 1: we still need to run the topology again + if(!tpg->_is_cancelled && !tpg->_pred()) { + assert(tpg->_join_counter == 0); + tpg->_join_counter = tpg->_sources.size(); + _schedule(tpg->_sources); + } + // case 2: the final run of this topology + else { + + // TODO: if the topology is cancelled, need to release all constraints + + if(tpg->_call != nullptr) { + tpg->_call(); + } + + f._mtx.lock(); + + // If there is another run (interleave between lock) + if(f._topologies.size() > 1) { + + assert(tpg->_join_counter == 0); + + // Set the promise + tpg->_promise.set_value(); + f._topologies.pop(); + tpg = f._topologies.front().get(); + + f._mtx.unlock(); + + // decrement the topology but since this is not the last we don't notify + _decrement_topology(); + + _set_up_topology(tpg); + } + else { + assert(f._topologies.size() == 1); + + // Need to back up the promise first here becuz taskflow might be + // destroy soon after calling get + auto p {std::move(tpg->_promise)}; + + // Back up lambda capture in case it has the topology pointer, + // to avoid it releasing on pop_front ahead of _mtx.unlock & + // _promise.set_value. Released safely when leaving scope. + auto c { std::move( tpg->_call ) }; + + f._topologies.pop(); + + f._mtx.unlock(); + + // We set the promise in the end in case taskflow leaves before taskflow + p.set_value(); + + _decrement_topology_and_notify(); + } + } +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow& f, P&& pred, C&& c) { + + _increment_topology(); + + // Special case of predicate + if(f.empty() || pred()) { + std::promise promise; + promise.set_value(); + _decrement_topology_and_notify(); + return tf::Future(promise.get_future(), std::monostate{}); + } + + // Multi-threaded execution. + bool run_now {false}; + + // create a topology for this run + auto tpg = std::make_shared( + f, std::forward
<P>
(pred), std::forward(c) + ); + + // need to create future before the topology got torn down quickly + tf::Future future(tpg->_promise.get_future(), tpg); + + { + std::lock_guard lock(f._mtx); + + f._topologies.push(tpg); + + if(f._topologies.size() == 1) { + run_now = true; + } + } + + // Notice here calling schedule may cause the topology to be removed sonner + // before the function leaves. + if(run_now) { + _set_up_topology(tpg.get()); + } + + return future; +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { + std::lock_guard lock(_topology_mutex); + ++_num_topologies; +} + +// Procedure: _decrement_topology_and_notify +inline void Executor::_decrement_topology_and_notify() { + std::lock_guard lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { + std::lock_guard lock(_topology_mutex); + --_num_topologies; +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { + std::unique_lock lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + _executor._invoke_dynamic_task_external(_parent, _graph, false); + _joinable = false; +} + +inline void Subflow::detach() { + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + _executor._invoke_dynamic_task_external(_parent, _graph, true); + _joinable = false; +} + +// Function: async +template +auto Subflow::async(F&& f, ArgsT&&... args) { + + _parent->_join_counter.fetch_add(1); + + //using T = typename function_traits::return_type; + using T = std::invoke_result_t; + using R = std::conditional_t, void, std::optional>; + + std::promise p; + + auto tpg = std::make_shared(); + + Future fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [p=make_moc(std::move(p)), f=std::forward(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(node); + + return fu; +} + +// Function: silent_async +template +void Subflow::silent_async(F&& f, ArgsT&&... args) { + + _parent->_join_counter.fetch_add(1); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [f=std::forward(f), args...] 
() mutable { + f(args...); + } + ); + + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(node); +} + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/taskflow/core/flow_builder.hpp b/taskflow/core/flow_builder.hpp new file mode 100644 index 0000000..a022138 --- /dev/null +++ b/taskflow/core/flow_builder.hpp @@ -0,0 +1,1001 @@ +#pragma once + +#include "task.hpp" + +/** +@file flow_builder.hpp +@brief flow builder include file +*/ + +namespace tf { + +/** +@class FlowBuilder + +@brief building methods of a task dependency graph + +*/ +class FlowBuilder { + + friend class Executor; + + public: + + /** + @brief creates a static task + + @tparam C callable type constructible from std::function + + @param callable callable to construct a static task + + @return a tf::Task handle + + The following example creates a static task. + + @code{.cpp} + tf::Task static_task = taskflow.emplace([](){}); + @endcode + + Please refer to @ref StaticTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates a dynamic task + + @tparam C callable type constructible from std::function + + @param callable callable to construct a dynamic task + + @return a tf::Task handle + + The following example creates a dynamic task (tf::Subflow) + that spawns two static tasks. + + @code{.cpp} + tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ + tf::Task static_task1 = sf.emplace([](){}); + tf::Task static_task2 = sf.emplace([](){}); + }); + @endcode + + Please refer to @ref DynamicTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates a condition task + + @tparam C callable type constructible from std::function + + @param callable callable to construct a condition task + + @return a tf::Task handle + + The following example creates an if-else block using one condition task + and three static tasks. + + @code{.cpp} + tf::Taskflow taskflow; + + auto [init, cond, yes, no] = taskflow.emplace( + [] () { }, + [] () { return 0; }, + [] () { std::cout << "yes\n"; }, + [] () { std::cout << "no\n"; } + ); + + // executes yes if cond returns 0, or no if cond returns 1 + cond.precede(yes, no); + cond.succeed(init); + @endcode + + Please refer to @ref ConditionalTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates multiple tasks from a list of callable objects + + @tparam C callable types + + @param callables one or multiple callable objects constructible from each task category + + @return a tf::Task handle + + The method returns a tuple of tasks each corresponding to the given + callable target. You can use structured binding to get the return tasks + one by one. + The following example creates four static tasks and assign them to + @c A, @c B, @c C, and @c D using structured binding. + + @code{.cpp} + auto [A, B, C, D] = taskflow.emplace( + [] () { std::cout << "A"; }, + [] () { std::cout << "B"; }, + [] () { std::cout << "C"; }, + [] () { std::cout << "D"; } + ); + @endcode + */ + template 1), void>* = nullptr> + auto emplace(C&&... callables); + + /** + @brief creates a module task from a taskflow + + @param taskflow a taskflow object for the module + + @return a tf::Task handle + + Please refer to @ref ComposableTasking for details. 
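+
+  A minimal usage sketch, not part of the original documentation; the
+  taskflow names @c core and @c outer below are illustrative:
+
+  @code{.cpp}
+  tf::Taskflow core;   // reusable module graph
+  core.emplace([](){ std::cout << "module work\n"; });
+
+  tf::Taskflow outer;
+  tf::Task before = outer.emplace([](){ std::cout << "before module\n"; });
+  tf::Task module = outer.composed_of(core);  // run core as a single task
+  before.precede(module);
+
+  tf::Executor executor;
+  executor.run(outer).wait();
+  @endcode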
+ */ + Task composed_of(Taskflow& taskflow); + + /** + @brief creates a placeholder task + + @return a tf::Task handle + + A placeholder task maps to a node in the taskflow graph, but + it does not have any callable work assigned yet. + A placeholder task is different from an empty task handle that + does not point to any node in a graph. + + @code{.cpp} + // create a placeholder task with no callable target assigned + tf::Task placeholder = taskflow.placeholder(); + assert(placeholder.empty() == false && placeholder.has_work() == false); + + // create an empty task handle + tf::Task task; + assert(task.empty() == true); + + // assign the task handle to the placeholder task + task = placeholder; + assert(task.empty() == false && task.has_work() == false); + @endcode + */ + Task placeholder(); + + /** + @brief creates a %cudaFlow task on the caller's GPU device context + + @tparam C callable type constructible from @c std::function + + @return a tf::Task handle + + This method is equivalent to calling tf::FlowBuilder::emplace_on(callable, d) + where @c d is the caller's device context. + The following example creates a %cudaFlow of two kernel tasks, @c task1 and + @c task2, where @c task1 runs before @c task2. + + @code{.cpp} + taskflow.emplace([&](tf::cudaFlow& cf){ + // create two kernel tasks + tf::cudaTask task1 = cf.kernel(grid1, block1, shm1, kernel1, args1); + tf::cudaTask task2 = cf.kernel(grid2, block2, shm2, kernel2, args2); + + // kernel1 runs before kernel2 + task1.precede(task2); + }); + @endcode + + Please refer to @ref GPUTaskingcudaFlow and @ref GPUTaskingcudaFlowCapturer + for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); + + /** + @brief creates a %cudaFlow task on the given device + + @tparam C callable type constructible from std::function + @tparam D device type, either @c int or @c std::ref (stateful) + + @return a tf::Task handle + + The following example creates a %cudaFlow of two kernel tasks, @c task1 and + @c task2 on GPU @c 2, where @c task1 runs before @c task2 + + @code{.cpp} + taskflow.emplace_on([&](tf::cudaFlow& cf){ + // create two kernel tasks + tf::cudaTask task1 = cf.kernel(grid1, block1, shm1, kernel1, args1); + tf::cudaTask task2 = cf.kernel(grid2, block2, shm2, kernel2, args2); + + // kernel1 runs before kernel2 + task1.precede(task2); + }, 2); + @endcode + */ + template , void>* = nullptr + > + Task emplace_on(C&& callable, D&& device); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks a vector of tasks + */ + void linearize(std::vector& tasks); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks an initializer list of tasks + */ + void linearize(std::initializer_list tasks); + + // ------------------------------------------------------------------------ + // parallel iterations + // ------------------------------------------------------------------------ + + /** + @brief constructs a STL-styled parallel-for task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [first, last). By default, we employ the guided partition algorithm with chunk size equal to one. 
+ This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + callable(*itr); + } + @endcode + + Arguments templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of + the dereferenced iterator type. + + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each(B&& first, E&& last, C&& callable); + + /** + @brief constructs a STL-styled parallel-for task using the guided partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + @tparam H chunk size type + + @param beg iterator to the beginning (inclusive) + @param end iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the dereferenced iterator type. + + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each_guided(B&& beg, E&& end, C&& callable, H&& chunk_size = 1); + + /** + @brief constructs a STL-styled parallel-for task using the dynamic partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + @tparam H chunk size type + + @param beg iterator to the beginning (inclusive) + @param end iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the dereferenced iterator type. + + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each_dynamic(B&& beg, E&& end, C&& callable, H&& chunk_size = 1); + + /** + @brief constructs a STL-styled parallel-for task using the dynamic partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + @tparam H chunk size type + + @param beg iterator to the beginning (inclusive) + @param end iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. When the given chunk size is zero, the runtime distributes the work evenly across workers. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the dereferenced iterator type. + + Please refer to @ref ParallelIterations for details. 
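+
+  A minimal usage sketch, not part of the original documentation; the
+  container name @c items and the chunk size are illustrative:
+
+  @code{.cpp}
+  std::vector<int> items(100, 1);
+  taskflow.for_each_static(items.begin(), items.end(),
+    [](int& v){ v *= 2; },  // applied to every element
+    0                       // chunk size 0: distribute the range evenly across workers
+  );
+  @endcode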
+ */ + template + Task for_each_static( + B&& beg, E&& end, C&& callable, H&& chunk_size = 0 + ); + + /** + @brief constructs an index-based parallel-for task + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + + @param first index of the beginning (inclusive) + @param last index of the end (exclusive) + @param step step size + @param callable a callable object to apply to each valid index + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each index in the range [first, last) with the step size. By default, we employ the guided partition algorithm with chunk size equal to one. + + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + // case 1: step size is positive + for(auto i=first; ilast; i+=step) { + callable(i); + } + @endcode + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the integral index type. + + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each_index(B&& first, E&& last, S&& step, C&& callable); + + /** + @brief constructs an index-based parallel-for task using the guided partition algorithm. + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + @tparam H chunk size type + + @param beg index of the beginning (inclusive) + @param end index of the end (exclusive) + @param step step size + @param callable a callable object to apply to each valid index + @param chunk_size chunk size (default 1) + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the integral index type. + + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each_index_guided( + B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 1 + ); + + /** + @brief constructs an index-based parallel-for task using the dynamic partition algorithm. + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + @tparam H chunk size type + + @param beg index of the beginning (inclusive) + @param end index of the end (exclusive) + @param step step size + @param callable a callable object to apply to each valid index + @param chunk_size chunk size (default 1) + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the integral index type. + + Please refer to @ref ParallelIterations for details. 
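+
+  A minimal usage sketch, not part of the original documentation; the
+  buffer name @c buf and the chunk size are illustrative:
+
+  @code{.cpp}
+  std::vector<double> buf(1000);
+  taskflow.for_each_index_dynamic(0, 1000, 1,
+    [&](int i){ buf[i] = 0.5 * i; },  // applied to every index in [0, 1000)
+    64                                // chunk size
+  );
+  @endcode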
+ */ + template + Task for_each_index_dynamic( + B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 1 + ); + + /** + @brief constructs an index-based parallel-for task using the static partition algorithm. + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + @tparam H chunk size type + + @param beg index of the beginning (inclusive) + @param end index of the end (exclusive) + @param step step size + @param callable a callable object to apply to each valid index + @param chunk_size chunk size (default 0) + + @return a tf::Task handle + + The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. When the given chunk size is zero, the runtime distributes the work evenly across workers. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + The callable needs to take a single argument of the integral index type. + + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each_index_static( + B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 0 + ); + + // ------------------------------------------------------------------------ + // reduction + // ------------------------------------------------------------------------ + + /** + @brief constructs a STL-styled parallel-reduce task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam O binary reducer type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. By default, we employ the guided partition algorithm. + + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + init = bop(init, *itr); + } + @endcode + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task reduce(B&& first, E&& last, T& init, O&& bop); + + /** + @brief constructs a STL-styled parallel-reduce task using the guided partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam O binary reducer type + @tparam H chunk size type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker. 
+ + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task reduce_guided( + B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 1 + ); + + /** + @brief constructs a STL-styled parallel-reduce task using the dynamic partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam O binary reducer type + @tparam H chunk size type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task reduce_dynamic( + B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 1 + ); + + /** + @brief constructs a STL-styled parallel-reduce task using the static partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam O binary reducer type + @tparam H chunk size type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task reduce_static( + B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 0 + ); + + // ------------------------------------------------------------------------ + // transfrom and reduction + // ------------------------------------------------------------------------ + + /** + @brief constructs a STL-styled parallel transform-reduce task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam BOP binary reducer type + @tparam UOP unary transformion type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied in unspecified order to the results of @c uop + @param uop unary operator that will be applied to transform each element in the range to the result type + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. By default, we employ the guided partition algorithm. 
+ + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + init = bop(init, uop(*itr)); + } + @endcode + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task transform_reduce(B&& first, E&& last, T& init, BOP&& bop, UOP&& uop); + + /** + @brief constructs a STL-styled parallel transform-reduce task using the guided partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam BOP binary reducer type + @tparam UOP unary transformion type + @tparam H chunk size type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied in unspecified order to the results of @c uop + @param uop unary operator that will be applied to transform each element in the range to the result type + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task transform_reduce_guided( + B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 1 + ); + + /** + @brief constructs a STL-styled parallel transform-reduce task using the static partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam BOP binary reducer type + @tparam UOP unary transformion type + @tparam H chunk size type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied in unspecified order to the results of @c uop + @param uop unary operator that will be applied to transform each element in the range to the result type + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. 
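+
+  A minimal usage sketch, not part of the original documentation; the
+  container name @c words and the reduction target @c total are illustrative:
+
+  @code{.cpp}
+  std::vector<std::string> words = {"a", "bb", "ccc"};
+  size_t total = 0;
+  taskflow.transform_reduce_static(words.begin(), words.end(), total,
+    std::plus<size_t>{},                          // reduce the transformed values
+    [](const std::string& w){ return w.size(); }  // transform: string -> length
+  );
+  @endcode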
+ */ + template + Task transform_reduce_static( + B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 0 + ); + + /** + @brief constructs a STL-styled parallel transform-reduce task using the dynamic partition algorithm + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam T result type + @tparam BOP binary reducer type + @tparam UOP unary transformion type + @tparam H chunk size type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param init initial value of the reduction and the storage for the reduced result + @param bop binary operator that will be applied in unspecified order to the results of @c uop + @param uop unary operator that will be applied to transform each element in the range to the result type + @param chunk_size chunk size + + @return a tf::Task handle + + The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is store in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelReduction for details. + */ + template + Task transform_reduce_dynamic( + B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 1 + ); + + // ------------------------------------------------------------------------ + // sort + // ------------------------------------------------------------------------ + + /** + @brief constructs a dynamic task to perform STL-styled parallel sort + + @tparam B beginning iterator type (random-accessible) + @tparam E ending iterator type (random-accessible) + @tparam C comparator type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param cmp comparison function object + + The task spawns a subflow to parallelly sort elements in the range + [first, last). + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelSort for details. + */ + template + Task sort(B&& first, E&& last, C&& cmp); + + /** + @brief constructs a dynamic task to perform STL-styled parallel sort using + the @c std::less comparator, where @c T is the element type + + @tparam B beginning iterator type (random-accessible) + @tparam E ending iterator type (random-accessible) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + The task spawns a subflow to parallelly sort elements in the range + [first, last) using the @c std::less comparator, + where @c T is the dereferenced iterator type. + + Arguments are templated to enable stateful passing using std::reference_wrapper. + + Please refer to @ref ParallelSort for details. 
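+
+  A minimal usage sketch, not part of the original documentation; the
+  container name @c data is illustrative:
+
+  @code{.cpp}
+  std::vector<int> data = {3, 1, 4, 1, 5};
+  tf::Taskflow taskflow;
+  tf::Executor executor;
+  taskflow.sort(data.begin(), data.end());  // sorts with std::less<int>
+  executor.run(taskflow).wait();
+  @endcode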
+ */ + template + Task sort(B&& first, E&& last); + + protected: + + /** + @brief constructs a flow builder with a graph + */ + FlowBuilder(Graph& graph); + + /** + @brief associated graph object + */ + Graph& _graph; + + private: + + template + void _linearize(L&); +}; + +// Constructor +inline FlowBuilder::FlowBuilder(Graph& graph) : + _graph {graph} { +} + +// Function: emplace +template , void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph.emplace_back( + std::in_place_type_t{}, std::forward(c) + )); +} + +// Function: emplace +template , void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph.emplace_back( + std::in_place_type_t{}, std::forward(c) + )); +} + +// Function: emplace +template , void>*> +Task FlowBuilder::emplace(C&& c) { + return Task(_graph.emplace_back( + std::in_place_type_t{}, std::forward(c) + )); +} + +// Function: emplace +template 1), void>*> +auto FlowBuilder::emplace(C&&... cs) { + return std::make_tuple(emplace(std::forward(cs))...); +} + +// Function: composed_of +inline Task FlowBuilder::composed_of(Taskflow& taskflow) { + auto node = _graph.emplace_back( + std::in_place_type_t{}, &taskflow + ); + return Task(node); +} + +// Function: placeholder +inline Task FlowBuilder::placeholder() { + auto node = _graph.emplace_back(); + return Task(node); +} + +// Procedure: _linearize +template +void FlowBuilder::_linearize(L& keys) { + + auto itr = keys.begin(); + auto end = keys.end(); + + if(itr == end) { + return; + } + + auto nxt = itr; + + for(++nxt; nxt != end; ++nxt, ++itr) { + itr->_node->_precede(nxt->_node); + } +} + +// Procedure: linearize +inline void FlowBuilder::linearize(std::vector& keys) { + _linearize(keys); +} + +// Procedure: linearize +inline void FlowBuilder::linearize(std::initializer_list keys) { + _linearize(keys); +} + +// ---------------------------------------------------------------------------- + +/** +@class Subflow + +@brief class to construct a subflow graph from the execution of a dynamic task + +By default, a subflow automatically @em joins its parent node. +You may explicitly join or detach a subflow by calling tf::Subflow::join +or tf::Subflow::detach, respectively. +The following example creates a taskflow graph that spawns a subflow from +the execution of task @c B, and the subflow contains three tasks, @c B1, +@c B2, and @c B3, where @c B3 runs after @c B1 and @c B2. + +@code{.cpp} +// create three regular tasks +tf::Task A = taskflow.emplace([](){}).name("A"); +tf::Task C = taskflow.emplace([](){}).name("C"); +tf::Task D = taskflow.emplace([](){}).name("D"); + +// create a subflow graph (dynamic tasking) +tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) { + tf::Task B1 = subflow.emplace([](){}).name("B1"); + tf::Task B2 = subflow.emplace([](){}).name("B2"); + tf::Task B3 = subflow.emplace([](){}).name("B3"); + B1.precede(B3); + B2.precede(B3); +}).name("B"); + +A.precede(B); // B runs after A +A.precede(C); // C runs after A +B.precede(D); // D runs after B +C.precede(D); // D runs after C +@endcode + +*/ +class Subflow : public FlowBuilder { + + friend class Executor; + friend class FlowBuilder; + + public: + + /** + @brief enables the subflow to join its parent task + + Performs an immediate action to join the subflow. Once the subflow is joined, + it is considered finished and you may not modify the subflow anymore. + */ + void join(); + + /** + @brief enables the subflow to detach from its parent task + + Performs an immediate action to detach the subflow. 
Once the subflow is detached, + it is considered finished and you may not modify the subflow anymore. + */ + void detach(); + + /** + @brief queries if the subflow is joinable + + When a subflow is joined or detached, it becomes not joinable. + */ + bool joinable() const; + + /** + @brief runs a given function asynchronously + + @tparam F callable type + @tparam ArgsT parameter types + + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + This method is thread-safe and can be called by multiple tasks in the + subflow at the same time. + The difference to tf::Executor::async is that the created asynchronous task + pertains to the subflow. + When the subflow joins, all asynchronous tasks created from the subflow + are guaranteed to finish before the join. + For example: + + @code{.cpp} + std::atomic counter(0); + taskflow.empalce([&](tf::Subflow& sf){ + for(int i=0; i<100; i++) { + sf.async([&](){ counter++; }); + } + sf.join(); + assert(counter == 100); + }); + @endcode + + You cannot create asynchronous tasks from a detached subflow. + Doing this results in undefined behavior. + */ + template + auto async(F&& f, ArgsT&&... args); + + /** + @brief similar to tf::Subflow::async but did not return a future object + */ + template + void silent_async(F&& f, ArgsT&&... args); + + private: + + Subflow(Executor&, Node*, Graph&); + + Executor& _executor; + Node* _parent; + + bool _joinable {true}; +}; + +// Constructor +inline Subflow::Subflow(Executor& executor, Node* parent, Graph& graph) : + FlowBuilder {graph}, + _executor {executor}, + _parent {parent} { +} + +// Function: joined +inline bool Subflow::joinable() const { + return _joinable; +} + +} // end of namespace tf. 
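+
+/* A minimal usage sketch for tf::Subflow::silent_async, the fire-and-forget
+   counterpart of tf::Subflow::async (the taskflow object below is
+   illustrative):
+
+   @code{.cpp}
+   taskflow.emplace([](tf::Subflow& sf){
+     for(int i=0; i<4; i++) {
+       // unlike tf::Subflow::async, no tf::Future is returned
+       sf.silent_async([i](){ std::printf("silent task %d\n", i); });
+     }
+     sf.join();  // asynchronous tasks created above finish before the join
+   });
+   @endcode
+*/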
--------------------------------------------------- + + diff --git a/taskflow/core/graph.hpp b/taskflow/core/graph.hpp new file mode 100644 index 0000000..06c6dd3 --- /dev/null +++ b/taskflow/core/graph.hpp @@ -0,0 +1,572 @@ +#pragma once + +#include "../utility/iterator.hpp" +#include "../utility/object_pool.hpp" +#include "../utility/traits.hpp" +#include "../utility/singleton.hpp" +#include "../utility/os.hpp" +#include "../utility/math.hpp" +#include "../utility/serializer.hpp" +#include "error.hpp" +#include "declarations.hpp" +#include "semaphore.hpp" +#include "environment.hpp" +#include "topology.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// Class: CustomGraphBase +// ---------------------------------------------------------------------------- +class CustomGraphBase { + + public: + + virtual void dump(std::ostream&, const void*, const std::string&) const = 0; + virtual ~CustomGraphBase() = default; +}; + +// ---------------------------------------------------------------------------- +// Class: Graph +// ---------------------------------------------------------------------------- +class Graph { + + friend class Node; + friend class Taskflow; + friend class Executor; + + public: + + Graph() = default; + Graph(const Graph&) = delete; + Graph(Graph&&); + + ~Graph(); + + Graph& operator = (const Graph&) = delete; + Graph& operator = (Graph&&); + + void clear(); + void clear_detached(); + void merge(Graph&&); + + bool empty() const; + + size_t size() const; + + template + Node* emplace_back(Args&& ...); + + Node* emplace_back(); + + private: + + std::vector _nodes; +}; + +// ---------------------------------------------------------------------------- + +// Class: Node +class Node { + + friend class Graph; + friend class Task; + friend class TaskView; + friend class Taskflow; + friend class Executor; + friend class FlowBuilder; + friend class Subflow; + + TF_ENABLE_POOLABLE_ON_THIS; + + // state bit flag + constexpr static int BRANCHED = 0x1; + constexpr static int DETACHED = 0x2; + constexpr static int ACQUIRED = 0x4; + + // static work handle + struct Static { + + template + Static(C&&); + + std::function work; + }; + + // dynamic work handle + struct Dynamic { + + template + Dynamic(C&&); + + std::function work; + Graph subgraph; + }; + + // condition work handle + struct Condition { + + template + Condition(C&&); + + std::function work; + }; + + // module work handle + struct Module { + + template + Module(T&&); + + Taskflow* module {nullptr}; + }; + + // Async work + struct Async { + + template + Async(T&&, std::shared_ptr); + + std::function work; + + std::shared_ptr topology; + }; + + // Silent async work + struct SilentAsync { + + template + SilentAsync(C&&); + + std::function work; + }; + + // cudaFlow work handle + struct cudaFlow { + + template + cudaFlow(C&& c, G&& g); + + std::function work; + + std::unique_ptr graph; + }; + + using handle_t = std::variant< + std::monostate, // placeholder + Static, // static tasking + Dynamic, // dynamic tasking + Condition, // conditional tasking + Module, // composable tasking + Async, // async tasking + SilentAsync, // async tasking (no future) + cudaFlow // cudaFlow + >; + + struct Semaphores { + std::vector to_acquire; + std::vector to_release; + }; + + public: + + // variant index + constexpr static auto PLACEHOLDER = get_index_v; + constexpr static auto STATIC = get_index_v; + constexpr static auto DYNAMIC = get_index_v; + constexpr static auto CONDITION = get_index_v; + 
constexpr static auto MODULE = get_index_v; + constexpr static auto ASYNC = get_index_v; + constexpr static auto SILENT_ASYNC = get_index_v; + constexpr static auto CUDAFLOW = get_index_v; + + template + Node(Args&&... args); + + ~Node(); + + size_t num_successors() const; + size_t num_dependents() const; + size_t num_strong_dependents() const; + size_t num_weak_dependents() const; + + const std::string& name() const; + + private: + + std::string _name; + + handle_t _handle; + + std::vector _successors; + std::vector _dependents; + + //std::optional _semaphores; + std::unique_ptr _semaphores; + + Topology* _topology {nullptr}; + + Node* _parent {nullptr}; + + int _state {0}; + + std::atomic _join_counter {0}; + + void _precede(Node*); + void _set_state(int); + void _unset_state(int); + void _clear_state(); + void _set_up_join_counter(); + + bool _has_state(int) const; + bool _is_cancelled() const; + bool _acquire_all(std::vector&); + + std::vector _release_all(); +}; + +// ---------------------------------------------------------------------------- +// Node Object Pool +// ---------------------------------------------------------------------------- +inline ObjectPool node_pool; + +// ---------------------------------------------------------------------------- +// Definition for Node::Static +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Static::Static(C&& c) : work {std::forward(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Dynamic +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Dynamic::Dynamic(C&& c) : work {std::forward(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Condition +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Condition::Condition(C&& c) : work {std::forward(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::cudaFlow +// ---------------------------------------------------------------------------- + +template +Node::cudaFlow::cudaFlow(C&& c, G&& g) : + work {std::forward(c)}, + graph {std::forward(g)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Module +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Module::Module(T&& tf) : module {tf} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::Async +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Async::Async(C&& c, std::shared_ptrtpg) : + work {std::forward(c)}, + topology {std::move(tpg)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node::SilentAsync +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::SilentAsync::SilentAsync(C&& c) : + work {std::forward(c)} { +} + +// ---------------------------------------------------------------------------- +// Definition for Node +// ---------------------------------------------------------------------------- + +// Constructor +template +Node::Node(Args&&... 
args): _handle{std::forward(args)...} { +} + +// Destructor +inline Node::~Node() { + // this is to avoid stack overflow + + if(_handle.index() == DYNAMIC) { + + auto& subgraph = std::get(_handle).subgraph; + + std::vector nodes; + + std::move( + subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes) + ); + subgraph._nodes.clear(); + + size_t i = 0; + + while(i < nodes.size()) { + + if(nodes[i]->_handle.index() == DYNAMIC) { + + auto& sbg = std::get(nodes[i]->_handle).subgraph; + std::move( + sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes) + ); + sbg._nodes.clear(); + } + + ++i; + } + + //auto& np = Graph::_node_pool(); + for(i=0; i_dependents.push_back(this); +} + +// Function: num_successors +inline size_t Node::num_successors() const { + return _successors.size(); +} + +// Function: dependents +inline size_t Node::num_dependents() const { + return _dependents.size(); +} + +// Function: num_weak_dependents +inline size_t Node::num_weak_dependents() const { + size_t n = 0; + for(size_t i=0; i<_dependents.size(); i++) { + if(_dependents[i]->_handle.index() == Node::CONDITION) { + n++; + } + } + return n; +} + +// Function: num_strong_dependents +inline size_t Node::num_strong_dependents() const { + size_t n = 0; + for(size_t i=0; i<_dependents.size(); i++) { + if(_dependents[i]->_handle.index() != Node::CONDITION) { + n++; + } + } + return n; +} + +// Function: name +inline const std::string& Node::name() const { + return _name; +} + +// Procedure: _set_state +inline void Node::_set_state(int flag) { + _state |= flag; +} + +// Procedure: _unset_state +inline void Node::_unset_state(int flag) { + _state &= ~flag; +} + +// Procedure: _clear_state +inline void Node::_clear_state() { + _state = 0; +} + +// Function: _has_state +inline bool Node::_has_state(int flag) const { + return _state & flag; +} + +// Function: _is_cancelled +inline bool Node::_is_cancelled() const { + if(_handle.index() == Node::ASYNC) { + auto& h = std::get(_handle); + if(h.topology && h.topology->_is_cancelled) { + return true; + } + } + // async tasks spawned from subflow does not have topology + return _topology && _topology->_is_cancelled; +} + +// Procedure: _set_up_join_counter +inline void Node::_set_up_join_counter() { + + size_t c = 0; + + for(auto p : _dependents) { + if(p->_handle.index() == Node::CONDITION) { + _set_state(Node::BRANCHED); + } + else { + c++; + } + } + + _join_counter.store(c, std::memory_order_relaxed); +} + + +// Function: _acquire_all +inline bool Node::_acquire_all(std::vector& nodes) { + + auto& to_acquire = _semaphores->to_acquire; + + for(size_t i = 0; i < to_acquire.size(); ++i) { + if(!to_acquire[i]->_try_acquire_or_wait(this)) { + for(size_t j = 1; j <= i; ++j) { + auto r = to_acquire[i-j]->_release(); + nodes.insert(end(nodes), begin(r), end(r)); + } + return false; + } + } + return true; +} + +// Function: _release_all +inline std::vector Node::_release_all() { + + auto& to_release = _semaphores->to_release; + + std::vector nodes; + for(const auto& sem : to_release) { + auto r = sem->_release(); + nodes.insert(end(nodes), begin(r), end(r)); + } + return nodes; +} + +// ---------------------------------------------------------------------------- +// Graph definition +// ---------------------------------------------------------------------------- + +//// Function: _node_pool +//inline ObjectPool& Graph::_node_pool() { +// static ObjectPool pool; +// return pool; +//} + +// Destructor +inline Graph::~Graph() { + //auto& np = _node_pool(); + for(auto 
node : _nodes) { + //np.recycle(node); + node_pool.recycle(node); + } +} + +// Move constructor +inline Graph::Graph(Graph&& other) : + _nodes {std::move(other._nodes)} { +} + +// Move assignment +inline Graph& Graph::operator = (Graph&& other) { + _nodes = std::move(other._nodes); + return *this; +} + +// Procedure: clear +inline void Graph::clear() { + //auto& np = _node_pool(); + for(auto node : _nodes) { + //node->~Node(); + //np.deallocate(node); + node_pool.recycle(node); + } + _nodes.clear(); +} + +// Procedure: clear_detached +inline void Graph::clear_detached() { + + auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) { + return !(node->_has_state(Node::DETACHED)); + }); + + //auto& np = _node_pool(); + for(auto itr = mid; itr != _nodes.end(); ++itr) { + node_pool.recycle(*itr); + } + _nodes.resize(std::distance(_nodes.begin(), mid)); +} + +// Procedure: merge +inline void Graph::merge(Graph&& g) { + for(auto n : g._nodes) { + _nodes.push_back(n); + } + g._nodes.clear(); +} + +// Function: size +// query the size +inline size_t Graph::size() const { + return _nodes.size(); +} + +// Function: empty +// query the emptiness +inline bool Graph::empty() const { + return _nodes.empty(); +} + +// Function: emplace_back +// create a node from a give argument; constructor is called if necessary +template +Node* Graph::emplace_back(ArgsT&&... args) { + //auto node = _node_pool().allocate(); + //new (node) Node(std::forward(args)...); + //_nodes.push_back(node); + _nodes.push_back(node_pool.animate(std::forward(args)...)); + return _nodes.back(); +} + +// Function: emplace_back +// create a node from a give argument; constructor is called if necessary +inline Node* Graph::emplace_back() { + //auto node = _node_pool().allocate(); + //new (node) Node(); + //_nodes.push_back(node); + _nodes.push_back(node_pool.animate()); + return _nodes.back(); +} + + +} // end of namespace tf. --------------------------------------------------- + + + + + diff --git a/taskflow/core/notifier.hpp b/taskflow/core/notifier.hpp new file mode 100644 index 0000000..a82f8a5 --- /dev/null +++ b/taskflow/core/notifier.hpp @@ -0,0 +1,267 @@ +// 2019/02/09 - created by Tsung-Wei Huang +// - modified the event count from Eigen + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +namespace tf { + +// Notifier allows to wait for arbitrary predicates in non-blocking +// algorithms. Think of condition variable, but wait predicate does not need to +// be protected by a mutex. Usage: +// Waiting thread does: +// +// if (predicate) +// return act(); +// Notifier::Waiter& w = waiters[my_index]; +// ec.prepare_wait(&w); +// if (predicate) { +// ec.cancel_wait(&w); +// return act(); +// } +// ec.commit_wait(&w); +// +// Notifying thread does: +// +// predicate = true; +// ec.notify(true); +// +// notify is cheap if there are no waiting threads. prepare_wait/commit_wait are not +// cheap, but they are executed only if the preceeding predicate check has +// failed. +// +// Algorihtm outline: +// There are two main variables: predicate (managed by user) and _state. 
+// Operation closely resembles Dekker mutual algorithm: +// https://en.wikipedia.org/wiki/Dekker%27s_algorithm +// Waiting thread sets _state then checks predicate, Notifying thread sets +// predicate then checks _state. Due to seq_cst fences in between these +// operations it is guaranteed than either waiter will see predicate change +// and won't block, or notifying thread will see _state change and will unblock +// the waiter, or both. But it can't happen that both threads don't see each +// other changes, which would lead to deadlock. +class Notifier { + + friend class Executor; + + public: + + struct Waiter { + std::atomic next; + std::mutex mu; + std::condition_variable cv; + uint64_t epoch; + unsigned state; + enum { + kNotSignaled, + kWaiting, + kSignaled, + }; + }; + + explicit Notifier(size_t N) : _waiters{N} { + assert(_waiters.size() < (1 << kWaiterBits) - 1); + // Initialize epoch to something close to overflow to test overflow. + _state = kStackMask | (kEpochMask - kEpochInc * _waiters.size() * 2); + } + + ~Notifier() { + // Ensure there are no waiters. + assert((_state.load() & (kStackMask | kWaiterMask)) == kStackMask); + } + + // prepare_wait prepares for waiting. + // After calling this function the thread must re-check the wait predicate + // and call either cancel_wait or commit_wait passing the same Waiter object. + void prepare_wait(Waiter* w) { + w->epoch = _state.fetch_add(kWaiterInc, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + // commit_wait commits waiting. + void commit_wait(Waiter* w) { + w->state = Waiter::kNotSignaled; + // Modification epoch of this waiter. + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = _state.load(std::memory_order_seq_cst); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either cancel_wait or commit_wait, or is notified. + std::this_thread::yield(); + state = _state.load(std::memory_order_seq_cst); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter and add it to the waiter list. + assert((state & kWaiterMask) != 0); + uint64_t newstate = state - kWaiterInc + kEpochInc; + //newstate = (newstate & ~kStackMask) | (w - &_waiters[0]); + newstate = static_cast((newstate & ~kStackMask) | static_cast(w - &_waiters[0])); + if ((state & kStackMask) == kStackMask) + w->next.store(nullptr, std::memory_order_relaxed); + else + w->next.store(&_waiters[state & kStackMask], std::memory_order_relaxed); + if (_state.compare_exchange_weak(state, newstate, + std::memory_order_release)) + break; + } + _park(w); + } + + // cancel_wait cancels effects of the previous prepare_wait call. + void cancel_wait(Waiter* w) { + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = _state.load(std::memory_order_relaxed); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either cancel_wait or commit_wait, or is notified. + std::this_thread::yield(); + state = _state.load(std::memory_order_relaxed); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter. 
+ assert((state & kWaiterMask) != 0); + if (_state.compare_exchange_weak(state, state - kWaiterInc + kEpochInc, + std::memory_order_relaxed)) + return; + } + } + + // notify wakes one or all waiting threads. + // Must be called after changing the associated wait predicate. + void notify(bool all) { + std::atomic_thread_fence(std::memory_order_seq_cst); + uint64_t state = _state.load(std::memory_order_acquire); + for (;;) { + // Easy case: no waiters. + if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0) + return; + uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + uint64_t newstate; + if (all) { + // Reset prewait counter and empty wait list. + newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask; + } else if (waiters) { + // There is a thread in pre-wait state, unblock it. + newstate = state + kEpochInc - kWaiterInc; + } else { + // Pop a waiter from list and unpark it. + Waiter* w = &_waiters[state & kStackMask]; + Waiter* wnext = w->next.load(std::memory_order_relaxed); + uint64_t next = kStackMask; + //if (wnext != nullptr) next = wnext - &_waiters[0]; + if (wnext != nullptr) next = static_cast(wnext - &_waiters[0]); + // Note: we don't add kEpochInc here. ABA problem on the lock-free stack + // can't happen because a waiter is re-pushed onto the stack only after + // it was in the pre-wait state which inevitably leads to epoch + // increment. + newstate = (state & kEpochMask) + next; + } + if (_state.compare_exchange_weak(state, newstate, + std::memory_order_acquire)) { + if (!all && waiters) return; // unblocked pre-wait thread + if ((state & kStackMask) == kStackMask) return; + Waiter* w = &_waiters[state & kStackMask]; + if (!all) w->next.store(nullptr, std::memory_order_relaxed); + _unpark(w); + return; + } + } + } + + // notify n workers + void notify_n(size_t n) { + if(n >= _waiters.size()) { + notify(true); + } + else { + for(size_t k=0; k _state; + std::vector _waiters; + + void _park(Waiter* w) { + std::unique_lock lock(w->mu); + while (w->state != Waiter::kSignaled) { + w->state = Waiter::kWaiting; + w->cv.wait(lock); + } + } + + void _unpark(Waiter* waiters) { + Waiter* next = nullptr; + for (Waiter* w = waiters; w; w = next) { + next = w->next.load(std::memory_order_relaxed); + unsigned state; + { + std::unique_lock lock(w->mu); + state = w->state; + w->state = Waiter::kSignaled; + } + // Avoid notifying if it wasn't waiting. 
+ if (state == Waiter::kWaiting) w->cv.notify_one(); + } + } + +}; + + + +} // namespace tf ------------------------------------------------------------ + diff --git a/taskflow/core/observer.hpp b/taskflow/core/observer.hpp new file mode 100644 index 0000000..4ca0166 --- /dev/null +++ b/taskflow/core/observer.hpp @@ -0,0 +1,735 @@ +#pragma once + +#include "task.hpp" +#include "worker.hpp" + +/** +@file observer.hpp +@brief observer include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// timeline data structure +// ---------------------------------------------------------------------------- + +/** +@brief default time point type of observers +*/ +using observer_stamp_t = std::chrono::time_point; + +/** +@private +*/ +struct Segment { + + std::string name; + TaskType type; + + observer_stamp_t beg; + observer_stamp_t end; + + template + auto save(Archiver& ar) const { + return ar(name, type, beg, end); + } + + template + auto load(Archiver& ar) { + return ar(name, type, beg, end); + } + + Segment() = default; + + Segment( + const std::string& n, TaskType t, observer_stamp_t b, observer_stamp_t e + ) : name {n}, type {t}, beg {b}, end {e} { + } + + auto span() const { + return end-beg; + } +}; + +/** +@private +*/ +struct Timeline { + + size_t uid; + + observer_stamp_t origin; + std::vector>> segments; + + Timeline() = default; + + Timeline(const Timeline& rhs) = delete; + Timeline(Timeline&& rhs) = default; + + Timeline& operator = (const Timeline& rhs) = delete; + Timeline& operator = (Timeline&& rhs) = default; + + template + auto save(Archiver& ar) const { + return ar(uid, origin, segments); + } + + template + auto load(Archiver& ar) { + return ar(uid, origin, segments); + } +}; + +/** +@private + */ +struct ProfileData { + + std::vector timelines; + + ProfileData() = default; + + ProfileData(const ProfileData& rhs) = delete; + ProfileData(ProfileData&& rhs) = default; + + ProfileData& operator = (const ProfileData& rhs) = delete; + ProfileData& operator = (ProfileData&&) = default; + + template + auto save(Archiver& ar) const { + return ar(timelines); + } + + template + auto load(Archiver& ar) { + return ar(timelines); + } +}; + +// ---------------------------------------------------------------------------- +// observer interface +// ---------------------------------------------------------------------------- + +/** +@class: ObserverInterface + +@brief The interface class for creating an executor observer. + +The tf::ObserverInterface class let users define custom methods to monitor +the behaviors of an executor. This is particularly useful when you want to +inspect the performance of an executor and visualize when each thread +participates in the execution of a task. +To prevent users from direct access to the internal threads and tasks, +tf::ObserverInterface provides immutable wrappers, +tf::WorkerView and tf::TaskView, over workers and tasks. + +Please refer to tf::WorkerView and tf::TaskView for details. 
+ +Example usage: + +@code{.cpp} + +struct MyObserver : public tf::ObserverInterface { + + MyObserver(const std::string& name) { + std::cout << "constructing observer " << name << '\n'; + } + + void set_up(size_t num_workers) override final { + std::cout << "setting up observer with " << num_workers << " workers\n"; + } + + void on_entry(WorkerView w, tf::TaskView tv) override final { + std::ostringstream oss; + oss << "worker " << w.id() << " ready to run " << tv.name() << '\n'; + std::cout << oss.str(); + } + + void on_exit(WorkerView w, tf::TaskView tv) override final { + std::ostringstream oss; + oss << "worker " << w.id() << " finished running " << tv.name() << '\n'; + std::cout << oss.str(); + } +}; + +tf::Taskflow taskflow; +tf::Executor executor; + +// insert tasks into taskflow +// ... + +// create a custom observer +std::shared_ptr observer = executor.make_observer("MyObserver"); + +// run the taskflow +executor.run(taskflow).wait(); +@endcode +*/ +class ObserverInterface { + + friend class Executor; + + public: + + /** + @brief virtual destructor + */ + virtual ~ObserverInterface() = default; + + /** + @brief constructor-like method to call when the executor observer is fully created + @param num_workers the number of the worker threads in the executor + */ + virtual void set_up(size_t num_workers) = 0; + + /** + @brief method to call before a worker thread executes a closure + @param w an immutable view of this worker thread + @param task_view a constant wrapper object to the task + */ + virtual void on_entry(WorkerView w, TaskView task_view) = 0; + + /** + @brief method to call after a worker thread executed a closure + @param w an immutable view of this worker thread + @param task_view a constant wrapper object to the task + */ + virtual void on_exit(WorkerView w, TaskView task_view) = 0; +}; + +// ---------------------------------------------------------------------------- +// ChromeObserver definition +// ---------------------------------------------------------------------------- + +/** +@class: ChromeObserver + +@brief observer interface based on Chrome tracing format + +A tf::ChromeObserver inherits tf::ObserverInterface and defines methods to dump +the observed thread activities into a format that can be visualized through +@ChromeTracing. + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +// insert tasks into taskflow +// ... + +// create a custom observer +std::shared_ptr observer = executor.make_observer(); + +// run the taskflow +executor.run(taskflow).wait(); + +// dump the thread activities to a chrome-tracing format. 
+observer->dump(std::cout); +@endcode +*/ +class ChromeObserver : public ObserverInterface { + + friend class Executor; + + // data structure to record each task execution + struct Segment { + + std::string name; + + observer_stamp_t beg; + observer_stamp_t end; + + Segment( + const std::string& n, + observer_stamp_t b, + observer_stamp_t e + ); + }; + + // data structure to store the entire execution timeline + struct Timeline { + observer_stamp_t origin; + std::vector> segments; + std::vector> stacks; + }; + + public: + + /** + @brief dumps the timelines into a @ChromeTracing format through + an output stream + */ + void dump(std::ostream& ostream) const; + + /** + @brief dumps the timelines into a @ChromeTracing format + */ + inline std::string dump() const; + + /** + @brief clears the timeline data + */ + inline void clear(); + + /** + @brief queries the number of tasks observed + */ + inline size_t num_tasks() const; + + private: + + inline void set_up(size_t num_workers) override final; + inline void on_entry(WorkerView w, TaskView task_view) override final; + inline void on_exit(WorkerView w, TaskView task_view) override final; + + Timeline _timeline; +}; + +// constructor +inline ChromeObserver::Segment::Segment( + const std::string& n, observer_stamp_t b, observer_stamp_t e +) : + name {n}, beg {b}, end {e} { +} + +// Procedure: set_up +inline void ChromeObserver::set_up(size_t num_workers) { + _timeline.segments.resize(num_workers); + _timeline.stacks.resize(num_workers); + + for(size_t w=0; w 0) { + break; + } + } + + os << '['; + + for(size_t w=first; w<_timeline.segments.size(); w++) { + + if(w != first && _timeline.segments[w].size() > 0) { + os << ','; + } + + for(size_t i=0; i<_timeline.segments[w].size(); i++) { + + os << '{' + << "\"cat\":\"ChromeObserver\","; + + // name field + os << "\"name\":\""; + if(_timeline.segments[w][i].name.empty()) { + os << w << '_' << i; + } + else { + os << _timeline.segments[w][i].name; + } + os << "\","; + + // segment field + os << "\"ph\":\"X\"," + << "\"pid\":1," + << "\"tid\":" << w << ',' + << "\"ts\":" << std::chrono::duration_cast( + _timeline.segments[w][i].beg - _timeline.origin + ).count() << ',' + << "\"dur\":" << std::chrono::duration_cast( + _timeline.segments[w][i].end - _timeline.segments[w][i].beg + ).count(); + + if(i != _timeline.segments[w].size() - 1) { + os << "},"; + } + else { + os << '}'; + } + } + } + os << "]\n"; +} + +// Function: dump +inline std::string ChromeObserver::dump() const { + std::ostringstream oss; + dump(oss); + return oss.str(); +} + +// Function: num_tasks +inline size_t ChromeObserver::num_tasks() const { + return std::accumulate( + _timeline.segments.begin(), _timeline.segments.end(), size_t{0}, + [](size_t sum, const auto& exe){ + return sum + exe.size(); + } + ); +} + +// ---------------------------------------------------------------------------- +// TFProfObserver definition +// ---------------------------------------------------------------------------- + +/** +@class TFProfObserver + +@brief observer interface based on the built-in taskflow profiler format + +A tf::TFProfObserver inherits tf::ObserverInterface and defines methods to dump +the observed thread activities into a format that can be visualized through +@TFProf. + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +// insert tasks into taskflow +// ... 
+ +// create a custom observer +std::shared_ptr observer = executor.make_observer(); + +// run the taskflow +executor.run(taskflow).wait(); + +// dump the thread activities to Taskflow Profiler format. +observer->dump(std::cout); +@endcode + +We recommend using our @TFProf python script to observe thread activities +instead of the raw function call. +The script will turn on environment variables needed for observing all executors +in a taskflow program and dump the result to a valid, clean JSON file +compatible with the format of @TFProf. +*/ +class TFProfObserver : public ObserverInterface { + + friend class Executor; + friend class TFProfManager; + + public: + + /** + @brief dumps the timelines into a @TFProf format through + an output stream + */ + void dump(std::ostream& ostream) const; + + /** + @brief dumps the timelines into a JSON string + */ + std::string dump() const; + + /** + @brief clears the timeline data + */ + void clear(); + + /** + @brief queries the number of tasks observed + */ + size_t num_tasks() const; + + private: + + Timeline _timeline; + + std::vector> _stacks; + + inline void set_up(size_t num_workers) override final; + inline void on_entry(WorkerView, TaskView) override final; + inline void on_exit(WorkerView, TaskView) override final; +}; + +// Procedure: set_up +inline void TFProfObserver::set_up(size_t num_workers) { + _timeline.uid = unique_id(); + _timeline.origin = observer_stamp_t::clock::now(); + _timeline.segments.resize(num_workers); + _stacks.resize(num_workers); +} + +// Procedure: on_entry +inline void TFProfObserver::on_entry(WorkerView wv, TaskView) { + _stacks[wv.id()].push(observer_stamp_t::clock::now()); +} + +// Procedure: on_exit +inline void TFProfObserver::on_exit(WorkerView wv, TaskView tv) { + + size_t w = wv.id(); + + assert(!_stacks[w].empty()); + + if(_stacks[w].size() > _timeline.segments[w].size()) { + _timeline.segments[w].resize(_stacks[w].size()); + } + + auto beg = _stacks[w].top(); + _stacks[w].pop(); + + _timeline.segments[w][_stacks[w].size()].emplace_back( + tv.name(), tv.type(), beg, observer_stamp_t::clock::now() + ); +} + +// Function: clear +inline void TFProfObserver::clear() { + for(size_t w=0; w<_timeline.segments.size(); ++w) { + for(size_t l=0; l<_timeline.segments[w].size(); ++l) { + _timeline.segments[w][l].clear(); + } + while(!_stacks[w].empty()) { + _stacks[w].pop(); + } + } +} + +// Procedure: dump +inline void TFProfObserver::dump(std::ostream& os) const { + + size_t first; + + for(first = 0; first<_timeline.segments.size(); ++first) { + if(_timeline.segments[first].size() > 0) { + break; + } + } + + // not timeline data to dump + if(first == _timeline.segments.size()) { + os << "{}\n"; + return; + } + + os << "{\"executor\":\"" << _timeline.uid << "\",\"data\":["; + + bool comma = false; + + for(size_t w=first; w<_timeline.segments.size(); w++) { + for(size_t l=0; l<_timeline.segments[w].size(); l++) { + + if(_timeline.segments[w][l].empty()) { + continue; + } + + if(comma) { + os << ','; + } + else { + comma = true; + } + + os << "{\"worker\":" << w << ",\"level\":" << l << ",\"data\":["; + for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) { + + const auto& s = _timeline.segments[w][l][i]; + + if(i) os << ','; + + // span + os << "{\"span\":[" + << std::chrono::duration_cast( + s.beg - _timeline.origin + ).count() << "," + << std::chrono::duration_cast( + s.end - _timeline.origin + ).count() << "],"; + + // name + os << "\"name\":\""; + if(s.name.empty()) { + os << w << '_' << i; + } + else { + os << 
s.name; + } + os << "\","; + + // category "type": "Condition Task", + os << "\"type\":\"" << to_string(s.type) << "\""; + + os << "}"; + } + os << "]}"; + } + } + + os << "]}\n"; +} + +// Function: dump +inline std::string TFProfObserver::dump() const { + std::ostringstream oss; + dump(oss); + return oss.str(); +} + +// Function: num_tasks +inline size_t TFProfObserver::num_tasks() const { + return std::accumulate( + _timeline.segments.begin(), _timeline.segments.end(), size_t{0}, + [](size_t sum, const auto& exe){ + return sum + exe.size(); + } + ); +} + +// ---------------------------------------------------------------------------- +// TFProfManager +// ---------------------------------------------------------------------------- + +/** +@private +*/ +class TFProfManager { + + friend class Executor; + + public: + + ~TFProfManager(); + + TFProfManager(const TFProfManager&) = delete; + TFProfManager& operator=(const TFProfManager&) = delete; + + static TFProfManager& get(); + + void dump(std::ostream& ostream) const; + + private: + + const std::string _fpath; + + std::mutex _mutex; + std::vector> _observers; + + TFProfManager(); + + void _manage(std::shared_ptr observer); +}; + +// constructor +inline TFProfManager::TFProfManager() : + _fpath {get_env(TF_ENABLE_PROFILER)} { + +} + +// Procedure: manage +inline void TFProfManager::_manage(std::shared_ptr observer) { + std::lock_guard lock(_mutex); + _observers.push_back(std::move(observer)); +} + +// Procedure: dump +inline void TFProfManager::dump(std::ostream& os) const { + for(size_t i=0; i<_observers.size(); ++i) { + if(i) os << ','; + _observers[i]->dump(os); + } +} + +// Destructor +inline TFProfManager::~TFProfManager() { + std::ofstream ofs(_fpath); + if(ofs) { + // .tfp + if(_fpath.rfind(".tfp") != std::string::npos) { + ProfileData data; + data.timelines.reserve(_observers.size()); + for(size_t i=0; i<_observers.size(); ++i) { + data.timelines.push_back(std::move(_observers[i]->_timeline)); + } + Serializer serializer(ofs); + serializer(data); + } + // .json + else { + ofs << "[\n"; + for(size_t i=0; i<_observers.size(); ++i) { + if(i) ofs << ','; + _observers[i]->dump(ofs); + } + ofs << "]\n"; + } + } +} + +// Function: get +inline TFProfManager& TFProfManager::get() { + static TFProfManager mgr; + return mgr; +} + +// ---------------------------------------------------------------------------- +// Identifier for Each Built-in Observer +// ---------------------------------------------------------------------------- + +/** @enum ObserverType + +@brief enumeration of all observer types + +*/ +enum class ObserverType : int { + TFPROF = 0, + CHROME, + UNDEFINED +}; + +/** +@brief convert an observer type to a human-readable string +*/ +inline const char* to_string(ObserverType type) { + switch(type) { + case ObserverType::TFPROF: return "tfprof"; + case ObserverType::CHROME: return "chrome"; + default: return "undefined"; + } +} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/taskflow/core/semaphore.hpp b/taskflow/core/semaphore.hpp new file mode 100644 index 0000000..75d49be --- /dev/null +++ b/taskflow/core/semaphore.hpp @@ -0,0 +1,125 @@ +#pragma once + +#include +#include + +#include "declarations.hpp" + +/** +@file semaphore.hpp +@brief semaphore include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Semaphore +// ---------------------------------------------------------------------------- + +/** +@class 
Semaphore + +@brief class to create a semophore object for building a concurrency constraint + +A semaphore creates a constraint that limits the maximum concurrency, +i.e., the number of workers, in a set of tasks. +You can let a task acquire/release one or multiple semaphores before/after +executing its work. +A task can acquire and release a semaphore, +or just acquire or just release it. +A tf::Semaphore object starts with an initial count. +As long as that count is above 0, tasks can acquire the semaphore and do +their work. +If the count is 0 or less, a task trying to acquire the semaphore will not run +but goes to a waiting list of that semaphore. +When the semaphore is released by another task, +it reschedules all tasks on that waiting list. + +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; + +tf::Semaphore semaphore(1); // create a semaphore with initial count 1 + +std::vector tasks { + taskflow.emplace([](){ std::cout << "A" << std::endl; }), + taskflow.emplace([](){ std::cout << "B" << std::endl; }), + taskflow.emplace([](){ std::cout << "C" << std::endl; }), + taskflow.emplace([](){ std::cout << "D" << std::endl; }), + taskflow.emplace([](){ std::cout << "E" << std::endl; }) +}; + +for(auto & task : tasks) { // each task acquires and release the semaphore + task.acquire(semaphore); + task.release(semaphore); +} + +executor.run(taskflow).wait(); +@endcode + +The above example creates five tasks with no dependencies between them. +Under normal circumstances, the five tasks would be executed concurrently. +However, this example has a semaphore with initial count 1, +and all tasks need to acquire that semaphore before running and release that +semaphore after they are done. +This organization limits the number of concurrently running tasks to only one. + +*/ +class Semaphore { + + friend class Node; + + public: + + /** + @brief constructs a semaphore with the given counter + */ + explicit Semaphore(int max_workers); + + /** + @brief queries the counter value (not thread-safe during the run) + */ + int count() const; + + private: + + std::mutex _mtx; + + int _counter; + + std::vector _waiters; + + bool _try_acquire_or_wait(Node*); + + std::vector _release(); +}; + +inline Semaphore::Semaphore(int max_workers) : + _counter(max_workers) { +} + +inline bool Semaphore::_try_acquire_or_wait(Node* me) { + std::lock_guard lock(_mtx); + if(_counter > 0) { + --_counter; + return true; + } + else { + _waiters.push_back(me); + return false; + } +} + +inline std::vector Semaphore::_release() { + std::lock_guard lock(_mtx); + ++_counter; + std::vector r{std::move(_waiters)}; + return r; +} + +inline int Semaphore::count() const { + return _counter; +} + +} // end of namespace tf. 
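+
+/* The documentation above notes that a task may also just acquire or just
+   release a semaphore. A minimal sketch of that pattern, where an acquiring
+   task and a later releasing task bracket a chain so it runs exclusively
+   (the taskflow object and task names are illustrative):
+
+   @code{.cpp}
+   tf::Semaphore semaphore(1);
+
+   tf::Task begin = taskflow.emplace([](){ std::cout << "enter\n"; });
+   tf::Task end   = taskflow.emplace([](){ std::cout << "leave\n"; });
+   begin.precede(end);
+
+   begin.acquire(semaphore);  // this task only acquires
+   end.release(semaphore);    // this task only releases
+   @endcode
+*/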
--------------------------------------------------- + diff --git a/taskflow/core/task.hpp b/taskflow/core/task.hpp new file mode 100644 index 0000000..2cc4621 --- /dev/null +++ b/taskflow/core/task.hpp @@ -0,0 +1,664 @@ +#pragma once + +#include "graph.hpp" + +/** +@file task.hpp +@brief task include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Task Types +// ---------------------------------------------------------------------------- + +/** +@enum TaskType + +@brief enumeration of all task types +*/ +enum class TaskType : int { + PLACEHOLDER = 0, + CUDAFLOW, + STATIC, + DYNAMIC, + CONDITION, + MODULE, + ASYNC, + UNDEFINED +}; + +/** +@brief array of all task types (used for iterating task types) +*/ +inline constexpr std::array TASK_TYPES = { + TaskType::PLACEHOLDER, + TaskType::CUDAFLOW, + TaskType::STATIC, + TaskType::DYNAMIC, + TaskType::CONDITION, + TaskType::MODULE, + TaskType::ASYNC +}; + +/** +@brief convert a task type to a human-readable string +*/ +inline const char* to_string(TaskType type) { + + const char* val; + + switch(type) { + case TaskType::PLACEHOLDER: val = "placeholder"; break; + case TaskType::CUDAFLOW: val = "cudaflow"; break; + case TaskType::STATIC: val = "static"; break; + case TaskType::DYNAMIC: val = "subflow"; break; + case TaskType::CONDITION: val = "condition"; break; + case TaskType::MODULE: val = "module"; break; + case TaskType::ASYNC: val = "async"; break; + default: val = "undefined"; break; + } + + return val; +} + +// ---------------------------------------------------------------------------- +// Task Traits +// ---------------------------------------------------------------------------- + +/** +@brief determines if a callable is a static task + +A static task is a callable object constructible from std::function. +*/ +template +constexpr bool is_static_task_v = std::is_invocable_r_v && + !std::is_invocable_r_v; + +/** +@brief determines if a callable is a dynamic task + +A dynamic task is a callable object constructible from std::function. +*/ +template +constexpr bool is_dynamic_task_v = std::is_invocable_r_v; + +/** +@brief determines if a callable is a condition task + +A condition task is a callable object constructible from std::function. +*/ +template +constexpr bool is_condition_task_v = std::is_invocable_r_v; + +/** +@brief determines if a callable is a cudaflow task + +A cudaFlow task is a callable object constructible from +std::function or std::function. +*/ +template +constexpr bool is_cudaflow_task_v = std::is_invocable_r_v || + std::is_invocable_r_v; + +// ---------------------------------------------------------------------------- +// Task +// ---------------------------------------------------------------------------- + +/** +@class Task + +@brief handle to a node in a task dependency graph + +A Task is handle to manipulate a node in a taskflow graph. +It provides a set of methods for users to access and modify the attributes of +the associated graph node without directly touching internal node data. 
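+
+A brief sketch of manipulating tasks through this handle (the taskflow object
+and task names are illustrative):
+
+@code{.cpp}
+tf::Taskflow taskflow;
+
+// a static task: the callable takes no argument and returns nothing
+tf::Task init = taskflow.emplace([](){ std::cout << "init\n"; });
+
+// a condition task: the callable returns an int selecting which successor runs next
+tf::Task cond = taskflow.emplace([](){ return 0; });
+
+init.name("init");    // assign a human-readable name
+init.precede(cond);   // init runs before cond
+
+// cond.type() == tf::TaskType::CONDITION
+@endcode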
+ +*/ +class Task { + + friend class FlowBuilder; + friend class Taskflow; + friend class TaskView; + + public: + + /** + @brief constructs an empty task + */ + Task() = default; + + /** + @brief constructs the task with the copy of the other task + */ + Task(const Task& other); + + /** + @brief replaces the contents with a copy of the other task + */ + Task& operator = (const Task&); + + /** + @brief replaces the contents with a null pointer + */ + Task& operator = (std::nullptr_t); + + /** + @brief compares if two tasks are associated with the same graph node + */ + bool operator == (const Task& rhs) const; + + /** + @brief compares if two tasks are not associated with the same graph node + */ + bool operator != (const Task& rhs) const; + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors of the task + */ + size_t num_successors() const; + + /** + @brief queries the number of predecessors of the task + */ + size_t num_dependents() const; + + /** + @brief queries the number of strong dependents of the task + */ + size_t num_strong_dependents() const; + + /** + @brief queries the number of weak dependents of the task + */ + size_t num_weak_dependents() const; + + /** + @brief assigns a name to the task + + @param name a @std_string acceptable string + + @return @c *this + */ + Task& name(const std::string& name); + + /** + @brief assigns a callable + + @tparam C callable type + + @param callable callable to construct one of the static, dynamic, condition, and cudaFlow tasks + + @return @c *this + */ + template + Task& work(C&& callable); + + /** + @brief creates a module task from a taskflow + + @param taskflow a taskflow object for the module + + @return @c *this + */ + Task& composed_of(Taskflow& taskflow); + + /** + @brief adds precedence links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template + Task& precede(Ts&&... tasks); + + /** + @brief adds precedence links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template + Task& succeed(Ts&&... 
tasks); + + /** + @brief makes the task release this semaphore + */ + Task& release(Semaphore& semaphore); + + /** + @brief makes the task acquire this semaphore + */ + Task& acquire(Semaphore& semaphore); + + /** + @brief resets the task handle to null + */ + void reset(); + + /** + @brief resets the associated work to a placeholder + */ + void reset_work(); + + /** + @brief queries if the task handle points to a task node + */ + bool empty() const; + + /** + @brief queries if the task has a work assigned + */ + bool has_work() const; + + /** + @brief applies an visitor callable to each successor of the task + */ + template + void for_each_successor(V&& visitor) const; + + /** + @brief applies an visitor callable to each dependents of the task + */ + template + void for_each_dependent(V&& visitor) const; + + /** + @brief obtains a hash value of the underlying node + */ + size_t hash_value() const; + + /** + @brief returns the task type + */ + TaskType type() const; + + /** + @brief dumps the task through an output stream + */ + void dump(std::ostream& ostream) const; + + private: + + Task(Node*); + + Node* _node {nullptr}; +}; + +// Constructor +inline Task::Task(Node* node) : _node {node} { +} + +// Constructor +inline Task::Task(const Task& rhs) : _node {rhs._node} { +} + +// Function: precede +template +Task& Task::precede(Ts&&... tasks) { + (_node->_precede(tasks._node), ...); + //_precede(std::forward(tasks)...); + return *this; +} + +// Function: succeed +template +Task& Task::succeed(Ts&&... tasks) { + (tasks._node->_precede(_node), ...); + //_succeed(std::forward(tasks)...); + return *this; +} + +// Function: composed_of +inline Task& Task::composed_of(Taskflow& tf) { + _node->_handle.emplace(&tf); + return *this; +} + +// Operator = +inline Task& Task::operator = (const Task& rhs) { + _node = rhs._node; + return *this; +} + +// Operator = +inline Task& Task::operator = (std::nullptr_t ptr) { + _node = ptr; + return *this; +} + +// Operator == +inline bool Task::operator == (const Task& rhs) const { + return _node == rhs._node; +} + +// Operator != +inline bool Task::operator != (const Task& rhs) const { + return _node != rhs._node; +} + +// Function: name +inline Task& Task::name(const std::string& name) { + _node->_name = name; + return *this; +} + +// Function: acquire +inline Task& Task::acquire(Semaphore& s) { + if(!_node->_semaphores) { + //_node->_semaphores.emplace(); + _node->_semaphores = std::make_unique(); + } + _node->_semaphores->to_acquire.push_back(&s); + return *this; +} + +// Function: release +inline Task& Task::release(Semaphore& s) { + if(!_node->_semaphores) { + //_node->_semaphores.emplace(); + _node->_semaphores = std::make_unique(); + } + _node->_semaphores->to_release.push_back(&s); + return *this; +} + +// Procedure: reset +inline void Task::reset() { + _node = nullptr; +} + +// Procedure: reset_work +inline void Task::reset_work() { + _node->_handle.emplace(); +} + +// Function: name +inline const std::string& Task::name() const { + return _node->_name; +} + +// Function: num_dependents +inline size_t Task::num_dependents() const { + return _node->num_dependents(); +} + +// Function: num_strong_dependents +inline size_t Task::num_strong_dependents() const { + return _node->num_strong_dependents(); +} + +// Function: num_weak_dependents +inline size_t Task::num_weak_dependents() const { + return _node->num_weak_dependents(); +} + +// Function: num_successors +inline size_t Task::num_successors() const { + return _node->num_successors(); +} + +// 
Function: empty +inline bool Task::empty() const { + return _node == nullptr; +} + +// Function: has_work +inline bool Task::has_work() const { + return _node ? _node->_handle.index() != 0 : false; +} + +// Function: task_type +inline TaskType Task::type() const { + switch(_node->_handle.index()) { + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::SILENT_ASYNC: return TaskType::ASYNC; + case Node::CUDAFLOW: return TaskType::CUDAFLOW; + default: return TaskType::UNDEFINED; + } +} + +// Function: for_each_successor +template +void Task::for_each_successor(V&& visitor) const { + for(size_t i=0; i<_node->_successors.size(); ++i) { + visitor(Task(_node->_successors[i])); + } +} + +// Function: for_each_dependent +template +void Task::for_each_dependent(V&& visitor) const { + for(size_t i=0; i<_node->_dependents.size(); ++i) { + visitor(Task(_node->_dependents[i])); + } +} + +// Function: hash_value +inline size_t Task::hash_value() const { + return std::hash{}(_node); +} + +// Procedure: dump +inline void Task::dump(std::ostream& os) const { + os << "task "; + if(name().empty()) os << _node; + else os << name(); + os << " [type=" << to_string(type()) << ']'; +} + +// Function: work +template +Task& Task::work(C&& c) { + if constexpr(is_static_task_v) { + _node->_handle.emplace(std::forward(c)); + } + else if constexpr(is_dynamic_task_v) { + _node->_handle.emplace(std::forward(c)); + } + else if constexpr(is_condition_task_v) { + _node->_handle.emplace(std::forward(c)); + } + else if constexpr(is_cudaflow_task_v) { + _node->_handle.emplace(std::forward(c)); + } + else { + static_assert(dependent_false_v, "invalid task callable"); + } + return *this; +} + +// ---------------------------------------------------------------------------- +// global ostream +// ---------------------------------------------------------------------------- + +/** +@brief overload of ostream inserter operator for cudaTask +*/ +inline std::ostream& operator << (std::ostream& os, const Task& task) { + task.dump(os); + return os; +} + +// ---------------------------------------------------------------------------- + +/** +@class TaskView + +@brief class to access task information from the observer interface +*/ +class TaskView { + + friend class Executor; + + public: + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors of the task + */ + size_t num_successors() const; + + /** + @brief queries the number of predecessors of the task + */ + size_t num_dependents() const; + + /** + @brief queries the number of strong dependents of the task + */ + size_t num_strong_dependents() const; + + /** + @brief queries the number of weak dependents of the task + */ + size_t num_weak_dependents() const; + + /** + @brief applies an visitor callable to each successor of the task + */ + template + void for_each_successor(V&& visitor) const; + + /** + @brief applies an visitor callable to each dependents of the task + */ + template + void for_each_dependent(V&& visitor) const; + + /** + @brief queries the task type + */ + TaskType type() const; + + /** + @brief obtains a hash value of the underlying node + */ + size_t hash_value() const; + + private: + + TaskView(const Node&); + TaskView(const TaskView&) = 
default; + + const Node& _node; +}; + +// Constructor +inline TaskView::TaskView(const Node& node) : _node {node} { +} + +// Function: name +inline const std::string& TaskView::name() const { + return _node._name; +} + +// Function: num_dependents +inline size_t TaskView::num_dependents() const { + return _node.num_dependents(); +} + +// Function: num_strong_dependents +inline size_t TaskView::num_strong_dependents() const { + return _node.num_strong_dependents(); +} + +// Function: num_weak_dependents +inline size_t TaskView::num_weak_dependents() const { + return _node.num_weak_dependents(); +} + +// Function: num_successors +inline size_t TaskView::num_successors() const { + return _node.num_successors(); +} + +// Function: type +inline TaskType TaskView::type() const { + switch(_node._handle.index()) { + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::SILENT_ASYNC: return TaskType::ASYNC; + case Node::CUDAFLOW: return TaskType::CUDAFLOW; + default: return TaskType::UNDEFINED; + } +} + +// Function: hash_value +inline size_t TaskView::hash_value() const { + return std::hash{}(&_node); +} + +// Function: for_each_successor +template +void TaskView::for_each_successor(V&& visitor) const { + for(size_t i=0; i<_node._successors.size(); ++i) { + visitor(TaskView(_node._successors[i])); + } +} + +// Function: for_each_dependent +template +void TaskView::for_each_dependent(V&& visitor) const { + for(size_t i=0; i<_node._dependents.size(); ++i) { + visitor(TaskView(_node._dependents[i])); + } +} + +} // end of namespace tf. --------------------------------------------------- + +namespace std { + +/** +@struct hash + +@brief hash specialization for std::hash +*/ +template <> +struct hash { + auto operator() (const tf::Task& task) const noexcept { + return task.hash_value(); + } +}; + +/** +@struct hash + +@brief hash specialization for std::hash +*/ +template <> +struct hash { + auto operator() (const tf::TaskView& task_view) const noexcept { + return task_view.hash_value(); + } +}; + +} // end of namespace std ---------------------------------------------------- + + + diff --git a/taskflow/core/taskflow.hpp b/taskflow/core/taskflow.hpp new file mode 100644 index 0000000..00b26f3 --- /dev/null +++ b/taskflow/core/taskflow.hpp @@ -0,0 +1,478 @@ +#pragma once + +#include "flow_builder.hpp" + +/** +@file core/taskflow.hpp +@brief taskflow include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- + +/** +@class Taskflow + +@brief main entry to create a task dependency graph + +A %taskflow manages a task dependency graph where each task represents a +callable object (e.g., @std_lambda, @std_function) and an edge represents a +dependency between two tasks. A task is one of the following types: + + 1. static task: the callable constructible from + @c std::function + 2. dynamic task: the callable constructible from + @c std::function + 3. condition task: the callable constructible from + @c std::function + 4. module task: the task constructed from tf::Taskflow::composed_of + 5. %cudaFlow task: the callable constructible from + @c std::function or + @c std::function + +Each task is a basic computation unit and is run by one worker thread +from an executor. 
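+
+A module task, for instance, lets one taskflow reuse another through
+tf::Taskflow::composed_of. A minimal sketch (both taskflow objects below are
+illustrative):
+
+@code{.cpp}
+tf::Taskflow core, top;
+
+core.emplace([](){ std::cout << "core work\n"; });
+
+// wrap the whole taskflow 'core' as a single module task of 'top';
+// 'core' must remain alive while 'top' is running
+tf::Task m = top.composed_of(core);
+m.name("core-module");
+@endcode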
+The following example creates a simple taskflow graph of four static tasks, +@c A, @c B, @c C, and @c D, where +@c A runs before @c B and @c C and +@c D runs after @c B and @c C. + +@code{.cpp} +tf::Executor executor; +tf::Taskflow taskflow("simple"); + +tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; }); +tf::Task B = taskflow.emplace([](){ std::cout << "TaskB\n"; }); +tf::Task C = taskflow.emplace([](){ std::cout << "TaskC\n"; }); +tf::Task D = taskflow.emplace([](){ std::cout << "TaskD\n"; }); + +A.precede(B, C); // A runs before B and C +D.succeed(B, C); // D runs after B and C + +executor.run(taskflow).wait(); +@endcode + +Please refer to @ref Cookbook to learn more about each task type +and how to submit a taskflow to an executor. +*/ +class Taskflow : public FlowBuilder { + + friend class Topology; + friend class Executor; + friend class FlowBuilder; + + struct Dumper { + std::stack stack; + std::unordered_set visited; + }; + + public: + + /** + @brief constructs a taskflow with the given name + */ + Taskflow(const std::string& name); + + /** + @brief constructs a taskflow + */ + Taskflow(); + + /** + @brief default destructor + + When the destructor is called, all tasks and their associated data + (e.g., captured data) will be destroyed. + It is your responsibility to ensure all submitted execution of this + taskflow have completed before destroying it. + */ + ~Taskflow() = default; + + /** + @brief dumps the taskflow to a DOT format through a std::ostream target + */ + void dump(std::ostream& ostream) const; + + /** + @brief dumps the taskflow to a std::string of DOT format + */ + std::string dump() const; + + /** + @brief queries the number of tasks + */ + size_t num_tasks() const; + + /** + @brief queries the emptiness of the taskflow + */ + bool empty() const; + + /** + @brief assigns a name to the taskflow + */ + void name(const std::string&); + + /** + @brief queries the name of the taskflow + */ + const std::string& name() const ; + + /** + @brief clears the associated task dependency graph + + When you clear a taskflow, all tasks and their associated data + (e.g., captured data) will be destroyed. + You should never clean a taskflow while it is being run by an executor. + */ + void clear(); + + /** + @brief applies a visitor to each task in the taskflow + + A visitor is a callable that takes an argument of type tf::Task + and returns nothing. 
The following example iterates each task in a + taskflow and prints its name: + + @code{.cpp} + taskflow.for_each_task([](tf::Task task){ + std::cout << task.name() << '\n'; + }); + @endcode + */ + template + void for_each_task(V&& visitor) const; + + private: + + std::string _name; + + Graph _graph; + + std::mutex _mtx; + + std::queue> _topologies; + + void _dump(std::ostream&, const Taskflow*) const; + void _dump(std::ostream&, const Node*, Dumper&) const; + void _dump(std::ostream&, const Graph&, Dumper&) const; +}; + +// Constructor +inline Taskflow::Taskflow(const std::string& name) : + FlowBuilder {_graph}, + _name {name} { +} + +// Constructor +inline Taskflow::Taskflow() : FlowBuilder{_graph} { +} + +// Procedure: +inline void Taskflow::clear() { + _graph.clear(); +} + +// Function: num_tasks +inline size_t Taskflow::num_tasks() const { + return _graph.size(); +} + +// Function: empty +inline bool Taskflow::empty() const { + return _graph.empty(); +} + +// Function: name +inline void Taskflow::name(const std::string &name) { + _name = name; +} + +// Function: name +inline const std::string& Taskflow::name() const { + return _name; +} + +// Function: for_each_task +template +void Taskflow::for_each_task(V&& visitor) const { + for(size_t i=0; i<_graph._nodes.size(); ++i) { + visitor(Task(_graph._nodes[i])); + } +} + +// Procedure: dump +inline std::string Taskflow::dump() const { + std::ostringstream oss; + dump(oss); + return oss.str(); +} + +// Function: dump +inline void Taskflow::dump(std::ostream& os) const { + os << "digraph Taskflow {\n"; + _dump(os, this); + os << "}\n"; +} + +// Procedure: _dump +inline void Taskflow::_dump(std::ostream& os, const Taskflow* top) const { + + Dumper dumper; + + dumper.stack.push(top); + dumper.visited.insert(top); + + while(!dumper.stack.empty()) { + + auto f = dumper.stack.top(); + dumper.stack.pop(); + + os << "subgraph cluster_p" << f << " {\nlabel=\"Taskflow: "; + if(f->_name.empty()) os << 'p' << f; + else os << f->_name; + os << "\";\n"; + _dump(os, f->_graph, dumper); + os << "}\n"; + } +} + +// Procedure: _dump +inline void Taskflow::_dump( + std::ostream& os, const Node* node, Dumper& dumper +) const { + + os << 'p' << node << "[label=\""; + if(node->_name.empty()) os << 'p' << node; + else os << node->_name; + os << "\" "; + + // shape for node + switch(node->_handle.index()) { + + case Node::CONDITION: + os << "shape=diamond color=black fillcolor=aquamarine style=filled"; + break; + + case Node::CUDAFLOW: + os << " style=\"filled\"" + << " color=\"black\" fillcolor=\"purple\"" + << " fontcolor=\"white\"" + << " shape=\"folder\""; + break; + + default: + break; + } + + os << "];\n"; + + for(size_t s=0; s_successors.size(); ++s) { + if(node->_handle.index() == Node::CONDITION) { + // case edge is dashed + os << 'p' << node << " -> p" << node->_successors[s] + << " [style=dashed label=\"" << s << "\"];\n"; + } + else { + os << 'p' << node << " -> p" << node->_successors[s] << ";\n"; + } + } + + // subflow join node + if(node->_parent && node->_successors.size() == 0) { + os << 'p' << node << " -> p" << node->_parent << ";\n"; + } + + switch(node->_handle.index()) { + + case Node::DYNAMIC: { + auto& sbg = std::get(node->_handle).subgraph; + if(!sbg.empty()) { + os << "subgraph cluster_p" << node << " {\nlabel=\"Subflow: "; + if(node->_name.empty()) os << 'p' << node; + else os << node->_name; + + os << "\";\n" << "color=blue\n"; + _dump(os, sbg, dumper); + os << "}\n"; + } + } + break; + + case Node::CUDAFLOW: { + 
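      // a cudaFlow node carries its own internal GPU graph; delegate the DOT dump
      // of that graph to it, passing the output stream, this node, and its name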
std::get(node->_handle).graph->dump( + os, node, node->_name + ); + } + break; + + default: + break; + } +} + +// Procedure: _dump +inline void Taskflow::_dump( + std::ostream& os, const Graph& graph, Dumper& dumper +) const { + + for(const auto& n : graph._nodes) { + + // regular task + if(n->_handle.index() != Node::MODULE) { + _dump(os, n, dumper); + } + // module task + else { + + auto module = std::get(n->_handle).module; + + os << 'p' << n << "[shape=box3d, color=blue, label=\""; + if(n->_name.empty()) os << n; + else os << n->_name; + os << " [Taskflow: "; + if(module->_name.empty()) os << 'p' << module; + else os << module->_name; + os << "]\"];\n"; + + if(dumper.visited.find(module) == dumper.visited.end()) { + dumper.visited.insert(module); + dumper.stack.push(module); + } + + for(const auto s : n->_successors) { + os << 'p' << n << "->" << 'p' << s << ";\n"; + } + } + } +} + +// ---------------------------------------------------------------------------- +// class definition: Future +// ---------------------------------------------------------------------------- + +/** +@class Future + +@brief class to access the result of task execution + +tf::Future is a derived class from std::future that will eventually hold the +execution result of a submitted taskflow (e.g., tf::Executor::run) +or an asynchronous task (e.g., tf::Executor::async). +In addition to base methods of std::future, +you can call tf::Future::cancel to cancel the execution of the running taskflow +associated with this future object. +The following example cancels a submission of a taskflow that contains +1000 tasks each running one second. + +@code{.cpp} +tf::Executor executor; +tf::Taskflow taskflow; + +for(int i=0; i<1000; i++) { + taskflow.emplace([](){ + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); +} + +// submit the taskflow +tf::Future fu = executor.run(taskflow); + +// request to cancel the submitted execution above +fu.cancel(); + +// wait until the cancellation finishes +fu.get(); +@endcode +*/ +template +class Future : public std::future { + + friend class Executor; + friend class Subflow; + + using handle_t = std::variant< + std::monostate, std::weak_ptr, std::weak_ptr + >; + + // variant index + constexpr static auto ASYNC = get_index_v, handle_t>; + constexpr static auto TASKFLOW = get_index_v, handle_t>; + + public: + + /** + @brief default constructor + */ + Future() = default; + + /** + @brief disabled copy constructor + */ + Future(const Future&) = delete; + + /** + @brief default move constructor + */ + Future(Future&&) = default; + + /** + @brief disabled copy assignment + */ + Future& operator = (const Future&) = delete; + + /** + @brief default move assignment + */ + Future& operator = (Future&&) = default; + + /** + @brief cancels the execution of the running taskflow associated with + this future object + + @return @c true if the execution can be cancelled or + @c false if the execution has already completed + */ + bool cancel(); + + private: + + handle_t _handle; + + template + Future(std::future&&, P&&); +}; + +template +template +Future::Future(std::future&& fu, P&& p) : + std::future {std::move(fu)}, + _handle {std::forward
<P>
(p)} { +} + +// Function: cancel +template +bool Future::cancel() { + return std::visit([](auto&& arg){ + using P = std::decay_t; + if constexpr(std::is_same_v) { + return false; + } + else { + auto ptr = arg.lock(); + if(ptr) { + ptr->_is_cancelled = true; + return true; + } + return false; + } + }, _handle); +} + + +} // end of namespace tf. --------------------------------------------------- + + + + diff --git a/taskflow/core/topology.hpp b/taskflow/core/topology.hpp new file mode 100644 index 0000000..a9b8e51 --- /dev/null +++ b/taskflow/core/topology.hpp @@ -0,0 +1,61 @@ +#pragma once + +namespace tf { + +// ---------------------------------------------------------------------------- + +// class: TopologyBase +class TopologyBase { + + friend class Executor; + friend class Node; + + template + friend class Future; + + protected: + + bool _is_cancelled { false }; +}; + +// ---------------------------------------------------------------------------- + +// class: AsyncTopology +class AsyncTopology : public TopologyBase { +}; + +// ---------------------------------------------------------------------------- + +// class: Topology +class Topology : public TopologyBase { + + friend class Executor; + + public: + + template + Topology(Taskflow&, P&&, C&&); + + private: + + Taskflow& _taskflow; + + std::promise _promise; + + std::vector _sources; + + std::function _pred; + std::function _call; + + std::atomic _join_counter {0}; +}; + +// Constructor +template +Topology::Topology(Taskflow& tf, P&& p, C&& c): + _taskflow(tf), + _pred {std::forward
<P>
(p)}, + _call {std::forward(c)} { +} + +} // end of namespace tf. ---------------------------------------------------- diff --git a/taskflow/core/tsq.hpp b/taskflow/core/tsq.hpp new file mode 100644 index 0000000..0a13630 --- /dev/null +++ b/taskflow/core/tsq.hpp @@ -0,0 +1,247 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace tf { + +/** +@class: TaskQueue + +@tparam T data type (must be a pointer) + +@brief Lock-free unbounded single-producer multiple-consumer queue. + +This class implements the work stealing queue described in the paper, +"Correct and Efficient Work-Stealing for Weak Memory Models," +available at https://www.di.ens.fr/~zappa/readings/ppopp13.pdf. + +Only the queue owner can perform pop and push operations, +while others can steal data from the queue. +*/ +template +class TaskQueue { + + static_assert(std::is_pointer_v, "T must be a pointer type"); + + struct Array { + + int64_t C; + int64_t M; + std::atomic* S; + + explicit Array(int64_t c) : + C {c}, + M {c-1}, + S {new std::atomic[static_cast(C)]} { + } + + ~Array() { + delete [] S; + } + + int64_t capacity() const noexcept { + return C; + } + + template + void push(int64_t i, O&& o) noexcept { + S[i & M].store(std::forward(o), std::memory_order_relaxed); + } + + T pop(int64_t i) noexcept { + return S[i & M].load(std::memory_order_relaxed); + } + + Array* resize(int64_t b, int64_t t) { + Array* ptr = new Array {2*C}; + for(int64_t i=t; i!=b; ++i) { + ptr->push(i, pop(i)); + } + return ptr; + } + + }; + + std::atomic _top; + std::atomic _bottom; + std::atomic _array; + std::vector _garbage; + + public: + + /** + @brief constructs the queue with a given capacity + + @param capacity the capacity of the queue (must be power of 2) + */ + explicit TaskQueue(int64_t capacity = 1024); + + /** + @brief destructs the queue + */ + ~TaskQueue(); + + /** + @brief queries if the queue is empty at the time of this call + */ + bool empty() const noexcept; + + /** + @brief queries the number of items at the time of this call + */ + size_t size() const noexcept; + + /** + @brief queries the capacity of the queue + */ + int64_t capacity() const noexcept; + + /** + @brief inserts an item to the queue + + Only the owner thread can insert an item to the queue. + The operation can trigger the queue to resize its capacity + if more space is required. + + @tparam O data type + + @param item the item to perfect-forward to the queue + */ + void push(T item); + + /** + @brief pops out an item from the queue + + Only the owner thread can pop out an item from the queue. + The return can be a nullptr if this operation failed (empty queue). + */ + T pop(); + + /** + @brief steals an item from the queue + + Any threads can try to steal an item from the queue. + The return can be a nullptr if this operation failed (not necessary empty). 
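  A minimal usage sketch, assuming one owner thread that pushes and pops while a second thread steals (the element values are illustrative only):

  @code{.cpp}
  tf::TaskQueue<int*> queue;

  int data[3] = {1, 2, 3};

  // only the owner thread may push and pop
  for(auto& d : data) {
    queue.push(&d);
  }

  // another thread may concurrently steal from the opposite end
  std::thread thief([&](){
    if(int* item = queue.steal(); item != nullptr) {
      std::cout << "stole " << *item << '\n';
    }
  });

  int* mine = queue.pop();   // may be nullptr if the queue is empty

  thief.join();
  @endcode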
+ */ + T steal(); +}; + +// Constructor +template +TaskQueue::TaskQueue(int64_t c) { + assert(c && (!(c & (c-1)))); + _top.store(0, std::memory_order_relaxed); + _bottom.store(0, std::memory_order_relaxed); + _array.store(new Array{c}, std::memory_order_relaxed); + _garbage.reserve(32); +} + +// Destructor +template +TaskQueue::~TaskQueue() { + for(auto a : _garbage) { + delete a; + } + delete _array.load(); +} + +// Function: empty +template +bool TaskQueue::empty() const noexcept { + int64_t b = _bottom.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_relaxed); + return b <= t; +} + +// Function: size +template +size_t TaskQueue::size() const noexcept { + int64_t b = _bottom.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_relaxed); + return static_cast(b >= t ? b - t : 0); +} + +// Function: push +template +void TaskQueue::push(T o) { + int64_t b = _bottom.load(std::memory_order_relaxed); + int64_t t = _top.load(std::memory_order_acquire); + Array* a = _array.load(std::memory_order_relaxed); + + // queue is full + if(a->capacity() - 1 < (b - t)) { + Array* tmp = a->resize(b, t); + _garbage.push_back(a); + std::swap(a, tmp); + _array.store(a, std::memory_order_relaxed); + } + + a->push(b, o); + std::atomic_thread_fence(std::memory_order_release); + _bottom.store(b + 1, std::memory_order_relaxed); +} + +// Function: pop +template +T TaskQueue::pop() { + int64_t b = _bottom.load(std::memory_order_relaxed) - 1; + Array* a = _array.load(std::memory_order_relaxed); + _bottom.store(b, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t t = _top.load(std::memory_order_relaxed); + + T item {nullptr}; + + if(t <= b) { + item = a->pop(b); + if(t == b) { + // the last item just got stolen + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + item = nullptr; + } + _bottom.store(b + 1, std::memory_order_relaxed); + } + } + else { + _bottom.store(b + 1, std::memory_order_relaxed); + } + + return item; +} + +// Function: steal +template +T TaskQueue::steal() { + int64_t t = _top.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom.load(std::memory_order_acquire); + + T item {nullptr}; + + if(t < b) { + Array* a = _array.load(std::memory_order_consume); + item = a->pop(t); + if(!_top.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + return nullptr; + } + } + + return item; +} + +// Function: capacity +template +int64_t TaskQueue::capacity() const noexcept { + return _array.load(std::memory_order_relaxed)->capacity(); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/core/worker.hpp b/taskflow/core/worker.hpp new file mode 100644 index 0000000..61b7bc8 --- /dev/null +++ b/taskflow/core/worker.hpp @@ -0,0 +1,103 @@ +#pragma once + +#include "declarations.hpp" +#include "tsq.hpp" +#include "notifier.hpp" + +/** +@file worker.hpp +@brief worker include file +*/ + +namespace tf { + +/** +@private +*/ +struct Worker { + + friend class Executor; + friend class WorkerView; + + private: + + size_t _id; + size_t _vtm; + Executor* _executor; + Notifier::Waiter* _waiter; + std::default_random_engine _rdgen { std::random_device{}() }; + TaskQueue _wsq; +}; + +// ---------------------------------------------------------------------------- +// Class Definition: WorkerView +// 
---------------------------------------------------------------------------- + +/** +@class WorkerView + +@brief class to create an immutable view of a worker in an executor + +An executor keeps a set of internal worker threads to run tasks. +A worker view provides users an immutable interface to observe +when a worker runs a task, and the view object is only accessible +from an observer derived from tf::ObserverInterface. +*/ +class WorkerView { + + friend class Executor; + + public: + + /** + @brief queries the worker id associated with the executor + + A worker id is a unsigned integer in the range [0, N), + where @c N is the number of workers spawned at the construction + time of the executor. + */ + size_t id() const; + + /** + @brief queries the size of the queue (i.e., number of pending tasks to + run) associated with the worker + */ + size_t queue_size() const; + + /** + @brief queries the current capacity of the queue + */ + size_t queue_capacity() const; + + private: + + WorkerView(const Worker&); + WorkerView(const WorkerView&) = default; + + const Worker& _worker; + +}; + +// Constructor +inline WorkerView::WorkerView(const Worker& w) : _worker{w} { +} + +// function: id +inline size_t WorkerView::id() const { + return _worker._id; +} + +// Function: queue_size +inline size_t WorkerView::queue_size() const { + return _worker._wsq.size(); +} + +// Function: queue_capacity +inline size_t WorkerView::queue_capacity() const { + return static_cast(_worker._wsq.capacity()); +} + + +} // end of namespact tf ----------------------------------------------------- + + diff --git a/taskflow/cublasflow.hpp b/taskflow/cublasflow.hpp new file mode 100644 index 0000000..c714dbf --- /dev/null +++ b/taskflow/cublasflow.hpp @@ -0,0 +1,24 @@ +#pragma once + +// taskflow.hpp +// ^ +// | +// cudaflow.hpp +// ^ +// | +// cublasflow.hpp + +#include "cudaflow.hpp" + +#include "cuda/cublas/cublas_flow.hpp" +#include "cuda/cublas/cublas_helper.hpp" +#include "cuda/cublas/cublas_level1.hpp" +#include "cuda/cublas/cublas_level2.hpp" +#include "cuda/cublas/cublas_level3.hpp" + +/** +@file cublasflow.hpp +@brief main cublasFlow include file +*/ + + diff --git a/taskflow/cuda/cublas/cublas_error.hpp b/taskflow/cuda/cublas/cublas_error.hpp new file mode 100644 index 0000000..65052a3 --- /dev/null +++ b/taskflow/cuda/cublas/cublas_error.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include + +namespace tf { + +// cuBLAS API errors +constexpr const char* cublas_error_to_string(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return "unknown cublas error"; +} + + + +#define TF_CHECK_CUBLAS(...) 
\ +if(TF_CUDA_GET_FIRST(__VA_ARGS__) != CUBLAS_STATUS_SUCCESS) { \ + std::ostringstream oss; \ + auto ev = TF_CUDA_GET_FIRST(__VA_ARGS__); \ + auto error_str = cublas_error_to_string(ev); \ + oss << "[" << __FILE__ << ":" << __LINE__ << " " \ + << error_str << "] "; \ + tf::ostreamize(oss, TF_CUDA_REMOVE_FIRST(__VA_ARGS__)); \ + throw std::runtime_error(oss.str()); \ +} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/cuda/cublas/cublas_flow.hpp b/taskflow/cuda/cublas/cublas_flow.hpp new file mode 100644 index 0000000..7f4d819 --- /dev/null +++ b/taskflow/cuda/cublas/cublas_flow.hpp @@ -0,0 +1,1361 @@ +#pragma once + +#include "cublas_handle.hpp" + +/** +@file cublas_flow.hpp +@brief cublasFlowCapturer include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// cublasFlowCapturer definition +// ---------------------------------------------------------------------------- + +/** +@class cublasFlowCapturer + +@brief class to construct a cuBLAS task graph + +%cublasFlowCapturer provides a higher-level interface over the @cuBLAS library +and hide concurrency details from users. +It inherits methods from tf::cudaFlowCapturerBase and must be used from +a tf::cudaFlowCapturer object. +All pointers used to %cublasFlowCapturer methods must be in GPU memory space or managed +(i.e., @c cudaMallocManaged), +including scalars, @c alpha and @c beta, input data and output data pointers. +The following example uses @c cublasamax to find the minimum index of the element +of the maximum absolute magnitude in a vector. + +@code{.cpp} +#include + +int main() { + tf::Executor executor; + tf::Taskflow taskflow; + + size_t N = 1024; + float *x = nullptr; + int *d_res; + int h_res; + + std::vector host(N, 0.0f); + host[512] = 100.0f; // artificially set the mid-position to the largest + + cudaMalloc(&x, N*sizeof(float)); + cudaMalloc(&d_res, sizeof(int)); + + taskflow.emplace([&](tf::cudaFlowCapturer& capturer){ + auto* cublas = capturer.make_capturer(); + + tf::cudaTask h2d = capturer.copy(x, host.data(), N); + tf::cudaTask find_max = cublas->amax(N, x, 1, d_res); + tf::cudaTask d2h = capturer.copy(&h_res, d_res, 1); + + h2d.precede(find_max); // amax runs before host-to-device copy + find_max.precede(d2h); // amax runs after device-to-host copy + }); + + executor.run(taskflow).wait(); + + assert(h_res == 512); +} +@endcode + +Currently, %cublasFlowCapturer supports only @c float and @c double data types. + +We design most tf::cublasFlowCapturer methods on top of the native, +high-performance @cuBLAS library. +You may refer to @cuBLAS for more details. + +*/ +class cublasFlowCapturer : public cudaFlowCapturerBase { + + public: + + /** + @brief constructs a cublas flow capturer + */ + cublasFlowCapturer() = default; + + /** + @brief gets the native cublas handle associated with this %cublasFlowCapturer + + @return a native cublas handle of type cublasHandle_t + */ + cublasHandle_t native_handle(); + + /** + @brief copies vector data from host to device + + This method copies @c n elements from a vector @c h in host memory space + to a vector @c d in GPU memory space. + The storage spacing between consecutive elements is given by @c inch for + the source vector @c h and by @c incd for the destination vector @c d. + + This method calls native @c cublasSetVectorAsync with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... 
are the given arguments. + + @tparam T data type + @param n number of elements + @param d target device pointer + @param incd spacing between consecutive elements in @c d + @param h source host pointer + @param inch spacing between consecutive elements in @c h + + @return a tf::cudaTask handle + */ + template , void>* = nullptr + > + cudaTask vset(size_t n, const T* h, int inch, T* d, int incd); + + /** + @brief copies vector data from device to host + + This method copies @c n elements from a vector @c d in GPU memory space + to a vector @c h in host memory space. + The storage spacing between consecutive elements is given by @c inch for + the target vector @c h and by @c incd for the source vector @c d. + + This method calls native @c cublasGetVectorAsync with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + @param n number of elements + @param h target host pointer + @param inch spacing between consecutive elements in @c h + @param d source device pointer + @param incd spacing between consecutive elements in @c d + + @return a tf::cudaTask handle + */ + template , void>* = nullptr + > + cudaTask vget(size_t n, const T* d, int incd, T* h, int inch); + + // ------------------------------------------------------------------------ + // Level-1 vector-vector operations + // ------------------------------------------------------------------------ + + /** + @brief finds the smallest index of the element of the maximum + absolute magnitude + + This method calls native @c cublasamax with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements in vector @c x + @param x pointer to the memory address of the vector + @param incx stride between consecutive elements of @c x + @param result the resulting index (1-based indexing) + + @return a tf::cudaTask handle + */ + template + cudaTask amax(int n, const T* x, int incx, int* result); + + /** + @brief finds the smallest index of the element of the minimum + absolute magnitude + + This method calls native @c cublasamin with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements in vector @c x + @param x pointer to the memory address of the vector + @param incx stride between consecutive elements of @c x + @param result the resulting index (1-based indexing) + + @return a tf::cudaTask handle + */ + template + cudaTask amin(int n, const T* x, int incx, int* result); + + /** + @brief finds the sum of absolute values of the elements over a vector + + This method calls native @c cublasasum with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements in vector @c x + @param x pointer to the memory address of the vector + @param incx stride between consecutive elements of @c x + @param result the result + + @return a tf::cudaTask handle + */ + template + cudaTask asum(int n, const T* x, int incx, T* result); + + /** + @brief multiples a vector by a scalar and adds it to a vector + + This function multiplies the vector @c x by the scalar @c alpha and + adds it to the vector @c y overwriting the latest vector with the result. 
+ Hence, the performed operation is: + + y[j] = alpha * x[k] + y[j], + + where @c j and @c k are indices of @c n elements with step sizes + @c incy and @c incx. + + This method calls native @c cublasasum with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements in vectors @c x and @c y + @param alpha scalar used to multiplication + @param x pointer to the memory address of the vector @c x + @param incx stride between consecutive elements of @c x + @param y pointer to the memory address of the vector @c y + @param incy stride between consecutive elements of @c y + + @return a tf::cudaTask handle + */ + template + cudaTask axpy( + int n, const T *alpha, const T *x, int incx, T *y, int incy + ); + + /** + @brief copies a vector to another vector + + This function copies @c n elements from a vector @c x of a step size @c incx + to another vector @c y of step size @c incy. + + adds it to the vector @c y overwriting the latest vector with the result. + Hence, the performed operation is: + + y[j] = x[k], + + where @c j and @c k are indices of @c n elements with step sizes + @c incy and @c incx. + + This method calls native @c cublascopy with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements to copy + @param x pointer to the memory address of the vector @c x + @param incx stride between consecutive elements of @c x + @param y pointer to the memory address of the vector @c y + @param incy stride between consecutive elements of @c y + + @return a tf::cudaTask handle + */ + template + cudaTask vcopy(int n, const T* x, int incx, T* y, int incy); + + /** + @brief computes the dot product of two vectors + + sum += x[i] * y[i] + + This method calls native @c cublasdot with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements to perform the dot product + @param x pointer to the memory address of the vector @c x + @param incx stride between consecutive elements of @c x + @param y pointer to the memory address of the vector @c y + @param incy stride between consecutive elements of @c y + @param result the resulting dot product + + @return a tf::cudaTask handle + */ + template + cudaTask dot(int n, const T* x, int incx, const T* y, int incy, T* result); + + /** + @brief computes the Euclidean norm of a vector + + This method calls native @c cublasnrm2 with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements in vector @c x + @param x pointer to the memory address of the vector + @param incx stride between consecutive elements of @c x + @param result the result + + @return a tf::cudaTask handle + */ + template + cudaTask nrm2(int n, const T* x, int incx, T* result); + + /** + @brief scales a vector by a scalar + + This method calls native @c cublasscal with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. 
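  For illustration, a hedged sketch that scales a device vector in place; it assumes @c n, @c alpha, and @c x are already prepared in GPU-accessible memory by the surrounding code:

  @code{.cpp}
  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // x[i] = (*alpha) * x[i] for the n elements of x (stride 1);
    // n, alpha, x are assumed to be set up elsewhere (illustrative)
    cublas->scal(n, alpha, x, 1);
  });
  @endcode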
+ + @tparam T data type + + @param n number of elements in vector @c x + @param scalar scalar used for multiplication + @param x pointer to the memory address of the vector + @param incx stride between consecutive elements of @c x + + @return a tf::cudaTask handle + */ + template + cudaTask scal(int n, const T* scalar, T* x, int incx); + + /** + @brief swaps elements between two vectors + + This function interchanges the elements of vectors @c x and @c y. + Hence, the performed operation is: + + y[j] <-> x[k], + + where @c j is the index of element in @c y with a step size @c incy and + @c k is the index of element in @c x with a step size @c incx. + + This method calls native @c cublasswap with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param n number of elements to perform the dot product + @param x pointer to the memory address of the vector @c x + @param incx stride between consecutive elements of @c x + @param y pointer to the memory address of the vector @c y + @param incy stride between consecutive elements of @c y + + @return a tf::cudaTask handle + */ + template + cudaTask swap(int n, T* x, int incx, T* y, int incy); + + // ------------------------------------------------------------------------ + // TODO Level-2 matrix_vector operations + // ------------------------------------------------------------------------ + + /** + @brief performs matrix-vector multiplication + + This function performs matrix-vector multiplication: + + y = alpha * op(A) * x + beta * y, + + where @c alpha and @c beta are scalars, @c A + is a 2D matrix stored in column-major format, + and @c x, @c y are vectors. + + The input matrices are in column-major storage. + + This method calls native @c cublasgemv with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param trans transport operation @c op(A) + @param m number of rows of matrix @c A + @param n number of columns of matrix @c A + @param alpha pointer to the @c alpha scalar + @param A pointer to the address of @c A + @param lda leading dimension of 2D array used to store the matrix @c A + @param x pointer to the address of @c x of at least + (1 + (n - 1) * abs(incx)) elements if no transposition, + or (1 + (m - 1) * abs(incx)) elements otherwise. 
+ @param incx stride between consecutive elements of @c x + @param beta pointer to the @c beta scalar + @param y pointer to the address of @c y + @param incy stride between consecutive elements of @c y + + @return a tf::cudaTask handle + */ + + template + cudaTask gemv( + cublasOperation_t trans, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy + ); + + /** + @brief similar to tf::cublasFlowCapturer::gemv but operates on C-styled + row-major layout + */ + + template + cudaTask c_gemv( + cublasOperation_t trans, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy + ); + + /** + @brief performs symmetric matrix-vector multiplication + + This function performs symmetric matrix-vector multiplication: + + y = alpha * A * x + beta * y, + + where @c alpha and @c beta are scalars, @c A + is a 2D symmetric matrix stored in column-major format, + and @c x, @c y are vectors + + This method calls native @c cublassymv with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param uplo indicates if matrix @c A lower or upper part is stored, + the other symmetric part is not referenced and is inferred + from the stored elements + @param n number of rows and columns of matrix @c A + @param alpha pointer to the @c alpha scalar + @param A pointer to the address of @c A + @param lda leading dimension of 2D array used to store the matrix @c A + @param x pointer to the address of @c x + @param incx stride between consecutive elements of @c x + @param beta pointer to the @c beta scalar + @param y pointer to the address of @c y + @param incy stride between consecutive elements of @c y + + @return a tf::cudaTask handle + */ + template + cudaTask symv( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy + ); + + /** + @brief similar to tf::cublasFlowCapturer::symv but operates on + C-styled row-major layout + */ + template + cudaTask c_symv( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy + ); + + /** + @brief performs symmetric rank-1 update + + This function performs symmetric rank-1 update: + + A = alpha * x * x^T + A, + + where @c alpha is a scalar, @c A + is a 2D symmetric matrix stored in column-major format, + and @c x is a vector. + + The result is also symmetric and is stored on in the @c uplo part + of @c A. + + This method calls native @c cublassyr with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. 
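  A possible sketch of a lower-triangular rank-1 update, assuming @c n, @c lda, @c alpha, @c x, and @c A are valid device-resident arguments prepared elsewhere:

  @code{.cpp}
  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // accumulate alpha * x * x^T into the lower triangular part of A;
    // all operands are assumed device pointers (illustrative)
    cublas->syr(CUBLAS_FILL_MODE_LOWER, n, alpha, x, 1, A, lda);
  });
  @endcode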
+ + @tparam T data type + + @param uplo indicates if matrix @c A lower or upper part is stored, + the other symmetric part is not referenced and is inferred + from the stored elements + @param n number of rows and columns of matrix @c A + @param alpha pointer to the @c alpha scalar + @param x pointer to the address of @c x + @param incx stride between consecutive elements of @c x + @param A pointer to the address of @c A + @param lda leading dimension of 2D array used to store the matrix @c A + + @return a tf::cudaTask handle + */ + template + cudaTask syr( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + T *A, int lda + ); + + /** + @brief similar to tf::cublasFlowCapturer::c_syr but operates on + C-styled row-major layout + */ + template + cudaTask c_syr( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + T *A, int lda + ); + + /** + @brief performs symmetric rank-2 update + + This function performs symmetric rank-2 update: + + A = alpha * x * y^T + y * x^T + A, + + where @c alpha is a scalar, @c A + is a 2D symmetric matrix stored in column-major format, + and @c x and @c y are vectors. + + The result is also symmetric and is stored on in the @c uplo part + of @c A. + + This method calls native @c cublassyr2 with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param uplo indicates if matrix @c A lower or upper part is stored, + the other symmetric part is not referenced and is inferred + from the stored elements + @param n number of rows and columns of matrix @c A + @param alpha pointer to the @c alpha scalar + @param x pointer to the address of @c x + @param incx stride between consecutive elements of @c x + @param y pointer to the address of @c y + @param incy stride between consecutive elements of @c y + @param A pointer to the address of @c A + @param lda leading dimension of 2D array used to store the matrix @c A + + @return a tf::cudaTask handle + */ + template + cudaTask syr2( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + const T *y, int incy, + T *A, int lda + ); + + /** + @brief similar to tf::cublasFlowCapturer::syr2 but operates on + C-styled row-major layout + */ + template + cudaTask c_syr2( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + const T *y, int incy, + T *A, int lda + ); + + /** + @brief performs the triangular matrix-vector multiplication + + This method performs the triangular matrix-vector multiplication: + + x = op(A), + + where @c A is a triangular matrix stored in lower or upper mode + with or without the main diagonal, and @c x is a vector. 
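  One way this might be used, assuming an upper-triangular, non-unit-diagonal @c A and a vector @c x already resident on the device:

  @code{.cpp}
  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // x = A * x, using the upper triangle of A with a non-unit diagonal;
    // n, A, lda, x are assumed to be prepared elsewhere (illustrative)
    cublas->trmv(
      CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      n, A, lda, x, 1
    );
  });
  @endcode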
+ + @tparam T data type + @param uplo indicates if matrix @c A lower or upper part is stored, + the other part is not referenced and is inferred from + the stored elements + @param tran transpose operation @c op(A) + @param diag indicates if the elements on the main diagonal of matrix @c A + are unity (i.e., all 1s) and of no need to be accessed + @param n number of rows and columns of matrix @c A + @param A pointer to the address of A + @param lda leading dimension of 2D array used to store matrix @c A + @param x input of vector @c b and output of the solution on exit + @param incx stride between consecutive elements of @c x + */ + template + cudaTask trmv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx + ); + + /** + @brief similar to tf::cublasFlowCapturer::trmv but operates on C-styled + row-major layout + */ + template + cudaTask c_trmv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx + ); + + /** + @brief solves the triangular linear system with a single right-hand-side + + This method solves the triangular linear system with a single right-hand-side + + op(A) x = b, + + where @c A is a triangular matrix stored in lower or upper mode + with or without the main diagonal, and @c x and @c b are vectors. + + @tparam T data type + @param uplo indicates if matrix @c A lower or upper part is stored, + the other part is not referenced and is inferred from + the stored elements + @param tran transpose operation @c op(A) + @param diag indicates if the elements on the main diagonal of matrix @c A + are unity (i.e., all 1s) and of no need to be accessed + @param n number of rows and columns of matrix @c A + @param A pointer to the address of A + @param lda leading dimension of 2D array used to store matrix @c A + @param x input of vector @c b and output of the solution on exit + @param incx stride between consecutive elements of @c x + */ + template + cudaTask trsv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx + ); + + /** + @brief similar to tf::cublasFlowCapturer::trsv but operates on C-styled + row-major layout + */ + template + cudaTask c_trsv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx + ); + + // ------------------------------------------------------------------------ + // Level-3 matrix-matrix operations + // ------------------------------------------------------------------------ + + /** + @brief performs matrix-matrix addition and transposition + + This method performs the matrix-matrix addition/transposition: + + C = alpha * op(A) + beta * op(B), + + where @c alpha and @c beta are scalars, and @c A, @c B and @c C are matrices + stored in column-major format with dimensions @c op(A) as @c m by @c n, + @c op(B) as @c m by @c n and @c C as @c m by @c n, respectively. + + The operation is out-of-place if @c C does not overlap @c A or @c B. + + The in-place mode supports the following two operations: + + 1. C = alpha * C + beta * op(B) + 2. C = alpha * op(A) + beta * C + + For in-place mode, if @c C equals @c A, @c ldc equals @c lda and + @c ta equals @c CUBLAS_OP_N. If @c C equals @c B, @c ldc equals @c ldb + and @c tb equals CUBLAS_OP_N. + + The operation includes the following special cases: + + 1. the user can reset matrix @c C to zero by setting @c alpha and + @c beta to 0 + 2. 
the user can transpose matrix @c A by setting @c alpha to 1 and + @c beta to 0 + + The input matrices are in column-major storage. + + This method calls native @c cublasgeam with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param ta transport operation @c op(A) + @param tb transport operation @c op(B) + @param m number of rows of matrix @c C and @c op(A) + @param n number of columns of matrix @c C and @c op(B) + @param alpha pointer to the @c alpha scalar + @param A pointer to the address of @c A + @param lda leading dimension of 2D array used to store the matrix @c A + @param beta pointer to the @c beta scalar + @param B pointer to the address of @c B + @param ldb leading dimension of 2D array used to store the matrix @c B + @param C pointer to the address of @c C + @param ldc leading dimension of 2D array used to store the matrix @c C + + @return a tf::cudaTask handle + */ + template + cudaTask geam( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *beta, + const T *B, int ldb, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::geam but on row-major layout + */ + template + cudaTask c_geam( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *beta, + const T *B, int ldb, + T *C, int ldc + ); + + /** + @brief performs matrix-matrix multiplication + + This function performs matrix-matrix multiplication: + + C = alpha * op (A) * op (B) + beta * C, + + where @c alpha and @c beta are scalars, and @c A, @c B, and @c C + are 2D matrices stored in column-major format + with dimension @c op(A) as @c m by @c k, + dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n. + + The input matrices are in column-major storage. + + This method calls native @c cublasgemm with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param ta transport operation @c op(A) + @param tb transport operation @c op(B) + @param m number of rows of matrix @c C and @c op(A) + @param n number of columns of matrix @c C and @c op(B) + @param k number of columns of @c op(A) and rows of @c op(B) + @param alpha pointer to the @c alpha scalar + @param A pointer to the address of @c A + @param lda leading dimension of 2D array used to store the matrix @c A + @param B pointer to the address of @c B + @param ldb leading dimension of 2D array used to store the matrix @c B + @param beta pointer to the @c beta scalar + @param C pointer to the address of @c C + @param ldc leading dimension of 2D array used to store the matrix @c C + + @return a tf::cudaTask handle + */ + template + cudaTask gemm( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::gemm but operates on C-styled + row-major layout + */ + template + cudaTask c_gemm( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief performs matrix-matrix multiplication over a batch of matrices + + The batch must be @em uniform. 
+ All instances in the batch must have the same dimensions (m, n, k), + leading dimensions (lda, ldb, ldc) and transpositions + (ta, tb) for their respective @c A, @c B and @c C matrices. + The address of the input matrices and the output matrix of each instance + of the batch are read from arrays of pointers passed to the function + by the caller. + + C[i]= alpha * op (A[i]) * op (B[i]) + beta * C[i], i in [0, bc), + + where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i] + are 2D matrices stored in column-major format + with dimension @c op(A) as @c m by @c k, + dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n. + + The input matrices are in column-major storage. + + This method calls native @c cublasgemmBatched with packed parameters, + (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + + @param ta transport operation @c op(A[i]) + @param tb transport operation @c op(B[i]) + @param m number of rows of matrix @c C[i] and @c op(A[i]) + @param n number of columns of matrix @c C[i] and @c op(B[i]) + @param k number of columns of @c op(A[i]) and rows of @c op(B[i]) + @param alpha pointer to the @c alpha scalar + @param A array pointer to @c A batch + @param lda leading dimension of 2D array used to store the matrix @c A[i] + @param B array pointer to @c B batch + @param ldb leading dimension of 2D array used to store the matrix @c B[i] + @param beta pointer to the @c beta scalar + @param C array pointer to @c C batch + @param ldc leading dimension of 2D array used to store the matrix @c C[i] + @param bc batch size (number of matrices) + + @return a tf::cudaTask handle + */ + template + cudaTask gemm_batched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A[], int lda, + const T *B[], int ldb, + const T *beta, + T *C[], int ldc, + int bc + ); + + /** + @brief similar to tf::cublasFlowCapturer::gemm_batched but operates on + C-styled row-major layout + */ + template + cudaTask c_gemm_batched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A[], int lda, + const T *B[], int ldb, + const T *beta, + T *C[], int ldc, + int bc + ); + + /** + @brief performs matrix-matrix multiplication over a batch of matrices + with strided memory access + + Here, we use @c A[i], @c B[i], @c C[i] as notation + for A, B and C matrices in the @c i-th instance of the batch, + implicitly assuming they are respectively address offsets + @c sA, @c sB, @c sC away from @c A[i-1], @c B[i-1], @c C[i-1]. + + The input matrices are in column-major storage. + + This method calls native @c cublasgemmStridedBatched with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. 
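  A hedged sketch of a strided batch of @c bc independent column-major products, assuming contiguous packing so that consecutive instances are @c m*k, @c k*n, and @c m*n elements apart:

  @code{.cpp}
  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C[i] = alpha * A[i] * B[i] + beta * C[i] for i in [0, bc);
    // m, n, k, bc, alpha, beta, A, B, C are assumed set up elsewhere (illustrative)
    cublas->gemm_sbatched(
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, n, k,
      alpha,
      A, m, m*k,
      B, k, k*n,
      beta,
      C, m, m*n,
      bc
    );
  });
  @endcode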
+ + @tparam T data type + + @param ta transport operation @c op(A[i]) + @param tb transport operation @c op(B[i]) + @param m number of rows of matrix @c C[i] and @c op(A[i]) + @param n number of columns of matrix @c C[i] and @c op(B[i]) + @param k number of columns of @c op(A[i]) and rows of @c op(B[i]) + @param alpha pointer to the @c alpha scalar + @param A pointer to @c A batch + @param lda leading dimension of 2D array used to store the matrix @c A[i] + @param sA address offset between @c A[i] and @c A[i+1] + @param B pointer to @c B batch + @param ldb leading dimension of 2D array used to store the matrix @c B[i] + @param sB address offset between @c B[i] and @c B[i+1] + @param beta pointer to the @c beta scalar + @param C pointer to @c C batch + @param ldc leading dimension of 2D array used to store the matrix @c C[i] + @param sC address offset between @c C[i] and @c C[i+1] + @param bc batch size (number of matrices) + + @return a tf::cudaTask handle + + The batch must be @em uniform. + All instances in the batch must have the same dimensions (m, n, k), + leading dimensions (lda, ldb, ldc) and transpositions + (ta, tb) for their respective @c A, @c B and @c C matrices. + Input matrices @c A, @c B and output matrix @c C for each instance of the batch + are located at fixed address offsets from their locations in the previous instance. + Pointers to @c A, @c B and @c C matrices for the first instance are passed + to the function by the user along with the address @em offsets - + @c sA, @c sB and @c sC that determine the locations + of input and output matrices in future instances. + + C + i*sC = alpha * op (A + i*sA) * op (B + i*sB) + beta * (C + i*sC), i in [0, bc), + + where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i] + are 2D matrices stored in column-major format + with dimension @c op(A) as @c m by @c k, + dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n. + + On certain problem sizes, it might be advantageous to create multiple gemm tasks + to take advantage of concurrent kernels, rather than this method. + */ + template + cudaTask gemm_sbatched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, long long int sA, + const T *B, int ldb, long long int sB, + const T *beta, + T *C, int ldc, long long int sC, + int bc + ); + + /** + @brief similar to tf::cublasFlowCapturer::c_gemm_sbatched but operates on + C-styled row-major layout + */ + template + cudaTask c_gemm_sbatched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, long long int sA, + const T *B, int ldb, long long int sB, + const T *beta, + T *C, int ldc, long long int sC, + int bc + ); + + /** + @brief performs the symmetric matrix-matrix multiplication + + The method performs symmetric matrix-matrix multiplication: + + C = alpha * A * B + beta * C, if side == CUBLAS_SIDE_LEFT, or + + C = alpha * B * A + beta * C, if side == CUBLAS_SIDE_RIGHT. + + @c A is a symmetric matrix stored in lower or upper mode, + @c B and @c C are @c m by @c n matrices, and @c alpha and @c beta + are scalars. + + This method calls native @c cublassymm with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + @param side indicates if matrix @c A is on the left or right of @c B. 
+ @param uplo indicates if matrix @c A lower or upper part is stored, + the other symmetric part is not referenced and + is inferred from the stored elements. + @param m number of rows of matrix @c C and @c B, + with matrix @c A sized accordingly + @param n number of columns of matrix @c C and @c B, + with matrix @c A sized accordingly + @param alpha scalar used for multiplication + @param A pointer to the address of matrix @c A + @param lda leading dimension of the 2D array used to store A + @param B pointer to the address of matrix @c B + @param ldb leading dimension of the 2D array used to store B + @param beta scalar used for multiplication + @param C pointer to the address of matrix @c C + @param ldc leading dimension of the 2D array used to store C + + */ + template + cudaTask symm( + cublasSideMode_t side, cublasFillMode_t uplo, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::symm but operates on + C-styled row-major layout + */ + template + cudaTask c_symm( + cublasSideMode_t side, cublasFillMode_t uplo, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief performs the symmetric rank-k update + + This method performs the symmetric rank-k update : + + C = alpha * op(A) * op(A)^T + beta * C, + + where @c alpha and @c beta are scalars, @c C is a symmetric matrix + stored in lower or upper mode, and @c A is a matrix with dimension + @c op(A) @c n by @c k. + + The result is stored to @c uplo part of @c C. + + This method calls native @c cublassyrk with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + @param uplo indicates if matrix @c C lower or upper part is stored, + the other symmetric part is not referenced and is + inferred from the stored elements. + @param tran transposition operation to apply to @c A + @param n number of rows of matrix @c C and @c op(A) + @param k number of columns of matrix @c op(A) + @param alpha scalar used for multiplication + @param A pointer to the address of @c A + @param lda leading dimension of the 2D array used to store @c A + @param beta scalar used for multiplication + @param C pointer to the address of @c C + @param ldc leading dimension of the 2D array used to store @c C + */ + template + cudaTask syrk( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *beta, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::c_syrk but operates on + C-styled row-major layout + */ + template + cudaTask c_syrk( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *beta, + T *C, int ldc + ); + + /** + @brief performs the symmetric rank-2k update + + This method performs the symmetric rank-2k update : + + C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C, + + where @c alpha and @c beta are scalars, @c C is a symmetric matrix + stored in lower or upper mode, and @c A and @c B are two matrices + with dimensions @c op(A) and op(B) @c n by @c k. + + The result is stored to @c uplo part of @c C. + + This method calls native @c cublassyr2k with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. 
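  A possible sketch of a rank-2k update that accumulates into the lower triangle of @c C, assuming device-resident operands sized as described above:

  @code{.cpp}
  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C = alpha * (A * B^T + B * A^T) + beta * C, lower triangle of C stored;
    // n, k, alpha, beta, A, B, C are assumed prepared elsewhere (illustrative)
    cublas->syr2k(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
      n, k,
      alpha,
      A, n,
      B, n,
      beta,
      C, n
    );
  });
  @endcode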
+ + @tparam T data type + @param uplo indicates if matrix @c C lower or upper part is stored, + the other symmetric part is not referenced and is + inferred from the stored elements. + @param tran transposition operation to apply to @c A + @param n number of rows of matrix @c C and @c op(A) + @param k number of columns of matrix @c op(A) + @param alpha scalar used for multiplication + @param A pointer to the address of @c A + @param lda leading dimension of the 2D array used to store @c A + @param B pointer to the address of @c B + @param ldb leading dimension of the 2D array used to store @c B + @param beta scalar used for multiplication + @param C pointer to the address of @c C + @param ldc leading dimension of the 2D array used to store @c C + */ + template + cudaTask syr2k( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::syr2k but operates on + C-styled row-major layout + */ + template + cudaTask c_syr2k( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief performs a variation of the symmetric rank-k update + + This method performs a variation of the symmetric rank-k update: + + C = alpha * op(A) * op(B)^T + beta * C, + + where @c alpha and @c beta are scalars, @c C is a symmetric matrix + stored in lower or upper mode, and @c A and @c B are two matrices + with dimensions @c op(A) and op(B) @c n by @c k. + + The result is stored to @c uplo part of @c C. + + This method calls native @c cublassyr2k with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + @param uplo indicates if matrix @c C lower or upper part is stored, + the other symmetric part is not referenced and is + inferred from the stored elements. + @param tran transposition operation to apply to @c A + @param n number of rows of matrix @c C and @c op(A) + @param k number of columns of matrix @c op(A) + @param alpha scalar used for multiplication + @param A pointer to the address of @c A + @param lda leading dimension of the 2D array used to store @c A + @param B pointer to the address of @c B + @param ldb leading dimension of the 2D array used to store @c B + @param beta scalar used for multiplication + @param C pointer to the address of @c C + @param ldc leading dimension of the 2D array used to store @c C + */ + template + cudaTask syrkx( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::syrkx but operates on + C-styled row-major layout + */ + template + cudaTask c_syrkx( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc + ); + + /** + @brief performs triangular matrix-matrix multiplication + + This method performs triangular matrix-matrix multiplication: + + C = alpha * op(A) * B, if side == CUBLAS_SIDE_LEFT, or + + C = alpha * B * op(A), if side == CUBLAS_SIDE_RIGHT, + + where @c A is a triangular matrix stored in lower or upper mode with + or without the main diagonal, @c B and @c C are @c m by @c n matrix, + and @c alpha is a scalar. 
+ + This method calls native @c cublastrmm with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. + + @tparam T data type + @param side indicates if matrix @c A is on the left or right of @c B + @param uplo indicates if matrix @c A lower or upper part is stored, + the other part is not referenced and is inferred from + the stored elements + @param tran transposition operation to apply to @c A + @param diag indicates if the elements on the main diagonal of matrix + @c A are unity and should not be accessed. + @param m number of rows of matrix @c B, with matrix @c A sized accordingly + @param n number of columns of matrix @c B, with matrix @c A sized accordingly + @param alpha scalar used for multiplication + @param A pointer to the address of matrix @c A + @param lda leading dimension of the 2D array used to store @c A + @param B pointer to the address of matrix @c B + @param ldb leading dimension of the 2D array used to store @c B + @param C pointer to the address of matrix @c C + @param ldc leading dimension of the 2D array used to store @c C + + Notice that in this method, @c B and @c C can point to the same address + in which case the in-place implementation is performed + (with results written back to @c B). + */ + template + cudaTask trmm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + T *C, int ldc + ); + + /** + @brief similar to tf::cublasFlowCapturer::trmm but oeprates on C-styled + row-major layout + */ + template + cudaTask c_trmm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + T *C, int ldc + ); + + /** + @brief solves the triangular linear system with multiple right-hand-sides + + This method solves the triangular linear system with multiple + right-hand-sides: + + op(A) * X = alpha * B, if side == CUBLAS_SIDE_LEFT, or + + X * op(A) = alpha * B, if side == CUBLAS_SIDE_RIGHT, + + where @c A is a triangular matrix stored in lower or upper mode + with or without the main diagonal, @c X and @c B are @c m by @c n matrices, + and @c alpha is a scalar. + + The solution @c X overwrites the right-hand-sides @c B on exit. + + This method calls native @c cublastrsm with + packed parameters, (handle, args...), where @c handle is managed by + the %cublasFlowCapturer and @c args... are the given arguments. 
+ + @tparam T data type + @param side indicates if @c A is on the left or right side of @c X + @param uplo indicates if matrix @c A lower or upper part is stored, + the other part is not referenced and is inferred from + the stored elements + @param tran transposition operation to apply to @c A + @param diag indicates if the elements on the main diagonal of matrix @c A + are unity and should not be accessed + @param m number of rows in matrix @c B, with matrix @c A sized accordingly + @param n number of columns in matrix @c B, with matrix @c A sized accordingly + @param alpha scalar to apply to @c B + @param A pointer to the address of matrix @c A + @param lda leading dimension of the 2D array used to store @c A + @param B pointer to the address of matrix @c B + @param ldb leading dimension of the 2D array used to store @c B + */ + template + cudaTask trsm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + T *B, int ldb + ); + + /** + @brief similar to tf::cublasFlowCapturer::trsm but operates on C-styled + row-major layout + */ + template + cudaTask c_trsm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + T *B, int ldb + ); + + private: + + cublasScopedPerThreadHandle _handle; + + void _stream(cudaStream_t); +}; + +// Procedure: _stream +inline void cublasFlowCapturer::_stream(cudaStream_t stream) { + TF_CHECK_CUBLAS( + cublasSetStream(_handle, stream), "failed to set cublas stream" + ); +} + +// Function: native_handle +inline cublasHandle_t cublasFlowCapturer::native_handle() { + return _handle; +} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/taskflow/cuda/cublas/cublas_handle.hpp b/taskflow/cuda/cublas/cublas_handle.hpp new file mode 100644 index 0000000..b854090 --- /dev/null +++ b/taskflow/cuda/cublas/cublas_handle.hpp @@ -0,0 +1,156 @@ +#pragma once + +#include "cublas_error.hpp" + +/** +@file cublas_handle.hpp +*/ + +namespace tf { + +/** @private */ +struct cublasHandleCreator { + cublasHandle_t operator () () const { + cublasHandle_t handle; + + TF_CHECK_CUBLAS( + cublasCreate(&handle), "failed to create a cublas handle" + ); + + TF_CHECK_CUBLAS( + cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE), + "failed to set cublas pointer mode on device" + ); + + //std::cout << "create cublas handle " << handle << '\n'; + return handle; + } +}; + +/** @private */ +struct cublasHandleDeleter { + void operator () (cublasHandle_t ptr) const { + //std::cout << "destroy cublas handle " << ptr << '\n'; + cublasDestroy(ptr); + } +}; + +/** +@private alias of per-thread cublas handle pool type + */ +using cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool< + cublasHandle_t, cublasHandleCreator, cublasHandleDeleter +>; + +/** +@private acquires the per-thread cublas stream pool +*/ +inline cublasPerThreadHandlePool& cublas_per_thread_handle_pool() { + thread_local cublasPerThreadHandlePool pool; + return pool; +} + +// ---------------------------------------------------------------------------- +// cublasScopedPerThreadHandle definition +// ---------------------------------------------------------------------------- + +/** +@brief class to provide RAII-styled guard of cublas handle acquisition + +Sample usage: + +@code{.cpp} +{ + tf::cublasScopedPerThreadHandle handle(1); // acquires a cublas handle on device 1 + + // 
use handle as a normal cublas handle (cublasHandle_t) + cublasSetStream(handle, stream); + +} // leaving the scope to release the handle back to the pool on device 1 +@endcode + +By default, the cublas handle has a pointer mode set to device +(i.e., @c CUBLAS_POINTER_MODE_DEVICE), +that is required for capturing cublas kernels. +The scoped per-thread cublas handle is primarily used by tf::cublasFlowCapturer. + +%cublasScopedPerThreadHandle is non-copyable. + */ +class cublasScopedPerThreadHandle { + + public: + + /** + @brief constructs a scoped handle under the given device context + + The constructor acquires a handle from a per-thread handle pool. + */ + explicit cublasScopedPerThreadHandle(int d) : + _ptr {cublas_per_thread_handle_pool().acquire(d)} { + } + + /** + @brief constructs a scoped handle under caller's device context + + The constructor acquires a handle from a per-thread handle pool. + */ + cublasScopedPerThreadHandle() : + _ptr {cublas_per_thread_handle_pool().acquire(cuda_get_device())} { + } + + /** + @brief destructs the scoped handle guard + + The destructor releases the handle to the per-thread handle pool. + */ + ~cublasScopedPerThreadHandle() { + if(_ptr) { + cublas_per_thread_handle_pool().release(std::move(_ptr)); + } + } + + /** + @brief implicit conversion to the native cublas handle (cublasHandle_t) + */ + operator cublasHandle_t () const { + return _ptr->value; + } + + /** + @brief returns the number of shared owners + */ + long use_count() const { + return _ptr.use_count(); + } + + /** + @brief disabled copy constructor + */ + cublasScopedPerThreadHandle(const cublasScopedPerThreadHandle&) = delete; + + /** + @brief default move constructor + */ + cublasScopedPerThreadHandle(cublasScopedPerThreadHandle&&) = default; + + /** + @brief disabled copy assignment + */ + cublasScopedPerThreadHandle& operator = (const cublasScopedPerThreadHandle&) = delete; + + /** + @brief default move assignment + */ + cublasScopedPerThreadHandle& operator = (cublasScopedPerThreadHandle&&) = delete; + + private: + + std::shared_ptr _ptr; + +}; + + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/taskflow/cuda/cublas/cublas_helper.hpp b/taskflow/cuda/cublas/cublas_helper.hpp new file mode 100644 index 0000000..60fcdb9 --- /dev/null +++ b/taskflow/cuda/cublas/cublas_helper.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include "cublas_handle.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// global utility functions +// ---------------------------------------------------------------------------- +// find the tranposed op +template && std::is_same_v, void>* = nullptr +> +constexpr cublasOperation_t cublas_rtran(cublasOperation_t op) { + if(op != CUBLAS_OP_N && op != CUBLAS_OP_T) { + TF_THROW("invalid transposition op for floating data types"); + } + return (op == CUBLAS_OP_N) ? 
CUBLAS_OP_T : CUBLAS_OP_N; +} + +// find the transposed fill +constexpr cublasFillMode_t cublas_rfill(cublasFillMode_t uplo) { + switch(uplo) { + case CUBLAS_FILL_MODE_LOWER: return CUBLAS_FILL_MODE_UPPER; + case CUBLAS_FILL_MODE_UPPER: return CUBLAS_FILL_MODE_LOWER; + default: return uplo; + } +} + +// find the transposed side +constexpr cublasSideMode_t cublas_rside(cublasSideMode_t side) { + switch(side) { + case CUBLAS_SIDE_LEFT : return CUBLAS_SIDE_RIGHT; + case CUBLAS_SIDE_RIGHT: return CUBLAS_SIDE_LEFT; + default: return side; + } +} + +// ---------------------------------------------------------------------------- +// cublasFlowCapturer helper functions +// ---------------------------------------------------------------------------- + +// Function: vset +template , void>* +> +cudaTask cublasFlowCapturer::vset( + size_t n, const T* h, int inch, T* d, int incd +) { + return factory()->on([n, h, inch, d, incd] (cudaStream_t stream) mutable { + TF_CHECK_CUBLAS( + cublasSetVectorAsync(n, sizeof(T), h, inch, d, incd, stream), + "failed to run vset_async" + ); + }); +} + +// Function: vget +template , void>* +> +cudaTask cublasFlowCapturer::vget(size_t n, const T* d, int incd, T* h, int inch) { + return factory()->on([n, d, incd, h, inch] (cudaStream_t stream) mutable { + TF_CHECK_CUBLAS( + cublasGetVectorAsync(n, sizeof(T), d, incd, h, inch, stream), + "failed to run vget_async" + ); + }); +} + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/taskflow/cuda/cublas/cublas_level1.hpp b/taskflow/cuda/cublas/cublas_level1.hpp new file mode 100644 index 0000000..1fe1c02 --- /dev/null +++ b/taskflow/cuda/cublas/cublas_level1.hpp @@ -0,0 +1,200 @@ +#pragma once + +#include "cublas_handle.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// cublasFlowCapturere level-1 functions +// ---------------------------------------------------------------------------- + +// Function: amax +template +cudaTask cublasFlowCapturer::amax( + int n, const T* x, int incx, int* result +) { + return factory()->on([this, n, x, incx, result] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasIsamax(_handle, n, x, incx, result); + } + else if constexpr(std::is_same_v) { + stat = cublasIdamax(_handle, n, x, incx, result); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to run cublasamax"); + }); +} + +// Function: amin +template +cudaTask cublasFlowCapturer::amin( + int n, const T* x, int incx, int* result +) { + return factory()->on([this, n, x, incx, result] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasIsamin(_handle, n, x, incx, result); + } + else if constexpr(std::is_same_v) { + stat = cublasIdamin(_handle, n, x, incx, result); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + TF_CHECK_CUBLAS(stat, "failed to run cublasamin"); + }); +} + +// Function: asum +template +cudaTask cublasFlowCapturer::asum( + int n, const T* x, int incx, T* result +) { + return factory()->on([this, n, x, incx, result] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSasum(_handle, n, x, incx, result); + } + else if constexpr(std::is_same_v) { + stat = cublasDasum(_handle, n, x, incx, result); + } + else { + 
static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to run cublasasum"); + }); +} + +// Function: axpy +template +cudaTask cublasFlowCapturer::axpy( + int n, const T *alpha, const T *x, int incx, T *y, int incy +) { + return factory()->on([this, n, alpha, x, incx, y, incy] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSaxpy(_handle, n, alpha, x, incx, y, incy); + } + else if constexpr(std::is_same_v) { + stat = cublasDaxpy(_handle, n, alpha, x, incx, y, incy); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + TF_CHECK_CUBLAS(stat, "failed to run cublasaxpy"); + }); +} + +// Function: vcopy +template +cudaTask cublasFlowCapturer::vcopy( + int n, const T* x, int incx, T* y, int incy +) { + return factory()->on([this, n, x, incx, y, incy] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasScopy(_handle, n, x, incx, y, incy); + } + else if constexpr(std::is_same_v) { + stat = cublasDcopy(_handle, n, x, incx, y, incy); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to run cublascopy"); + }); +} + +// Function: dot +template +cudaTask cublasFlowCapturer::dot( + int n, const T* x, int incx, const T* y, int incy, T* result +) { + return factory()->on([this, n, x, incx, y, incy, result] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSdot(_handle, n, x, incx, y, incy, result); + } + else if constexpr(std::is_same_v) { + stat = cublasDdot(_handle, n, x, incx, y, incy, result); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + TF_CHECK_CUBLAS(stat, "failed to run cublasdot"); + }); +} + +template +cudaTask cublasFlowCapturer::nrm2(int n, const T* x, int incx, T* result) { + return factory()->on([this, n, x, incx, result] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSnrm2(_handle, n, x, incx, result); + } + else if constexpr(std::is_same_v) { + stat = cublasDnrm2(_handle, n, x, incx, result); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to run cublasnrm2"); + }); +} + +// Function: scal +template +cudaTask cublasFlowCapturer::scal(int n, const T* scalar, T* x, int incx) { + return factory()->on([this, n, scalar, x, incx] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSscal(_handle, n, scalar, x, incx); + } + else if constexpr(std::is_same_v) { + stat = cublasDscal(_handle, n, scalar, x, incx); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + TF_CHECK_CUBLAS(stat, "failed to run cublasscal"); + }); +} + +template +cudaTask cublasFlowCapturer::swap(int n, T* x, int incx, T* y, int incy) { + return factory()->on([this, n, x, incx, y, incy] (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSswap(_handle, n, x, incx, y, incy); + } + else if constexpr(std::is_same_v) { + stat = cublasDswap(_handle, n, x, incx, y, incy); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to run cublasswap"); + }); +} + +} // end of namespace tf 
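The level-1 routines above are meant to be captured inside a cudaFlow capturer rather than invoked directly. Below is a minimal sketch of how they compose, illustrative only and not part of this patch: it assumes tf::cublasFlowCapturer is obtained through make_capturer as a custom capturer, that the usual tf::Taskflow/tf::Executor driver is available, and that the buffer names (hx, hy, dx, dy, d_alpha) are placeholders. Because the per-thread handle defaults to CUBLAS_POINTER_MODE_DEVICE, the alpha scalar must live in device memory.

@code{.cpp}
// sketch: capture y = alpha*x + y with vset/axpy/vget (headers omitted)
int n = 1024;
std::vector<float> hx(n, 1.0f), hy(n, 2.0f);
float *dx, *dy, *d_alpha;
float alpha = 3.0f;
cudaMalloc(&dx, n * sizeof(float));
cudaMalloc(&dy, n * sizeof(float));
cudaMalloc(&d_alpha, sizeof(float));
cudaMemcpy(d_alpha, &alpha, sizeof(float), cudaMemcpyHostToDevice);

tf::Executor executor;
tf::Taskflow taskflow;

taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
  // create a cublas capturer whose lifetime is managed by the cudaFlow capturer
  auto blas = capturer.make_capturer<tf::cublasFlowCapturer>();

  // host-to-device copies of x and y
  tf::cudaTask set_x = blas->vset(n, hx.data(), 1, dx, 1);
  tf::cudaTask set_y = blas->vset(n, hy.data(), 1, dy, 1);

  // y = alpha*x + y (d_alpha is a device pointer: device pointer mode)
  tf::cudaTask axpy = blas->axpy(n, d_alpha, dx, 1, dy, 1);

  // device-to-host copy of the result
  tf::cudaTask get_y = blas->vget(n, dy, 1, hy.data(), 1);

  set_x.precede(axpy);
  set_y.precede(axpy);
  axpy.precede(get_y);
});

executor.run(taskflow).wait();
@endcode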
----------------------------------------------------- + + + diff --git a/taskflow/cuda/cublas/cublas_level2.hpp b/taskflow/cuda/cublas/cublas_level2.hpp new file mode 100644 index 0000000..f1d13e7 --- /dev/null +++ b/taskflow/cuda/cublas/cublas_level2.hpp @@ -0,0 +1,286 @@ +#pragma once + +#include "cublas_handle.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// cublasFlowCapturere level-2 functions +// ---------------------------------------------------------------------------- + +template +cudaTask cublasFlowCapturer::gemv( + cublasOperation_t trans, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy +) { + return factory()->on([this, trans, m, n, alpha, A, lda, x, incx, beta, y, incy] + (cudaStream_t stream) mutable { + _stream(stream); + + cublasStatus_t stat; + + if constexpr(std::is_same_v) { + stat = cublasSgemv(_handle, + trans, m, n, alpha, A, lda, x, incx, beta, y, incy + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDgemv(_handle, + trans, m, n, alpha, A, lda, x, incx, beta, y, incy + ); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to capture gemv"); + }); +} + +// gemv +template +cudaTask cublasFlowCapturer::c_gemv( + cublasOperation_t trans, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy +) { + return gemv( + cublas_rtran(trans), n, m, alpha, A, lda, x, incx, beta, y, incy + ); +} + +// trmv +template +cudaTask cublasFlowCapturer::trmv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx +) { + return factory()->on([this, uplo, tran, diag, n, A, lda, x, incx] + (cudaStream_t stream) mutable { + + _stream(stream); + + cublasStatus_t stat; + + if constexpr(std::is_same_v) { + stat = cublasStrmv(_handle, uplo, tran, diag, n, A, lda, x, incx); + } + else if constexpr(std::is_same_v) { + stat = cublasDtrmv(_handle, uplo, tran, diag, n, A, lda, x, incx); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to capture trmv"); + }); +} + +// c_trmv +template +cudaTask cublasFlowCapturer::c_trmv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx +) { + return trmv( + cublas_rfill(uplo), cublas_rtran(tran), diag, n, A, lda, x, incx + ); +} + +// trsv +template +cudaTask cublasFlowCapturer::trsv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx +) { + return factory()->on([this, uplo, tran, diag, n, A, lda, x, incx] + (cudaStream_t stream) mutable { + + _stream(stream); + + cublasStatus_t stat; + + if constexpr(std::is_same_v) { + stat = cublasStrsv(_handle, uplo, tran, diag, n, A, lda, x, incx); + } + else if constexpr(std::is_same_v) { + stat = cublasDtrsv(_handle, uplo, tran, diag, n, A, lda, x, incx); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to capture trsv"); + }); +} + +// c_trsv +template +cudaTask cublasFlowCapturer::c_trsv( + cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int n, const T* A, int lda, + T *x, int incx +) { + return trsv( + cublas_rfill(uplo), cublas_rtran(tran), diag, n, A, lda, x, incx + ); +} + +// symv +template +cudaTask 
cublasFlowCapturer::symv( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy +) { + return factory()->on([this, uplo, n, alpha, A, lda, x, incx, beta, y, incy] + (cudaStream_t stream) mutable { + + _stream(stream); + + cublasStatus_t stat; + + if constexpr(std::is_same_v) { + stat = cublasSsymv(_handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); + } + else if constexpr(std::is_same_v) { + stat = cublasDsymv(_handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to capture symv"); + }); +} + +// c_symv +template +cudaTask cublasFlowCapturer::c_symv( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy +) { + return symv( + cublas_rfill(uplo), n, alpha, A, lda, x, incx, beta, y, incy + ); +} + +// syr +template +cudaTask cublasFlowCapturer::syr( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + T *A, int lda +) { + + return factory()->on([this, uplo, n, alpha, x, incx, A, lda] + (cudaStream_t stream) mutable { + + _stream(stream); + + cublasStatus_t stat; + + if constexpr(std::is_same_v) { + stat = cublasSsyr(_handle, uplo, n, alpha, x, incx, A, lda); + } + else if constexpr(std::is_same_v) { + stat = cublasDsyr(_handle, uplo, n, alpha, x, incx, A, lda); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to capture syr"); + }); +} + +// c_syr +template +cudaTask cublasFlowCapturer::c_syr( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + T *A, int lda +) { + return syr( + cublas_rfill(uplo), n, alpha, x, incx, A, lda + ); +} + +// syr2 +template +cudaTask cublasFlowCapturer::syr2( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + const T *y, int incy, + T *A, int lda +) { + + return factory()->on([this, uplo, n, alpha, x, incx, y, incy, A, lda] + (cudaStream_t stream) mutable { + + _stream(stream); + + cublasStatus_t stat; + + if constexpr(std::is_same_v) { + stat = cublasSsyr2(_handle, uplo, n, alpha, x, incx, y, incy, A, lda); + } + else if constexpr(std::is_same_v) { + stat = cublasDsyr2(_handle, uplo, n, alpha, x, incx, y, incy, A, lda); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + + TF_CHECK_CUBLAS(stat, "failed to capture syr2"); + }); +} + +// c_syr2 +template +cudaTask cublasFlowCapturer::c_syr2( + cublasFillMode_t uplo, + int n, + const T *alpha, + const T *x, int incx, + const T *y, int incy, + T *A, int lda +) { + return syr2( + cublas_rfill(uplo), n, alpha, x, incx, y, incy, A, lda + ); +} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/cuda/cublas/cublas_level3.hpp b/taskflow/cuda/cublas/cublas_level3.hpp new file mode 100644 index 0000000..ad10a8c --- /dev/null +++ b/taskflow/cuda/cublas/cublas_level3.hpp @@ -0,0 +1,489 @@ +#pragma once + +#include "cublas_handle.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// cublasFlowCapturere level-3 functions +// ---------------------------------------------------------------------------- + +// Function: geam +template +cudaTask cublasFlowCapturer::geam( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, + const T *alpha, + const T *A, int lda, + 
const T *beta, + const T *B, int ldb, + T *C, int ldc +) { + return factory()->on([this, ta, tb, m, n, alpha, A, lda, beta, B, ldb, C, ldc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSgeam(_handle, + ta, tb, m, n, alpha, A, lda, beta, B, ldb, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDgeam(_handle, + ta, tb, m, n, alpha, A, lda, beta, B, ldb, C, ldc + ); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + TF_CHECK_CUBLAS(stat, "failed to run geam"); + }); +} + +// Function: c_geam +template +cudaTask cublasFlowCapturer::c_geam( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *beta, + const T *B, int ldb, + T *C, int ldc +) { + return geam( + ta, tb, n, m, alpha, A, lda, beta, B, ldb, C, ldc + ); +} + +// Function: gemm +template +cudaTask cublasFlowCapturer::gemm( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return factory()->on([this, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSgemm(_handle, + ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDgemm(_handle, + ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else { + static_assert(dependent_false_v, "unknown cublas data type"); + } + TF_CHECK_CUBLAS(stat, "failed to run gemm"); + }); +} + +template +cudaTask cublasFlowCapturer::c_gemm( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return gemm( + tb, ta, n, m, k, alpha, B, ldb, A, lda, beta, C, ldc + ); +} + +// Function: gemm_batched +template +cudaTask cublasFlowCapturer::gemm_batched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A[], int lda, + const T *B[], int ldb, + const T *beta, + T *C[], int ldc, + int bc +) { + return factory()->on([this, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, bc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSgemmBatched(_handle, + ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, bc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDgemmBatched(_handle, + ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, bc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run gemm_batched"); + }); +} + +// Function: c_gemm_batched +template +cudaTask cublasFlowCapturer::c_gemm_batched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A[], int lda, + const T *B[], int ldb, + const T *beta, + T *C[], int ldc, + int bc +) { + return gemm_batched( + tb, ta, n, m, k, alpha, B, ldb, A, lda, beta, C, ldc, bc + ); +} + +// Function: gemm_sbatched (strided) +template +cudaTask cublasFlowCapturer::gemm_sbatched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, long long int sA, + const T *B, int ldb, long long int sB, + const T *beta, + T *C, int ldc, long long int sC, + int bc +) { + return 
factory()->on([this, ta, tb, m, n, k, alpha, A, lda, sA, B, ldb, sB, beta, C, ldc, sC, bc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSgemmStridedBatched(_handle, + ta, tb, m, n, k, alpha, A, lda, sA, B, ldb, sB, beta, C, ldc, sC, bc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDgemmStridedBatched(_handle, + ta, tb, m, n, k, alpha, A, lda, sA, B, ldb, sB, beta, C, ldc, sC, bc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run gemm_sbatched"); + }); +} + +// Function: c_gemm_sbatched (strided) +template +cudaTask cublasFlowCapturer::c_gemm_sbatched( + cublasOperation_t ta, cublasOperation_t tb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, long long int sA, + const T *B, int ldb, long long int sB, + const T *beta, + T *C, int ldc, long long int sC, + int bc +){ + return gemm_sbatched( + tb, ta, n, m, k, alpha, B, ldb, sB, A, lda, sA, beta, C, ldc, sC, bc + ); +} + +// symm +template +cudaTask cublasFlowCapturer::symm( + cublasSideMode_t side, cublasFillMode_t uplo, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return factory()->on( + [this, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSsymm(_handle, + side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDsymm(_handle, + side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run symm"); + }); +} + +// c_symm +template +cudaTask cublasFlowCapturer::c_symm( + cublasSideMode_t side, cublasFillMode_t uplo, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return symm( + cublas_rside(side), cublas_rfill(uplo), + n, m, alpha, A, lda, B, ldb, beta, C, ldc + ); +} + +// syrk +template +cudaTask cublasFlowCapturer::syrk( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *beta, + T *C, int ldc +) { + return factory()->on( + [this, uplo, tran, n, k, alpha, A, lda, beta, C, ldc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSsyrk(_handle, + uplo, tran, n, k, alpha, A, lda, beta, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDsyrk(_handle, + uplo, tran, n, k, alpha, A, lda, beta, C, ldc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run syrk"); + }); +} + +// c_syrk +template +cudaTask cublasFlowCapturer::c_syrk( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *beta, + T *C, int ldc +) { + return syrk( + cublas_rfill(uplo), cublas_rtran(tran), + n, k, alpha, A, lda, beta, C, ldc + ); +} + +// syr2k +template +cudaTask cublasFlowCapturer::syr2k( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return factory()->on( + [this, uplo, tran, n, k, alpha, A, lda, B, ldb, beta, C, ldc] + (cudaStream_t stream) mutable { + 
_stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSsyr2k(_handle, + uplo, tran, n, k, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDsyr2k(_handle, + uplo, tran, n, k, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run syr2k"); + }); +} + +// c_syr2k +template +cudaTask cublasFlowCapturer::c_syr2k( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return syr2k( + cublas_rfill(uplo), cublas_rtran(tran), + n, k, alpha, B, ldb, A, lda, beta, C, ldc + ); +} + +// syrkx +template +cudaTask cublasFlowCapturer::syrkx( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return factory()->on( + [this, uplo, tran, n, k, alpha, A, lda, B, ldb, beta, C, ldc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasSsyrkx(_handle, + uplo, tran, n, k, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDsyrkx(_handle, + uplo, tran, n, k, alpha, A, lda, B, ldb, beta, C, ldc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run syrkx"); + }); +} + +// c_syrkx +template +cudaTask cublasFlowCapturer::c_syrkx( + cublasFillMode_t uplo, cublasOperation_t tran, + int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc +) { + return syrkx( + cublas_rfill(uplo), cublas_rtran(tran), + n, k, alpha, B, ldb, A, lda, beta, C, ldc + ); +} + +// trmm +template +cudaTask cublasFlowCapturer::trmm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + T *C, int ldc +) { + + return factory()->on( + [this, side, uplo, tran, diag, m, n, alpha, A, lda, B, ldb, C, ldc] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasStrmm(_handle, + side, uplo, tran, diag, m, n, alpha, A, lda, B, ldb, C, ldc + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDtrmm(_handle, + side, uplo, tran, diag, m, n, alpha, A, lda, B, ldb, C, ldc + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run trmm"); + }); +} + +// c_trmm +template +cudaTask cublasFlowCapturer::c_trmm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + T *C, int ldc +) { + return trmm( + cublas_rside(side), cublas_rfill(uplo), tran, diag, + n, m, alpha, A, lda, B, ldb, C, ldc + ); +} + +// trsm +template +cudaTask cublasFlowCapturer::trsm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + T *B, int ldb +) { + + return factory()->on( + [this, side, uplo, tran, diag, m, n, alpha, A, lda, B, ldb] + (cudaStream_t stream) mutable { + _stream(stream); + cublasStatus_t stat; + if constexpr(std::is_same_v) { + stat = cublasStrsm(_handle, + side, 
uplo, tran, diag, m, n, alpha, A, lda, B, ldb + ); + } + else if constexpr(std::is_same_v) { + stat = cublasDtrsm(_handle, + side, uplo, tran, diag, m, n, alpha, A, lda, B, ldb + ); + } + else static_assert(dependent_false_v, "unknown cublas data type"); + TF_CHECK_CUBLAS(stat, "failed to run trsm"); + }); +} + +// c_trsm +template +cudaTask cublasFlowCapturer::c_trsm( + cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t tran, cublasDiagType_t diag, + int m, int n, + const T *alpha, + const T *A, int lda, + T *B, int ldb +) { + return trsm( + cublas_rside(side), cublas_rfill(uplo), tran, diag, + n, m, alpha, A, lda, B, ldb + ); +} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/cuda/cuda_algorithm/cuda_blaf.hpp b/taskflow/cuda/cuda_algorithm/cuda_blaf.hpp new file mode 100644 index 0000000..de2c008 --- /dev/null +++ b/taskflow/cuda/cuda_algorithm/cuda_blaf.hpp @@ -0,0 +1,148 @@ +#pragma once + +#include "cuda_transpose.hpp" +#include "cuda_matmul.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaBLAF definition +// ---------------------------------------------------------------------------- + +/** +@brief basic linear algebra flow on top of cudaFlow +*/ +class cudaBLAF { + + public: + + /** + @brief constructs a blas builder object + + @param cudaflow a cudaflow object + */ + cudaBLAF(cudaFlow& cudaflow); + + /** + @brief transposes a two-dimenstional matrix + + @tparam T data type + @param d_in pointer to the source matrix + @param d_out pointer to the target matrix + @param rows number of rows in the source matrix + @param cols number of columns in the source matrix + + @return cudaTask handle + */ + template + cudaTask transpose(const T* d_in, T* d_out, size_t rows, size_t cols); + + template + cudaTask matmul(const T* A, const T* B, T* C, size_t M, size_t K, size_t N); + + // ------------------------------------------------------------------------ + // update APIs + // ------------------------------------------------------------------------ + template + void update_transpose(cudaTask ct, const T* d_in, T* d_out, size_t rows, size_t cols); + + template + void update_matmul(cudaTask ct, const T* A, const T* B, T* C, size_t M, size_t K, size_t N); + + private: + + cudaFlow& _cf; +}; + +// Constructor +inline cudaBLAF::cudaBLAF(cudaFlow& cf) : _cf{cf} { +} + +// Function: row-wise matrix transpose +template +cudaTask cudaBLAF::transpose(const T* d_in, T* d_out, size_t rows, size_t cols) { + + //TODO: throw invalid parameters (e.x. grid_dimx = 0) + + size_t grid_dimx = (cols + 31) / 32; + size_t grid_dimy = (rows + 31) / 32; + + return _cf.kernel( + dim3(grid_dimx, grid_dimy, 1), + dim3(32, 8, 1), + 0, + cuda_transpose, + d_in, + d_out, + rows, + cols + ); + +} + +// Function: row-major matrix transpose +template +cudaTask cudaBLAF::matmul(const T* A, const T* B, T* C, size_t M, size_t K, size_t N) { + + size_t grid_dimx = (N + 31) / 32; + size_t grid_dimy = (M + 31) / 32; + + //TODO: throw invalid parameters (e.x. 
grid_dimx = 0) + return _cf.kernel( + dim3(grid_dimx, grid_dimy, 1), + dim3(32, 32, 1), + 0, + cuda_matmul, + A, + B, + C, + M, + K, + N + ); +} + +// ------------------------------------------------------------------------ +// update APIs +// ------------------------------------------------------------------------ +template +void cudaBLAF::update_transpose(cudaTask ct, const T* d_in, T* d_out, size_t rows, size_t cols) { + size_t grid_dimx = (cols + 31) / 32; + size_t grid_dimy = (rows + 31) / 32; + + + _cf.update_kernel( + ct, + dim3(grid_dimx, grid_dimy, 1), + dim3(32, 8, 1), + 0, + d_in, + d_out, + rows, + cols + ); +} + +template +void cudaBLAF::update_matmul(cudaTask ct, const T* A, const T* B, T* C, size_t M, size_t K, size_t N) { + size_t grid_dimx = (N + 31) / 32; + size_t grid_dimy = (M + 31) / 32; + + _cf.update_kernel( + ct, + dim3(grid_dimx, grid_dimy, 1), + dim3(32, 32, 1), + 0, + A, + B, + C, + M, + K, + N + ); +} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/taskflow/cuda/cuda_algorithm/cuda_for_each.hpp b/taskflow/cuda/cuda_algorithm/cuda_for_each.hpp new file mode 100644 index 0000000..5414f94 --- /dev/null +++ b/taskflow/cuda/cuda_algorithm/cuda_for_each.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "../cuda_error.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// single_task +// ---------------------------------------------------------------------------- + +// Kernel: single_task +template +__global__ void cuda_single_task(C callable) { + callable(); +} + +// ---------------------------------------------------------------------------- +// for_each +// ---------------------------------------------------------------------------- + +// Kernel: for_each +template +__global__ void cuda_for_each(I first, size_t N, F op) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if (i < N) { + op(*(first+i)); + } +} + +// ---------------------------------------------------------------------------- +// for_each_index +// ---------------------------------------------------------------------------- + +// Kernel: for_each_index +template +__global__ void cuda_for_each_index(I beg, I inc, size_t N, F op) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if (i < N) { + op(static_cast(i)*inc + beg); + } +} + + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/taskflow/cuda/cuda_algorithm/cuda_matmul.hpp b/taskflow/cuda/cuda_algorithm/cuda_matmul.hpp new file mode 100644 index 0000000..350c105 --- /dev/null +++ b/taskflow/cuda/cuda_algorithm/cuda_matmul.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "../cuda_error.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// row-major matrix multiplication +// ---------------------------------------------------------------------------- + +template +__global__ void cuda_matmul( + const T* A, + const T* B, + T* C, + size_t M, + size_t K, + size_t N +) { + __shared__ T A_tile[32][32]; + __shared__ T B_tile[32][32]; + + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + T res = 0; + + for(size_t k = 0; k < K; k += 32) { + if((threadIdx.x + k) < K && y < M) { + A_tile[threadIdx.y][threadIdx.x] = A[y * K + threadIdx.x + k]; + } + else{ + A_tile[threadIdx.y][threadIdx.x] = 0; + } + + if((threadIdx.y + k) < K && x < N) { + B_tile[threadIdx.y][threadIdx.x] = B[(threadIdx.y + 
k) * N + x]; + } + else{ + B_tile[threadIdx.y][threadIdx.x] = 0; + } + + __syncthreads(); + + for(size_t i = 0; i < 32; ++i) { + res += A_tile[threadIdx.y][i] * B_tile[i][threadIdx.x]; + } + __syncthreads(); + } + + if(x < N && y < M) { + C[y * N + x] = res; + } + +} + +} // end of namespace tf --------------------------------------------------------- diff --git a/taskflow/cuda/cuda_algorithm/cuda_reduce.hpp b/taskflow/cuda/cuda_algorithm/cuda_reduce.hpp new file mode 100644 index 0000000..8b14c69 --- /dev/null +++ b/taskflow/cuda/cuda_algorithm/cuda_reduce.hpp @@ -0,0 +1,114 @@ +#pragma once + +#include "../cuda_error.hpp" + +namespace tf { + +template +__device__ void cuda_warp_reduce( + volatile T* shm, size_t N, size_t tid, C op +) { + if(tid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]); + if(tid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]); + if(tid + 8 < N) shm[tid] = op(shm[tid], shm[tid+8]); + if(tid + 4 < N) shm[tid] = op(shm[tid], shm[tid+4]); + if(tid + 2 < N) shm[tid] = op(shm[tid], shm[tid+2]); + if(tid + 1 < N) shm[tid] = op(shm[tid], shm[tid+1]); +} + +// Kernel: cuda_reduce +// This reduction kernel assums only one block to avoid extra output memory. +template +__global__ void cuda_reduce(I first, size_t N, T* res, C op) { + + cudaSharedMemory shared_memory; + T* shm = shared_memory.get(); + + size_t tid = threadIdx.x; + + if(tid >= N) { + return; + } + + shm[tid] = *(first+tid); + + for(size_t i=tid+blockDim.x; i 32; s >>= 1) { + if(tid < s && tid + s < N) { + shm[tid] = op(shm[tid], shm[tid+s]); + } + __syncthreads(); + } + + if(tid < 32) { + cuda_warp_reduce(shm, N, tid, op); + } + + if(tid == 0) { + if constexpr (uninitialized) { + *res = shm[0]; + } + else { + *res = op(*res, shm[0]); + } + } +} + +//template +//__device__ void cuda_warp_reduce( +// volatile int* shm, size_t N, size_t tid, size_t gid, C op +//) { +// if(gid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]); +// if(gid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]); +// if(gid + 8 < N) shm[tid] = op(shm[tid], shm[tid+8]); +// if(gid + 4 < N) shm[tid] = op(shm[tid], shm[tid+4]); +// if(gid + 2 < N) shm[tid] = op(shm[tid], shm[tid+2]); +// if(gid + 1 < N) shm[tid] = op(shm[tid], shm[tid+1]); +//} +// +//template +//__global__ void cuda_reduce(int* din, int* dout, size_t N, C op) { +// +// extern __shared__ int shm[]; +// +// size_t tid = threadIdx.x; +// size_t gid = threadIdx.x + blockIdx.x * (blockDim.x); +// size_t gsd = blockDim.x * gridDim.x; +// +// if(gid >= N) { +// return; +// } +// +// //printf("%lu %lu %lu\n", tid, gid, gsd); +// +// shm[tid] = din[gid]; +// +// for(size_t nxt = gid + gsd; nxt < N; nxt += gsd) { +// shm[tid] = op(shm[tid], din[nxt]); +// } +// +// __syncthreads(); +// +// for(size_t s = blockDim.x / 2; s > 32; s >>= 1) { +// if(tid < s && gid + s < N) { +// shm[tid] = op(shm[tid], shm[tid+s]); +// } +// __syncthreads(); +// } +// +// if(tid < 32) { +// warp_reduce(shm, N, tid, gid, op); +// } +// +// if(tid == 0){ +// dout[blockIdx.x] = shm[0]; +// } +//} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/cuda/cuda_algorithm/cuda_transform.hpp b/taskflow/cuda/cuda_algorithm/cuda_transform.hpp new file mode 100644 index 0000000..62b4f9a --- /dev/null +++ b/taskflow/cuda/cuda_algorithm/cuda_transform.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "../cuda_error.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// transform +// 
---------------------------------------------------------------------------- + +// Kernel: for_each +template +__global__ void cuda_transform(I first, size_t N, F op, S... srcs) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if (i < N) { + //data[i] = op(src[i]...); + *(first + i) = op((*(srcs+i))...); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/taskflow/cuda/cuda_algorithm/cuda_transpose.hpp b/taskflow/cuda/cuda_algorithm/cuda_transpose.hpp new file mode 100644 index 0000000..999d9de --- /dev/null +++ b/taskflow/cuda/cuda_algorithm/cuda_transpose.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include "../cuda_error.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// row-wise matrix transpose +// ---------------------------------------------------------------------------- +// +template +__global__ void cuda_transpose( + const T* d_in, + T* d_out, + size_t rows, + size_t cols +) { + __shared__ T tile[32][32]; + size_t x = blockIdx.x * 32 + threadIdx.x; + size_t y = blockIdx.y * 32 + threadIdx.y; + + for(size_t i = 0; i < 32; i += 8) { + if(x < cols && (y + i) < rows) { + tile[threadIdx.y + i][threadIdx.x] = d_in[(y + i) * cols + x]; + } + } + + __syncthreads(); + + x = blockIdx.y * 32 + threadIdx.x; + y = blockIdx.x * 32 + threadIdx.y; + + for(size_t i = 0; i < 32; i += 8) { + if(x < rows && (y + i) < cols) { + d_out[(y + i) * rows + x] = tile[threadIdx.x][threadIdx.y + i]; + } + } +} + +} // end of namespace -------------------------------------------------------- + diff --git a/taskflow/cuda/cuda_capturer.hpp b/taskflow/cuda/cuda_capturer.hpp new file mode 100644 index 0000000..8dbc6f2 --- /dev/null +++ b/taskflow/cuda/cuda_capturer.hpp @@ -0,0 +1,844 @@ +#pragma once + +#include "cuda_task.hpp" +#include "cuda_algorithm/cuda_for_each.hpp" +#include "cuda_algorithm/cuda_transform.hpp" +#include "cuda_algorithm/cuda_reduce.hpp" +#include "cuda_optimizer.hpp" + +/** +@file cuda_capturer.hpp +@brief %cudaFlow capturer include file +*/ + +namespace tf { + +/** +@brief queries the maximum threads allowed per block +*/ +constexpr size_t cuda_default_max_threads_per_block() { + return 512; +} + +/** +@brief queries the default number of threads per block in an 1D vector of N elements +*/ +constexpr size_t cuda_default_threads_per_block(size_t N) { + // TODO: special case when N == 0? 
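+  // for example, N = 1000 yields std::min(512, next_pow2(1000)) = std::min(512, 1024)
+  // = 512 threads per block, while any N in [1, 32] maps to a single warp of 32 threads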
+ if(N <= 32) return 32; + else { + return std::min(cuda_default_max_threads_per_block(), next_pow2(N)); + } +} + +// ---------------------------------------------------------------------------- +// class definition: cudaFlowCapturerBase +// ---------------------------------------------------------------------------- + +/** +@class cudaFlowCapturerBase + +@brief base class to construct a CUDA task graph through stream capture +*/ +class cudaFlowCapturerBase { + + friend class cudaFlowCapturer; + + public: + + /** + @brief default constructor + */ + cudaFlowCapturerBase() = default; + + /** + @brief default virtual destructor + */ + virtual ~cudaFlowCapturerBase() = default; + + /** + @brief accesses the parent capturer + */ + cudaFlowCapturer* factory() const; + + private: + + cudaFlowCapturer* _factory {nullptr}; +}; + +// Function: accesses the parent capturer +inline cudaFlowCapturer* cudaFlowCapturerBase::factory() const { + return _factory; +} + +// ---------------------------------------------------------------------------- +// class definition: cudaFlowCapturer +// ---------------------------------------------------------------------------- + +/** +@class cudaFlowCapturer + +@brief class for building a CUDA task dependency graph through stream capture + +A %cudaFlowCapturer inherits all the base methods from tf::cudaFlowCapturerBase +to construct a CUDA task graph through stream capturer. +This class also defines a factory interface tf::cudaFlowCapturer::make_capturer +for users to create custom capturers with their lifetimes managed by the factory. + +The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can +call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous +CUDA operations through the given stream. +The following example creates a CUDA graph that captures two kernel tasks, +@c task_1 and @c task_2, where @c task_1 runs before @c task_2. + +@code{.cpp} +taskflow.emplace([](tf::cudaFlowCapturer& capturer){ + + // capture my_kernel_1 through the given stream managed by the capturer + auto task_1 = capturer.on([&](cudaStream_t stream){ + my_kernel_1<<>>(my_parameters_1); + }); + + // capture my_kernel_2 through the given stream managed by the capturer + auto task_2 = capturer.on([&](cudaStream_t stream){ + my_kernel_2<<>>(my_parameters_2); + }); + + task_1.precede(task_2); +}); +@endcode + +Similar to tf::cudaFlow, a %cudaFlowCapturer is a task (tf::Task) +created from tf::Taskflow +and will be run by @em one worker thread in the executor. +That is, the callable that describes a %cudaFlowCapturer +will be executed sequentially. +Inside a %cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run +in parallel scheduled by both our capturing algorithm and the CUDA runtime. + +Please refer to @ref GPUTaskingcudaFlowCapturer for details. +*/ +class cudaFlowCapturer { + + friend class cudaFlow; + friend class Executor; + + struct External { + cudaGraph graph; + }; + + struct Internal { + }; + + using handle_t = std::variant; + + using Optimizer = std::variant< + cudaSequentialCapturing, + cudaRoundRobinCapturing + //cudaGreedyCapturing + >; + + public: + + /** + @brief constrcts a standalone cudaFlowCapturer + + A standalone %cudaFlow capturer does not go through any taskflow and + can be run by the caller thread using explicit offload methods + (e.g., tf::cudaFlow::offload). 
+ */ + cudaFlowCapturer(); + + /** + @brief destructs the cudaFlowCapturer + */ + virtual ~cudaFlowCapturer(); + + /** + @brief queries the emptiness of the graph + */ + bool empty() const; + + /** + @brief dumps the capture graph into a DOT format through an + output stream + */ + void dump(std::ostream& os) const; + + /** + @brief creates a custom capturer derived from tf::cudaFlowCapturerBase + + @tparam T custom capturer type + @tparam ArgsT arguments types + + @param args arguments to forward to construct the custom capturer + + @return a pointer to the custom capturer + + Each %cudaFlow capturer keeps a list of custom capturers + and manages their lifetimes. The lifetime of each custom capturer is + the same as the capturer. + */ + template + T* make_capturer(ArgsT&&... args); + + /** + @brief enables different optimization algorithms + + @tparam OPT optimizer type + @tparam ArgsT arguments types + + @param args arguments to forward to construct the optimizer + + @return a reference to the optimizer + + We currently supports the following optimization algorithms to capture + a user-described %cudaFlow: + + tf::cudaSequentialCapturing + + tf::cudaRoundRobinCapturing + + */ + template + OPT& make_optimizer(ArgsT&&... args); + + // ------------------------------------------------------------------------ + // + // ------------------------------------------------------------------------ + + /** + @brief captures a sequential CUDA operations from the given callable + + @tparam C callable type constructible with @c std::function + @param callable a callable to capture CUDA operations with the stream + + This methods applies a stream created by the flow to capture + a sequence of CUDA operations defined in the callable. + */ + template , void>* = nullptr + > + cudaTask on(C&& callable); + + /** + @brief copies data between host and device asynchronously through a stream + + @param dst destination memory address + @param src source memory address + @param count size in bytes to copy + + The method captures a @c cudaMemcpyAsync operation through an + internal stream. + */ + cudaTask memcpy(void* dst, const void* src, size_t count); + + /** + @brief captures a copy task of typed data + + @tparam T element type (non-void) + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param num number of elements to copy + + @return cudaTask handle + + A copy task transfers num*sizeof(T) bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. + */ + template , void>* = nullptr + > + cudaTask copy(T* tgt, const T* src, size_t num); + + /** + @brief initializes or sets GPU memory to the given value byte by byte + + @param ptr pointer to GPU mempry + @param v value to set for each byte of the specified memory + @param n size in bytes to set + + The method captures a @c cudaMemsetAsync operation through an + internal stream to fill the first @c count bytes of the memory area + pointed to by @c devPtr with the constant byte value @c value. + */ + cudaTask memset(void* ptr, int v, size_t n); + + /** + @brief captures a kernel + + @tparam F kernel function type + @tparam ArgsT kernel function parameters type + + @param g configured grid + @param b configured block + @param s configured shared memory size in bytes + @param f kernel function + @param args arguments to forward to the kernel function by copy + + @return cudaTask handle + */ + template + cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... 
args); + + // ------------------------------------------------------------------------ + // generic algorithms + // ------------------------------------------------------------------------ + + /** + @brief capturers a kernel to runs the given callable with only one thread + + @tparam C callable type + + @param callable callable to run by a single kernel thread + */ + template + cudaTask single_task(C&& callable); + + /** + @brief captures a kernel that applies a callable to each dereferenced element + of the data array + + @tparam I iterator type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + for(auto itr = first; itr != last; i++) { + callable(*itr); + } + @endcode + */ + template + cudaTask for_each(I first, I last, C&& callable); + + /** + @brief captures a kernel that applies a callable to each index in the range + with the step size + + @tparam I index type + @tparam C callable type + + @param first beginning index + @param last last index + @param step step size + @param callable the callable to apply to each element in the data array + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + // step is positive [first, last) + for(auto i=first; ilast; i+=step) { + callable(i); + } + @endcode + */ + template + cudaTask for_each_index(I first, I last, I step, C&& callable); + + /** + @brief captures a kernel that applies a callable to a source range and + stores the result in a target range + + @tparam I iterator type + @tparam C callable type + @tparam S source types + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable the callable to apply to each element in the range + @param srcs iterators to the source ranges + + @return cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + while (first != last) { + *first++ = callable(*src1++, *src2++, *src3++, ...); + } + @endcode + */ + template + cudaTask transform(I first, I last, C&& callable, S... 
srcs); + + /** + @brief captures a kernel that performs parallel reduction over a range of items + + @tparam I input iterator type + @tparam T value type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param result pointer to the result with an initialized value + @param op binary reduction operator + + @return a tf::cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + while (first != last) { + *result = op(*result, *first++); + } + @endcode + */ + template + cudaTask reduce(I first, I last, T* result, C&& op); + + /** + @brief similar to tf::cudaFlowCapturerBase::reduce but does not assum + any initial value to reduce + + This method is equivalent to the parallel execution of the following loop + on a GPU: + + @code{.cpp} + *result = *first++; // no initial values partitipcate in the loop + while (first != last) { + *result = op(*result, *first++); + } + @endcode + */ + template + cudaTask uninitialized_reduce(I first, I last, T* result, C&& op); + + // ------------------------------------------------------------------------ + // rebind methods to update captured tasks + // ------------------------------------------------------------------------ + + /** + @brief rebinds a capture task to another sequential CUDA operations + + The method is similar to cudaFlowCapturerBase::on but with an additional + argument on a previously created capture task. + */ + template , void>* = nullptr + > + cudaTask rebind_on(cudaTask task, C&& callable); + + /** + @brief rebinds a capture task to a memcpy operation + + The method is similar to cudaFlowCapturerBase::memcpy but with an additional + argument on a previously created ceapture task. + */ + cudaTask rebind_memcpy(cudaTask task, void* dst, const void* src, size_t count); + + /** + @brief rebinds a capture task to a copy operation + + The method is similar to cudaFlowCapturerBase::copy but with an additional + argument on a previously created ceapture task. + */ + template , void>* = nullptr + > + cudaTask rebind_copy(cudaTask task, T* tgt, const T* src, size_t num); + + /** + @brief rebinds a capture task to a memset operation + + The method is similar to cudaFlowCapturerBase::memset but with an additional + argument on a previously created ceapture task. + */ + cudaTask rebind_memset(cudaTask task, void* ptr, int value, size_t n); + + /** + @brief rebinds a capture task to a kernel operation + + The method is similar to cudaFlowCapturerBase::kernel but with an additional + argument on a previously created ceapture task. + */ + template + cudaTask rebind_kernel(cudaTask task, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args); + + // ------------------------------------------------------------------------ + // offload methods + // ------------------------------------------------------------------------ + + /** + @brief offloads the captured %cudaFlow onto a GPU and repeatedly runs it until + the predicate becomes true + + @tparam P predicate type (a binary callable) + + @param predicate a binary predicate (returns @c true for stop) + + Immediately offloads the %cudaFlow captured so far onto a GPU and + repeatedly runs it until the predicate returns @c true. + + By default, if users do not offload the %cudaFlow capturer, + the executor will offload it once. 
+ */ + template + void offload_until(P&& predicate); + + /** + @brief offloads the captured %cudaFlow and executes it by the given times + + @param n number of executions + */ + void offload_n(size_t n); + + /** + @brief offloads the captured %cudaFlow and executes it once + */ + void offload(); + + private: + + handle_t _handle; + + cudaGraph& _graph; + + Optimizer _optimizer; + + cudaGraphExec_t _executable {nullptr}; + + std::vector> _capturers; + + cudaFlowCapturer(cudaGraph&); + + cudaGraph_t _capture(); + + void _destroy_executable(); +}; + +// constructs a cudaFlow capturer from a taskflow +inline cudaFlowCapturer::cudaFlowCapturer(cudaGraph& g) : + _handle{std::in_place_type_t{}}, + _graph {g} { +} + +// constructs a standalone cudaFlow capturer +inline cudaFlowCapturer::cudaFlowCapturer() : + _handle{std::in_place_type_t{}}, + _graph {std::get(_handle).graph} { +} + +inline cudaFlowCapturer::~cudaFlowCapturer() { + if(_executable != nullptr) { + cudaGraphExecDestroy(_executable); + } +} + +//// Procedure: _create_executable +//inline void cudaFlowCapturer::_create_executable() { +// assert(_executable == nullptr); +// TF_CHECK_CUDA( +// cudaGraphInstantiate( +// &_executable, _graph._native_handle, nullptr, nullptr, 0 +// ), +// "failed to create an executable for captured graph" +// ); +//} + +// Procedure: dump +inline void cudaFlowCapturer::dump(std::ostream& os) const { + _graph.dump(os, nullptr, ""); +} + +// Procedure: _destroy_executable +inline void cudaFlowCapturer::_destroy_executable() { + if(_executable != nullptr) { + TF_CHECK_CUDA( + cudaGraphExecDestroy(_executable), "failed to destroy executable graph" + ); + _executable = nullptr; + } +} + +// Function: capture +template , void>* +> +cudaTask cudaFlowCapturer::on(C&& callable) { + auto node = _graph.emplace_back(_graph, + std::in_place_type_t{}, std::forward(callable) + ); + return cudaTask(node); +} + +// Function: memcpy +inline cudaTask cudaFlowCapturer::memcpy( + void* dst, const void* src, size_t count +) { + return on([dst, src, count] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), + "failed to capture memcpy" + ); + }); +} + +template , void>*> +cudaTask cudaFlowCapturer::copy(T* tgt, const T* src, size_t num) { + return on([tgt, src, num] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), + "failed to capture copy" + ); + }); +} + +// Function: memset +inline cudaTask cudaFlowCapturer::memset(void* ptr, int v, size_t n) { + return on([ptr, v, n] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" + ); + }); +} + +// Function: kernel +template +cudaTask cudaFlowCapturer::kernel( + dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args +) { + return on([g, b, s, f, args...] (cudaStream_t stream) mutable { + f<<>>(args...); + }); +} + + +// Function: single_task +template +cudaTask cudaFlowCapturer::single_task(C&& callable) { + return on([c=std::forward(callable)] (cudaStream_t stream) mutable { + cuda_single_task<<<1, 1, 0, stream>>>(c); + }); +} + +// Function: for_each +template +cudaTask cudaFlowCapturer::for_each(I first, I last, C&& c) { + return on([first, last, c=std::forward(c)](cudaStream_t stream) mutable { + // TODO: special case for N == 0? 
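+    // launch-shape note: B is a device-level heuristic for threads per block,
+    // and (N+B-1)/B rounds the grid size up so the capture below launches at
+    // least N threads, one per dereferenced element in [first, last)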
+ size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + cuda_for_each<<<(N+B-1)/B, B, 0, stream>>>(first, N, c); + }); +} + +// Function: for_each_index +template +cudaTask cudaFlowCapturer::for_each_index(I beg, I end, I inc, C&& c) { + + if(is_range_invalid(beg, end, inc)) { + TF_THROW("invalid range [", beg, ", ", end, ") with inc size ", inc); + } + + return on([beg, end, inc, c=std::forward(c)] (cudaStream_t stream) mutable { + // TODO: special case when N is 0? + size_t N = distance(beg, end, inc); + size_t B = cuda_default_threads_per_block(N); + cuda_for_each_index<<<(N+B-1)/B, B, 0, stream>>>(beg, inc, N, c); + }); +} + +// Function: transform +template +cudaTask cudaFlowCapturer::transform(I first, I last, C&& c, S... srcs) { + return on([first, last, c=std::forward(c), srcs...] + (cudaStream_t stream) mutable { + // TODO: special case when N is 0? + size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + cuda_transform<<<(N+B-1)/B, B, 0, stream>>>(first, N, c, srcs...); + }); +} + +// Function: reduce +template +cudaTask cudaFlowCapturer::reduce(I first, I last, T* result, C&& c) { + + return on([first, last, result, c=std::forward(c)] + (cudaStream_t stream) mutable { + //using value_t = std::decay_t())>; + + // TODO: special case N == 0? + size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + + cuda_reduce<<<1, B, B*sizeof(T), stream>>>( + first, N, result, c + ); + }); +} + +// Function: uninitialized_reduce +template +cudaTask cudaFlowCapturer::uninitialized_reduce( + I first, I last, T* result, C&& c +) { + + return on([first, last, result, c=std::forward(c)] + (cudaStream_t stream) mutable { + //using value_t = std::decay_t())>; + + // TODO: special case N == 0? 
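+    // launch-shape note: the reduction below uses a single block of B threads
+    // with B*sizeof(T) bytes of dynamic shared memory; unlike reduce(), the
+    // first input element seeds the result instead of an existing *result value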
+ size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + + cuda_reduce<<<1, B, B*sizeof(T), stream>>>( + first, N, result, c + ); + }); +} + +// Procedure: offload_until +template +void cudaFlowCapturer::offload_until(P&& predicate) { + + // If the executable graph does not exist, instantiate it + if(_executable == nullptr) { + + auto captured = _capture(); + + TF_CHECK_CUDA( + cudaGraphInstantiate( + &_executable, captured, nullptr, nullptr, 0 + ), + "failed to create an executable graph" + ); + TF_CHECK_CUDA(cudaGraphDestroy(captured), "failed to destroy captured graph"); + } + + cudaScopedPerThreadStream s; + + while(!predicate()) { + TF_CHECK_CUDA( + cudaGraphLaunch(_executable, s), "failed to launch the exec graph" + ); + + TF_CHECK_CUDA(cudaStreamSynchronize(s), "failed to synchronize stream"); + } +} + +// Procedure: offload_n +inline void cudaFlowCapturer::offload_n(size_t n) { + offload_until([repeat=n] () mutable { return repeat-- == 0; }); +} + +// Procedure: offload +inline void cudaFlowCapturer::offload() { + offload_until([repeat=1] () mutable { return repeat-- == 0; }); +} + +// Function: empty +inline bool cudaFlowCapturer::empty() const { + return _graph.empty(); +} + +// Function: rebind_on +template , void>* +> +cudaTask cudaFlowCapturer::rebind_on(cudaTask task, C&& callable) { + + if(task.type() != cudaTaskType::CAPTURE) { + TF_THROW("invalid cudaTask type (must be CAPTURE)"); + } + + _destroy_executable(); + + std::get((task._node)->_handle).work = std::forward(callable); + + return task; +} + +// Function: rebind_memcpy +inline cudaTask cudaFlowCapturer::rebind_memcpy( + cudaTask task, void* dst, const void* src, size_t count +) { + return rebind_on(task, [dst, src, count](cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), + "failed to capture memcpy" + ); + }); +} + +// Function: rebind_copy +template , void>* +> +cudaTask cudaFlowCapturer::rebind_copy( + cudaTask task, T* tgt, const T* src, size_t num +) { + return rebind_on(task, [tgt, src, num] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream), + "failed to capture copy" + ); + }); +} + +// Function: rebind_memset +inline cudaTask cudaFlowCapturer::rebind_memset( + cudaTask task, void* ptr, int v, size_t n +) { + return rebind_on(task, [ptr, v, n] (cudaStream_t stream) mutable { + TF_CHECK_CUDA( + cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset" + ); + }); +} + +// Function: rebind_kernel +template +cudaTask cudaFlowCapturer::rebind_kernel( + cudaTask task, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args +) { + return rebind_on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable { + f<<>>(args...); + }); +} + +// Function: make_capturer +template +T* cudaFlowCapturer::make_capturer(ArgsT&&... args) { + + static_assert(std::is_base_of_v); + + auto ptr = std::make_unique(std::forward(args)...); + ptr->_factory = this; + auto raw = ptr.get(); + _capturers.push_back(std::move(ptr)); + return raw; +} + +// Function: _capture +inline cudaGraph_t cudaFlowCapturer::_capture() { + return std::visit( + [this](auto&& opt){ + return opt._optimize(_graph); + }, + _optimizer + ); +} + +// Function: make_optimizer +template +OPT& cudaFlowCapturer::make_optimizer(ArgsT&&... 
args) { + return _optimizer.emplace(std::forward(args)...); +} + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/cuda/cuda_device.hpp b/taskflow/cuda/cuda_device.hpp new file mode 100644 index 0000000..41315c0 --- /dev/null +++ b/taskflow/cuda/cuda_device.hpp @@ -0,0 +1,342 @@ +#pragma once + +#include "cuda_error.hpp" + +/** +@file cuda_device.hpp +@brief CUDA device utilities include file +*/ + +namespace tf { + +/** +@brief queries the number of available devices +*/ +inline size_t cuda_get_num_devices() { + int N = 0; + TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count"); + return static_cast(N); +} + +/** +@brief gets the current device associated with the caller thread +*/ +inline int cuda_get_device() { + int id; + TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id"); + return id; +} + +/** +@brief switches to a given device context +*/ +inline void cuda_set_device(int id) { + TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id); +} + +/** +@brief obtains the device property +*/ +inline void cuda_get_device_property(int i, cudaDeviceProp& p) { + TF_CHECK_CUDA( + cudaGetDeviceProperties(&p, i), "failed to get property of device ", i + ); +} + +/** +@brief obtains the device property +*/ +inline cudaDeviceProp cuda_get_device_property(int i) { + cudaDeviceProp p; + TF_CHECK_CUDA( + cudaGetDeviceProperties(&p, i), "failed to get property of device ", i + ); + return p; +} + +/** +@brief dumps the device property +*/ +inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) { + + os << "Major revision number: " << p.major << '\n' + << "Minor revision number: " << p.minor << '\n' + << "Name: " << p.name << '\n' + << "Total global memory: " << p.totalGlobalMem << '\n' + << "Total shared memory per block: " << p.sharedMemPerBlock << '\n' + << "Total registers per block: " << p.regsPerBlock << '\n' + << "Warp size: " << p.warpSize << '\n' + << "Maximum memory pitch: " << p.memPitch << '\n' + << "Maximum threads per block: " << p.maxThreadsPerBlock << '\n'; + + os << "Maximum dimension of block: "; + for (int i = 0; i < 3; ++i) { + if(i) os << 'x'; + os << p.maxThreadsDim[i]; + } + os << '\n'; + + os << "Maximum dimenstion of grid: "; + for (int i = 0; i < 3; ++i) { + if(i) os << 'x'; + os << p.maxGridSize[i];; + } + os << '\n'; + + os << "Clock rate: " << p.clockRate << '\n' + << "Total constant memory: " << p.totalConstMem << '\n' + << "Texture alignment: " << p.textureAlignment << '\n' + << "Concurrent copy and execution: " << p.deviceOverlap << '\n' + << "Number of multiprocessors: " << p.multiProcessorCount << '\n' + << "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n' + << "GPU sharing Host Memory: " << p.integrated << '\n' + << "Host page-locked mem mapping: " << p.canMapHostMemory << '\n' + << "Alignment for Surfaces: " << p.surfaceAlignment << '\n' + << "Device has ECC support: " << p.ECCEnabled << '\n' + << "Unified Addressing (UVA): " << p.unifiedAddressing << '\n'; +} + +/** +@brief queries the maximum threads per block on a device +*/ +inline size_t cuda_get_device_max_threads_per_block(int d) { + int threads = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d), + "failed to query the maximum threads per block on device ", d + ) + return threads; +} + +/** +@brief queries the maximum x-dimension per block on a device +*/ +inline size_t cuda_get_device_max_x_dim_per_block(int d) { + int dim = 0; + 
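+  // note: TF_CHECK_CUDA throws std::runtime_error (tagged with file and line)
+  // if the attribute query does not return cudaSuccess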
TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d), + "failed to query the maximum x-dimension per block on device ", d + ) + return dim; +} + +/** +@brief queries the maximum y-dimension per block on a device +*/ +inline size_t cuda_get_device_max_y_dim_per_block(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d), + "failed to query the maximum y-dimension per block on device ", d + ) + return dim; +} + +/** +@brief queries the maximum z-dimension per block on a device +*/ +inline size_t cuda_get_device_max_z_dim_per_block(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d), + "failed to query the maximum z-dimension per block on device ", d + ) + return dim; +} + +/** +@brief queries the maximum x-dimension per grid on a device +*/ +inline size_t cuda_get_device_max_x_dim_per_grid(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d), + "failed to query the maximum x-dimension per grid on device ", d + ) + return dim; +} + +/** +@brief queries the maximum y-dimension per grid on a device +*/ +inline size_t cuda_get_device_max_y_dim_per_grid(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d), + "failed to query the maximum y-dimension per grid on device ", d + ) + return dim; +} + +/** +@brief queries the maximum z-dimension per grid on a device +*/ +inline size_t cuda_get_device_max_z_dim_per_grid(int d) { + int dim = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d), + "failed to query the maximum z-dimension per grid on device ", d + ) + return dim; +} + +/** +@brief queries the maximum shared memory size in bytes per block on a device +*/ +inline size_t cuda_get_device_max_shm_per_block(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d), + "failed to query the maximum shared memory per block on device ", d + ) + return num; +} + +/** +@brief queries the warp size on a device +*/ +inline size_t cuda_get_device_warp_size(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d), + "failed to query the warp size per block on device ", d + ) + return num; +} + +/** +@brief queries the major number of compute capability of a device +*/ +inline int cuda_get_device_compute_capability_major(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d), + "failed to query the major number of compute capability of device ", d + ) + return num; +} + +/** +@brief queries the minor number of compute capability of a device +*/ +inline int cuda_get_device_compute_capability_minor(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d), + "failed to query the minor number of compute capability of device ", d + ) + return num; +} + +/** +@brief queries if the device supports unified addressing +*/ +inline bool cuda_get_device_unified_addressing(int d) { + int num = 0; + TF_CHECK_CUDA( + cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d), + "failed to query unified addressing status on device ", d + ) + return num; +} + +// ---------------------------------------------------------------------------- +// CUDA Version +// ---------------------------------------------------------------------------- + +/** +@brief queries the latest CUDA version (1000 * major + 
10 * minor) supported by the driver +*/ +inline int cuda_get_driver_version() { + int num = 0; + TF_CHECK_CUDA( + cudaDriverGetVersion(&num), + "failed to query the latest cuda version supported by the driver" + ); + return num; +} + +/** +@brief queries the CUDA Runtime version (1000 * major + 10 * minor) +*/ +inline int cuda_get_runtime_version() { + int num = 0; + TF_CHECK_CUDA( + cudaRuntimeGetVersion(&num), "failed to query cuda runtime version" + ); + return num; +} + +// ---------------------------------------------------------------------------- +// cudaScopedDevice +// ---------------------------------------------------------------------------- + +/** @class cudaScopedDevice + +@brief RAII-styled device context switch + +Sample usage: + +@code{.cpp} +{ + tf::cudaScopedDevice device(1); // switch to the device context 1 + + // create a stream under device context 1 + cudaStream_t stream; + cudaStreamCreate(&stream); + +} // leaving the scope and goes back to the previous device context +@endcode + +%cudaScopedDevice is neither movable nor copyable. +*/ +class cudaScopedDevice { + + public: + + /** + @brief constructs a RAII-styled device switcher + + @param device device context to scope in the guard + */ + explicit cudaScopedDevice(int device); + + /** + @brief destructs the guard and switches back to the previous device context + */ + ~cudaScopedDevice(); + + private: + + cudaScopedDevice() = delete; + cudaScopedDevice(const cudaScopedDevice&) = delete; + cudaScopedDevice(cudaScopedDevice&&) = delete; + + int _p; +}; + +// Constructor +inline cudaScopedDevice::cudaScopedDevice(int dev) { + TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope"); + if(_p == dev) { + _p = -1; + } + else { + TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev); + } +} + +// Destructor +inline cudaScopedDevice::~cudaScopedDevice() { + if(_p != -1) { + cudaSetDevice(_p); + //TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p); + } +} + +} // end of namespace cuda --------------------------------------------------- + + + + + diff --git a/taskflow/cuda/cuda_error.hpp b/taskflow/cuda/cuda_error.hpp new file mode 100644 index 0000000..1bac95b --- /dev/null +++ b/taskflow/cuda/cuda_error.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include + +#include "../utility/stream.hpp" + +#define TF_CUDA_EXPAND( x ) x +#define TF_CUDA_REMOVE_FIRST_HELPER(N, ...) __VA_ARGS__ +#define TF_CUDA_REMOVE_FIRST(...) TF_CUDA_EXPAND(TF_CUDA_REMOVE_FIRST_HELPER(__VA_ARGS__)) +#define TF_CUDA_GET_FIRST_HELPER(N, ...) N +#define TF_CUDA_GET_FIRST(...) TF_CUDA_EXPAND(TF_CUDA_GET_FIRST_HELPER(__VA_ARGS__)) + +#define TF_CHECK_CUDA(...) 
\ +if(TF_CUDA_GET_FIRST(__VA_ARGS__) != cudaSuccess) { \ + std::ostringstream oss; \ + auto ev = TF_CUDA_GET_FIRST(__VA_ARGS__); \ + oss << "[" << __FILE__ << ":" << __LINE__ << "] " \ + << (cudaGetErrorString(ev)) << " (" \ + << (cudaGetErrorName(ev)) << ") -"; \ + tf::ostreamize(oss, TF_CUDA_REMOVE_FIRST(__VA_ARGS__)); \ + throw std::runtime_error(oss.str()); \ +} + diff --git a/taskflow/cuda/cuda_flow.hpp b/taskflow/cuda/cuda_flow.hpp new file mode 100644 index 0000000..9e0e5ea --- /dev/null +++ b/taskflow/cuda/cuda_flow.hpp @@ -0,0 +1,1219 @@ +#pragma once + +#include "cuda_task.hpp" +#include "cuda_capturer.hpp" +#include "cuda_optimizer.hpp" +#include "cuda_algorithm/cuda_for_each.hpp" +#include "cuda_algorithm/cuda_transform.hpp" +#include "cuda_algorithm/cuda_reduce.hpp" + +/** +@file cuda_flow.hpp +@brief cudaFlow include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// class definition: cudaFlow +// ---------------------------------------------------------------------------- + +/** +@class cudaFlow + +@brief class for building a CUDA task dependency graph + +A %cudaFlow is a high-level interface over CUDA Graph to perform GPU operations +using the task dependency graph model. +The class provides a set of methods for creating and launch different tasks +on one or multiple CUDA devices, +for instance, kernel tasks, data transfer tasks, and memory operation tasks. +The following example creates a %cudaFlow of two kernel tasks, @c task1 and +@c task2, where @c task1 runs before @c task2. + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +taskflow.emplace([&](tf::cudaFlow& cf){ + // create two kernel tasks + tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1); + tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2); + + // kernel1 runs before kernel2 + task1.precede(task2); +}); + +executor.run(taskflow).wait(); +@endcode + +A %cudaFlow is a task (tf::Task) created from tf::Taskflow +and will be run by @em one worker thread in the executor. +That is, the callable that describes a %cudaFlow +will be executed sequentially. +Inside a %cudaFlow task, different GPU tasks (tf::cudaTask) may run +in parallel scheduled by the CUDA runtime. + +Please refer to @ref GPUTaskingcudaFlow for details. +*/ +class cudaFlow { + + friend class Executor; + + struct External { + cudaGraph graph; + }; + + struct Internal { + Executor& executor; + Internal(Executor& e) : executor {e} {} + }; + + using handle_t = std::variant; + + public: + + /** + @brief constructs a standalone %cudaFlow + + A standalone %cudaFlow does not go through any taskflow and + can be run by the caller thread using explicit offload methods + (e.g., tf::cudaFlow::offload). + */ + cudaFlow(); + + /** + @brief destroys the %cudaFlow and its associated native CUDA graph + and executable graph + */ + ~cudaFlow(); + + /** + @brief queries the emptiness of the graph + */ + bool empty() const; + + /** + @brief dumps the %cudaFlow graph into a DOT format through an + output stream + */ + void dump(std::ostream& os) const; + + /** + @brief dumps the native CUDA graph into a DOT format through an + output stream + + The native CUDA graph may be different from the upper-level %cudaFlow + graph when flow capture is involved. 
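+
+  A small sketch of typical usage (task names are illustrative):
+
+  @code{.cpp}
+  tf::cudaFlow cf;
+  tf::cudaTask a = cf.noop();
+  tf::cudaTask b = cf.noop();
+  a.precede(b);
+
+  cf.dump(std::cout);               // dumps the cudaFlow-level graph
+  cf.dump_native_graph(std::cout);  // dumps the underlying native CUDA graph
+  @endcode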
+ */ + void dump_native_graph(std::ostream& os) const; + + // ------------------------------------------------------------------------ + // Graph building routines + // ------------------------------------------------------------------------ + + /** + @brief creates a no-operation task + + @return a tf::cudaTask handle + + An empty node performs no operation during execution, + but can be used for transitive ordering. + For example, a phased execution graph with 2 groups of @c n nodes + with a barrier between them can be represented using an empty node + and @c 2*n dependency edges, + rather than no empty node and @c n^2 dependency edges. + */ + cudaTask noop(); + + /** + @brief creates a host task that runs a callable on the host + + @tparam C callable type + + @param callable a callable object with neither arguments nor return + (i.e., constructible from @c std::function) + + @return a tf::cudaTask handle + + A host task can only execute CPU-specific functions and cannot do any CUDA calls + (e.g., @c cudaMalloc). + */ + template + cudaTask host(C&& callable); + + /** + @brief creates a kernel task + + @tparam F kernel function type + @tparam ArgsT kernel function parameters type + + @param g configured grid + @param b configured block + @param s configured shared memory size in bytes + @param f kernel function + @param args arguments to forward to the kernel function by copy + + @return a tf::cudaTask handle + */ + template + cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args); + + /** + @brief creates a kernel task on a specific GPU + + @tparam F kernel function type + @tparam ArgsT kernel function parameters type + + @param d device identifier to launch the kernel + @param g configured grid + @param b configured block + @param s configured shared memory size in bytes + @param f kernel function + @param args arguments to forward to the kernel function by copy + + @return a tf::cudaTask handle + */ + template + cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args); + + /** + @brief creates a memset task that fills untyped data with a byte value + + @param dst pointer to the destination device memory area + @param v value to set for each byte of specified memory + @param count size in bytes to set + + @return a tf::cudaTask handle + + A memset task fills the first @c count bytes of device memory area + pointed by @c dst with the byte value @c v. + */ + cudaTask memset(void* dst, int v, size_t count); + + /** + @brief creates a memcpy task that copies untyped data in bytes + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param bytes bytes to copy + + @return a tf::cudaTask handle + + A memcpy task transfers @c bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. + */ + cudaTask memcpy(void* tgt, const void* src, size_t bytes); + + /** + @brief creates a memset task that sets a typed memory block to zero + + @tparam T element type (size of @c T must be either 1, 2, or 4) + @param dst pointer to the destination device memory area + @param count number of elements + + @return a tf::cudaTask handle + + A zero task zeroes the first @c count elements of type @c T + in a device memory area pointed by @c dst. 
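+
+  A minimal sketch (the device buffer is illustrative):
+
+  @code{.cpp}
+  int* gpu_data;
+  cudaMalloc(&gpu_data, 1000 * sizeof(int));
+
+  tf::cudaFlow cf;
+  cf.zero(gpu_data, 1000);  // zeroes all 1000 integers
+  cf.offload();
+  @endcode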
+ */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + cudaTask zero(T* dst, size_t count); + + /** + @brief creates a memset task that fills a typed memory block with a value + + @tparam T element type (size of @c T must be either 1, 2, or 4) + + @param dst pointer to the destination device memory area + @param value value to fill for each element of type @c T + @param count number of elements + + @return a tf::cudaTask handle + + A fill task fills the first @c count elements of type @c T with @c value + in a device memory area pointed by @c dst. + The value to fill is interpreted in type @c T rather than byte. + */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + cudaTask fill(T* dst, T value, size_t count); + + /** + @brief creates a memcopy task that copies typed data + + @tparam T element type (non-void) + + @param tgt pointer to the target memory block + @param src pointer to the source memory block + @param num number of elements to copy + + @return a tf::cudaTask handle + + A copy task transfers num*sizeof(T) bytes of data from a source location + to a target location. Direction can be arbitrary among CPUs and GPUs. + */ + template , void>* = nullptr + > + cudaTask copy(T* tgt, const T* src, size_t num); + + // ------------------------------------------------------------------------ + // offload methods + // ------------------------------------------------------------------------ + + /** + @brief offloads the %cudaFlow onto a GPU and repeatedly runs it until + the predicate becomes true + + @tparam P predicate type (a binary callable) + + @param predicate a binary predicate (returns @c true for stop) + + Immediately offloads the present %cudaFlow onto a GPU and + repeatedly runs it until the predicate returns @c true. + + An offloaded %cudaFlow forces the underlying graph to be instantiated. + After the instantiation, you should not modify the graph topology + but update node parameters. + + By default, if users do not offload the %cudaFlow, + the executor will offload it once. + */ + template + void offload_until(P&& predicate); + + /** + @brief offloads the %cudaFlow and executes it by the given times + + @param N number of executions + */ + void offload_n(size_t N); + + /** + @brief offloads the %cudaFlow and executes it once + */ + void offload(); + + // ------------------------------------------------------------------------ + // update methods + // ------------------------------------------------------------------------ + + /** + @brief updates parameters of a host task created from tf::cudaFlow::host + + The method updates the parameters of a host callable associated with the + given @c task. + */ + template + void update_host(cudaTask task, C&& callable); + + /** + @brief updates parameters of a kernel task created from tf::cudaFlow::kernel + + The method updates the parameters of a kernel associated with the given + @c task. We do not allow you to change the kernel function. + */ + template + void update_kernel(cudaTask task, dim3 g, dim3 b, size_t shm, ArgsT&&... args); + + /** + @brief updates parameters of a memcpy task to form a copy task + + The method updates the parameters of a copy task. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. 
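+
+  A sketch of the intended update pattern (the buffers are illustrative and
+  must come from the same context as the originals):
+
+  @code{.cpp}
+  float *src_a, *dst_a, *src_b, *dst_b;
+  cudaMalloc(&src_a, 256*sizeof(float));  cudaMalloc(&dst_a, 256*sizeof(float));
+  cudaMalloc(&src_b, 256*sizeof(float));  cudaMalloc(&dst_b, 256*sizeof(float));
+
+  tf::cudaFlow cf;
+  tf::cudaTask task = cf.copy(dst_a, src_a, 256);
+  cf.offload();                             // instantiates the graph and runs it once
+
+  cf.update_copy(task, dst_b, src_b, 256);  // same topology, new device pointers
+  cf.offload();                             // runs again with the updated parameters
+  @endcode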
+ */ + template < + typename T, + std::enable_if_t, void>* = nullptr + > + void update_copy(cudaTask task, T* tgt, const T* src, size_t num); + + /** + @brief updates parameters of a memcpy task + + The method updates the parameters of a memcpy task. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + void update_memcpy(cudaTask task, void* tgt, const void* src, size_t bytes); + + /** + @brief updates parameters of a memset task + + The method updates the parameters of a memset task. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + void update_memset(cudaTask task, void* dst, int ch, size_t count); + + /** + @brief updates parameters of a memset task to form a fill task + + The method updates the parameters of a copy task. + The given arguments and type must comply with the rules of tf::cudaFlow::fill. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + void update_fill(cudaTask task, T* dst, T value, size_t count); + + /** + @brief updates parameters of a memset task to form a zero task + + The method updates the parameters of a copy task. + The given arguments and type must comply with the rules of tf::cudaFlow::zero. + The source/destination memory may have different address values but + must be allocated from the same contexts as the original + source/destination memory. + */ + template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr + > + void update_zero(cudaTask task, T* dst, size_t count); + + // ------------------------------------------------------------------------ + // generic algorithms + // ------------------------------------------------------------------------ + + /** + @brief runs a callable with only a single kernel thread + + @tparam C callable type + + @param callable callable to run by a single kernel thread + + @return a tf::cudaTask handle + */ + template + cudaTask single_task(C&& callable); + + /** + @brief applies a callable to each dereferenced element of the data array + + @tparam I iterator type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable a callable object to apply to the dereferenced iterator + + @return a tf::cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + for(auto itr = first; itr != last; itr++) { + callable(*itr); + } + @endcode + */ + template + cudaTask for_each(I first, I last, C&& callable); + + /** + @brief applies a callable to each index in the range with the step size + + @tparam I index type + @tparam C callable type + + @param first beginning index + @param last last index + @param step step size + @param callable the callable to apply to each element in the data array + + @return a tf::cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + // step is positive [first, last) + for(auto i=first; ilast; i+=step) { + callable(i); + } + @endcode + */ + template + cudaTask for_each_index(I first, I last, I step, C&& callable); + + /** + @brief applies a callable to a source 
range and stores the result in a target range + + @tparam I iterator type + @tparam C callable type + @tparam S source types + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable the callable to apply to each element in the range + @param srcs iterators to the source ranges + + @return a tf::cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + while (first != last) { + *first++ = callable(*src1++, *src2++, *src3++, ...); + } + @endcode + */ + template + cudaTask transform(I first, I last, C&& callable, S... srcs); + + /** + @brief performs parallel reduction over a range of items + + @tparam I input iterator type + @tparam T value type + @tparam C callable type + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param result pointer to the result with an initialized value + @param op binary reduction operator + + @return a tf::cudaTask handle + + This method is equivalent to the parallel execution of the following loop on a GPU: + + @code{.cpp} + while (first != last) { + *result = op(*result, *first++); + } + @endcode + */ + template + cudaTask reduce(I first, I last, T* result, C&& op); + + /** + @brief similar to tf::cudaFlow::reduce but does not assume any initial + value to reduce + + This method is equivalent to the parallel execution of the following loop + on a GPU: + + @code{.cpp} + *result = *first++; // no initial values partitipcate in the loop + while (first != last) { + *result = op(*result, *first++); + } + @endcode + */ + template + cudaTask uninitialized_reduce(I first, I last, T* result, C&& op); + + // ------------------------------------------------------------------------ + // subflow + // ------------------------------------------------------------------------ + + /** + @brief constructs a subflow graph through tf::cudaFlowCapturer + + @tparam C callable type constructible from + @c std::function + + @param callable the callable to construct a capture flow + + @return a tf::cudaTask handle + + A captured subflow forms a sub-graph to the %cudaFlow and can be used to + capture custom (or third-party) kernels that cannot be directly constructed + from the %cudaFlow. 
+ + Example usage: + + @code{.cpp} + taskflow.emplace([&](tf::cudaFlow& cf){ + + tf::cudaTask my_kernel = cf.kernel(my_arguments); + + // create a flow capturer to capture custom kernels + tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){ + capturer.on([&](cudaStream_t stream){ + invoke_custom_kernel_with_stream(stream, custom_arguments); + }); + }); + + my_kernel.precede(my_subflow); + }); + @endcode + */ + template + cudaTask capture(C&& callable); + + private: + + handle_t _handle; + + cudaGraph& _graph; + + cudaGraphExec_t _executable {nullptr}; + + cudaFlow(Executor&, cudaGraph&); +}; + +// Construct a standalone cudaFlow +inline cudaFlow::cudaFlow() : + _handle {std::in_place_type_t{}}, + _graph {std::get(_handle).graph} { + + TF_CHECK_CUDA( + cudaGraphCreate(&_graph._native_handle, 0), + "cudaFlow failed to create a native graph (external mode)" + ); +} + +// Construct the cudaFlow from executor (internal graph) +inline cudaFlow::cudaFlow(Executor& e, cudaGraph& g) : + _handle {std::in_place_type_t{}, e}, + _graph {g} { + + assert(_graph._native_handle == nullptr); + + TF_CHECK_CUDA( + cudaGraphCreate(&_graph._native_handle, 0), + "cudaFlow failed to create a native graph (internal mode)" + ); +} + +// Destructor +inline cudaFlow::~cudaFlow() { + if(_executable) { + cudaGraphExecDestroy(_executable); + } + cudaGraphDestroy(_graph._native_handle); + _graph._native_handle = nullptr; +} + +// Function: empty +inline bool cudaFlow::empty() const { + return _graph._nodes.empty(); +} + +// Procedure: dump +inline void cudaFlow::dump(std::ostream& os) const { + _graph.dump(os, nullptr, ""); +} + +// Procedure: dump +inline void cudaFlow::dump_native_graph(std::ostream& os) const { + cuda_dump_graph(os, _graph._native_handle); +} + +// ---------------------------------------------------------------------------- +// Graph building methods +// ---------------------------------------------------------------------------- + +// Function: noop +inline cudaTask cudaFlow::noop() { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + TF_CHECK_CUDA( + cudaGraphAddEmptyNode( + &node->_native_handle, _graph._native_handle, nullptr, 0 + ), + "failed to create a no-operation (empty) node" + ); + + return cudaTask(node); +} + +// Function: host +template +cudaTask cudaFlow::host(C&& c) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{}, std::forward(c) + ); + + auto& h = std::get(node->_handle); + + cudaHostNodeParams p; + p.fn = cudaNode::Host::callback; + p.userData = &h; + + TF_CHECK_CUDA( + cudaGraphAddHostNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a host node" + ); + + return cudaTask(node); +} + +// Function: kernel +template +cudaTask cudaFlow::kernel( + dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args +) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{}, (void*)f + ); + + cudaKernelNodeParams p; + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; + p.func = (void*)f; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + TF_CHECK_CUDA( + cudaGraphAddKernelNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a kernel task" + ); + + return cudaTask(node); +} + +// Function: kernel +template +cudaTask cudaFlow::kernel_on( + int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... 
args +) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{}, (void*)f + ); + + cudaKernelNodeParams p; + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; + p.func = (void*)f; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + cudaScopedDevice ctx(d); + TF_CHECK_CUDA( + cudaGraphAddKernelNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a kernel task on device ", d + ); + + return cudaTask(node); +} + +// Function: zero +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +cudaTask cudaFlow::zero(T* dst, size_t count) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + auto p = cuda_get_zero_parms(dst, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a memset (zero) task" + ); + + return cudaTask(node); +} + +// Function: fill +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +cudaTask cudaFlow::fill(T* dst, T value, size_t count) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + auto p = cuda_get_fill_parms(dst, value, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a memset (fill) task" + ); + + return cudaTask(node); +} + +// Function: copy +template < + typename T, + std::enable_if_t, void>* +> +cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + auto p = cuda_get_copy_parms(tgt, src, num); + + TF_CHECK_CUDA( + cudaGraphAddMemcpyNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a memcpy (copy) task" + ); + + return cudaTask(node); +} + +// Function: memset +inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + auto p = cuda_get_memset_parms(dst, ch, count); + + TF_CHECK_CUDA( + cudaGraphAddMemsetNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a memset task" + ); + + return cudaTask(node); +} + +// Function: memcpy +inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) { + + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + auto p = cuda_get_memcpy_parms(tgt, src, bytes); + + TF_CHECK_CUDA( + cudaGraphAddMemcpyNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, &p + ), + "failed to create a memcpy task" + ); + + return cudaTask(node); +} + +// ------------------------------------------------------------------------ +// update methods +// ------------------------------------------------------------------------ + +// Function: host +template +void cudaFlow::update_host(cudaTask task, C&& c) { + + if(task.type() != cudaTaskType::HOST) { + TF_THROW(task, " is not a host task"); + } + + auto& h = std::get(task._node->_handle); + + h.func = std::forward(c); +} + +// Function: update kernel parameters +template +void cudaFlow::update_kernel( + cudaTask ct, dim3 g, dim3 b, size_t s, ArgsT&&... args +) { + + if(ct.type() != cudaTaskType::KERNEL) { + TF_THROW(ct, " is not a kernel task"); + } + + cudaKernelNodeParams p; + + void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... 
}; + p.func = std::get((ct._node)->_handle).func; + p.gridDim = g; + p.blockDim = b; + p.sharedMemBytes = s; + p.kernelParams = arguments; + p.extra = nullptr; + + TF_CHECK_CUDA( + cudaGraphExecKernelNodeSetParams( + _executable, ct._node->_native_handle, &p + ), + "failed to update kernel parameters on ", ct + ); +} + +// Function: update copy parameters +template < + typename T, + std::enable_if_t, void>* +> +void cudaFlow::update_copy(cudaTask ct, T* tgt, const T* src, size_t num) { + + if(ct.type() != cudaTaskType::MEMCPY) { + TF_THROW(ct, " is not a memcpy task"); + } + + auto p = cuda_get_copy_parms(tgt, src, num); + + TF_CHECK_CUDA( + cudaGraphExecMemcpyNodeSetParams( + _executable, ct._node->_native_handle, &p + ), + "failed to update memcpy parameters on ", ct + ); +} + +// Function: update memcpy parameters +inline void cudaFlow::update_memcpy( + cudaTask ct, void* tgt, const void* src, size_t bytes +) { + + if(ct.type() != cudaTaskType::MEMCPY) { + TF_THROW(ct, " is not a memcpy task"); + } + + auto p = cuda_get_memcpy_parms(tgt, src, bytes); + + TF_CHECK_CUDA( + cudaGraphExecMemcpyNodeSetParams(_executable, ct._node->_native_handle, &p), + "failed to update memcpy parameters on ", ct + ); +} + +// Procedure: update_memset +inline +void cudaFlow::update_memset(cudaTask ct, void* dst, int ch, size_t count) { + + if(ct.type() != cudaTaskType::MEMSET) { + TF_THROW(ct, " is not a memset task"); + } + + auto p = cuda_get_memset_parms(dst, ch, count); + + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams( + _executable, ct._node->_native_handle, &p + ), + "failed to update memset parameters on ", ct + ); +} + +// Procedure: update_fill +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +void cudaFlow::update_fill(cudaTask task, T* dst, T value, size_t count) { + + if(task.type() != cudaTaskType::MEMSET) { + TF_THROW(task, " is not a memset task"); + } + + auto p = cuda_get_fill_parms(dst, value, count); + + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams( + _executable, task._node->_native_handle, &p + ), + "failed to update memset parameters on ", task + ); +} + +// Procedure: update_zero +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* +> +void cudaFlow::update_zero(cudaTask task, T* dst, size_t count) { + + if(task.type() != cudaTaskType::MEMSET) { + TF_THROW(task, " is not a memset task"); + } + + auto p = cuda_get_zero_parms(dst, count); + + TF_CHECK_CUDA( + cudaGraphExecMemsetNodeSetParams( + _executable, task._node->_native_handle, &p + ), + "failed to update memset parameters on ", task + ); +} + +// ---------------------------------------------------------------------------- +// Generic Algorithm API +// ---------------------------------------------------------------------------- + +// Function: single_task +template +cudaTask cudaFlow::single_task(C&& c) { + return kernel( + 1, 1, 0, cuda_single_task, std::forward(c) + ); +} + +// Function: for_each +template +cudaTask cudaFlow::for_each(I first, I last, C&& c) { + + size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + + // TODO: special case when N is 0? + + return kernel( + (N+B-1) / B, B, 0, cuda_for_each, first, N, std::forward(c) + ); +} + +// Function: for_each_index +template +cudaTask cudaFlow::for_each_index(I beg, I end, I inc, C&& c) { + + if(is_range_invalid(beg, end, inc)) { + TF_THROW("invalid range [", beg, ", ", end, ") with inc size ", inc); + } + + // TODO: special case when N is 0? 
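+  // distance(beg, end, inc) counts how many indices are visited when stepping
+  // from beg toward end by inc; the kernel below assigns one GPU thread to
+  // each visited index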
+ + size_t N = distance(beg, end, inc); + size_t B = cuda_default_threads_per_block(N); + + return kernel( + (N+B-1) / B, B, 0, cuda_for_each_index, beg, inc, N, std::forward(c) + ); +} + +// Function: transform +template +cudaTask cudaFlow::transform(I first, I last, C&& c, S... srcs) { + + // TODO: special case when N is 0? + + size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + + return kernel( + (N+B-1) / B, B, 0, cuda_transform, + first, N, std::forward(c), srcs... + ); +} + +// Function: reduce +template +cudaTask cudaFlow::reduce(I first, I last, T* result, C&& op) { + + //using value_t = std::decay_t())>; + + // TODO: special case N == 0? + size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + + return kernel( + 1, B, B*sizeof(T), cuda_reduce, + first, N, result, std::forward(op) + ); +} + +// Function: uninitialized_reduce +template +cudaTask cudaFlow::uninitialized_reduce(I first, I last, T* result, C&& op) { + + //using value_t = std::decay_t())>; + + // TODO: special case N == 0? + size_t N = std::distance(first, last); + size_t B = cuda_default_threads_per_block(N); + + return kernel( + 1, B, B*sizeof(T), cuda_reduce, + first, N, result, std::forward(op) + ); +} + +// ---------------------------------------------------------------------------- +// captured flow +// ---------------------------------------------------------------------------- + +// Function: capture +template +cudaTask cudaFlow::capture(C&& c) { + + // insert a subflow node + auto node = _graph.emplace_back( + _graph, std::in_place_type_t{} + ); + + // construct a captured flow from the callable + auto& node_handle = std::get(node->_handle); + cudaFlowCapturer capturer(node_handle.graph); + + c(capturer); + + // obtain the optimized captured graph + auto captured = capturer._capture(); + //cuda_dump_graph(std::cout, captured); + + TF_CHECK_CUDA( + cudaGraphAddChildGraphNode( + &node->_native_handle, _graph._native_handle, nullptr, 0, captured + ), + "failed to add a cudaFlow capturer task" + ); + + TF_CHECK_CUDA(cudaGraphDestroy(captured), "failed to destroy captured graph"); + + return cudaTask(node); +} + +// ---------------------------------------------------------------------------- +// Offload methods +// ---------------------------------------------------------------------------- + +// Procedure: offload_until +template +void cudaFlow::offload_until(P&& predicate) { + + //_executor->_invoke_cudaflow_task_internal( + // *this, std::forward

(predicate), false + //); + + // transforms cudaFlow to a native cudaGraph under the specified device + // and launches the graph through a given or an internal device stream + if(_executable == nullptr) { + TF_CHECK_CUDA( + cudaGraphInstantiate( + &_executable, _graph._native_handle, nullptr, nullptr, 0 + ), + "failed to create an executable graph" + ); + //cuda_dump_graph(std::cout, cf._graph._native_handle); + } + + cudaScopedPerThreadStream s; + + while(!predicate()) { + TF_CHECK_CUDA( + cudaGraphLaunch(_executable, s), "failed to execute cudaFlow" + ); + + TF_CHECK_CUDA( + cudaStreamSynchronize(s), "failed to synchronize cudaFlow execution" + ); + } +} + +// Procedure: offload_n +inline void cudaFlow::offload_n(size_t n) { + offload_until([repeat=n] () mutable { return repeat-- == 0; }); +} + +// Procedure: offload +inline void cudaFlow::offload() { + offload_until([repeat=1] () mutable { return repeat-- == 0; }); +} + +// ############################################################################ +// Forward declaration: FlowBuilder +// ############################################################################ + +// FlowBuilder::emplace_on +template , void>* +> +Task FlowBuilder::emplace_on(C&& callable, D&& device) { + auto n = _graph.emplace_back( + std::in_place_type_t{}, + [c=std::forward(callable), d=std::forward(device)] + (Executor& executor, Node* node) mutable { + cudaScopedDevice ctx(d); + executor._invoke_cudaflow_task_entry(c, node); + }, + std::make_unique() + ); + return Task(n); +} + +// FlowBuilder::emplace +template , void>*> +Task FlowBuilder::emplace(C&& c) { + return emplace_on(std::forward(c), tf::cuda_get_device()); +} + +// ############################################################################ +// Forward declaration: Executor +// ############################################################################ + +// Procedure: _invoke_cudaflow_task_entry (cudaFlow) +template , void>* +> +void Executor::_invoke_cudaflow_task_entry(C&& c, Node* node) { + + auto& h = std::get(node->_handle); + + cudaGraph* g = dynamic_cast(h.graph.get()); + + g->clear(); + + cudaFlow cf(*this, *g); + + c(cf); + + // join the cudaflow if never offloaded + if(cf._executable == nullptr) { + cf.offload(); + } +} + +// Procedure: _invoke_cudaflow_task_entry (cudaFlowCapturer) +template , void>* +> +void Executor::_invoke_cudaflow_task_entry(C&& c, Node* node) { + + auto& h = std::get(node->_handle); + + cudaGraph* g = dynamic_cast(h.graph.get()); + + g->clear(); + + cudaFlowCapturer fc(*g); + + c(fc); + + if(fc._executable == nullptr) { + fc.offload(); + } +} + + +} // end of namespace tf ----------------------------------------------------- + + diff --git a/taskflow/cuda/cuda_graph.hpp b/taskflow/cuda/cuda_graph.hpp new file mode 100644 index 0000000..f84e0fc --- /dev/null +++ b/taskflow/cuda/cuda_graph.hpp @@ -0,0 +1,725 @@ +#pragma once + +#include "cuda_memory.hpp" +#include "cuda_stream.hpp" + +#include "../utility/object_pool.hpp" +#include "../utility/traits.hpp" +#include "../utility/passive_vector.hpp" + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaGraph_t routines +// ---------------------------------------------------------------------------- + +/** +@brief gets the memcpy node parameter of a copy task +*/ +template , void>* = nullptr +> +cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) { + + using U = std::decay_t; + + cudaMemcpy3DParms p; + + p.srcArray = nullptr; + p.srcPos = 
::make_cudaPos(0, 0, 0); + p.srcPtr = ::make_cudaPitchedPtr(const_cast(src), num*sizeof(U), num, 1); + p.dstArray = nullptr; + p.dstPos = ::make_cudaPos(0, 0, 0); + p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1); + p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1); + p.kind = cudaMemcpyDefault; + + return p; +} + +/** +@brief gets the memcpy node parameter of a memcpy task (untyped) +*/ +inline cudaMemcpy3DParms cuda_get_memcpy_parms( + void* tgt, const void* src, size_t bytes +) { + + // Parameters in cudaPitchedPtr + // d - Pointer to allocated memory + // p - Pitch of allocated memory in bytes + // xsz - Logical width of allocation in elements + // ysz - Logical height of allocation in elements + cudaMemcpy3DParms p; + p.srcArray = nullptr; + p.srcPos = ::make_cudaPos(0, 0, 0); + p.srcPtr = ::make_cudaPitchedPtr(const_cast(src), bytes, bytes, 1); + p.dstArray = nullptr; + p.dstPos = ::make_cudaPos(0, 0, 0); + p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1); + p.extent = ::make_cudaExtent(bytes, 1, 1); + p.kind = cudaMemcpyDefault; + + return p; +} + +/** +@brief gets the memset node parameter of a memcpy task (untyped) +*/ +inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) { + + cudaMemsetParams p; + p.dst = dst; + p.value = ch; + p.pitch = 0; + //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1; + //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count; + p.elementSize = 1; // either 1, 2, or 4 + p.width = count; + p.height = 1; + + return p; +} + +/** +@brief gets the memset node parameter of a fill task (typed) +*/ +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr +> +cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) { + + cudaMemsetParams p; + p.dst = dst; + + // perform bit-wise copy + p.value = 0; // crucial + static_assert(sizeof(T) <= sizeof(p.value), "internal error"); + std::memcpy(&p.value, &value, sizeof(T)); + + p.pitch = 0; + p.elementSize = sizeof(T); // either 1, 2, or 4 + p.width = count; + p.height = 1; + + return p; +} + +/** +@brief gets the memset node parameter of a zero task (typed) +*/ +template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr +> +cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) { + + cudaMemsetParams p; + p.dst = dst; + p.value = 0; + p.pitch = 0; + p.elementSize = sizeof(T); // either 1, 2, or 4 + p.width = count; + p.height = 1; + + return p; +} + +/** +@brief queries the number of root nodes in a native CUDA graph +*/ +inline size_t cuda_get_graph_num_root_nodes(cudaGraph_t graph) { + size_t num_nodes; + TF_CHECK_CUDA( + cudaGraphGetRootNodes(graph, nullptr, &num_nodes), + "failed to get native graph root nodes" + ); + return num_nodes; +} + +/** +@brief queries the number of nodes in a native CUDA graph +*/ +inline size_t cuda_get_graph_num_nodes(cudaGraph_t graph) { + size_t num_nodes; + TF_CHECK_CUDA( + cudaGraphGetNodes(graph, nullptr, &num_nodes), + "failed to get native graph nodes" + ); + return num_nodes; +} + +/** +@brief queries the number of edges in a native CUDA graph +*/ +inline size_t cuda_get_graph_num_edges(cudaGraph_t graph) { + size_t num_edges; + TF_CHECK_CUDA( + cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges), + "failed to get native graph edges" + ); + return num_edges; +} + +/** +@brief acquires the nodes in a native CUDA graph +*/ +inline std::vector cuda_get_graph_nodes(cudaGraph_t graph) { + size_t num_nodes = cuda_get_graph_num_nodes(graph); 
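+  // cudaGraphGetNodes treats num_nodes as an in/out capacity: the count
+  // obtained above sizes the vector, and the call below fills it with the
+  // node handles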
+ std::vector nodes(num_nodes); + TF_CHECK_CUDA( + cudaGraphGetNodes(graph, nodes.data(), &num_nodes), + "failed to get native graph nodes" + ); + return nodes; +} + +/** +@brief acquires the root nodes in a native CUDA graph +*/ +inline std::vector cuda_get_graph_root_nodes(cudaGraph_t graph) { + size_t num_nodes = cuda_get_graph_num_root_nodes(graph); + std::vector nodes(num_nodes); + TF_CHECK_CUDA( + cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes), + "failed to get native graph nodes" + ); + return nodes; +} + +/** +@brief acquires the edges in a native CUDA graph +*/ +inline std::vector> +cuda_get_graph_edges(cudaGraph_t graph) { + size_t num_edges = cuda_get_graph_num_edges(graph); + std::vector froms(num_edges), tos(num_edges); + TF_CHECK_CUDA( + cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges), + "failed to get native graph edges" + ); + std::vector> edges(num_edges); + for(size_t i=0; i +void cuda_dump_graph(T& os, cudaGraph_t graph) { + + os << "digraph cudaGraph {\n"; + + std::stack> stack; + stack.push(std::make_tuple(graph, nullptr, 1)); + + int pl = 0; + + while(stack.empty() == false) { + + auto [graph, parent, l] = stack.top(); + stack.pop(); + + for(int i=0; i " << 'p' << to << ";\n"; + } + + for(auto& node : nodes) { + auto type = cuda_get_graph_node_type(node); + if(type == cudaGraphNodeTypeGraph) { + + cudaGraph_t graph; + TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &graph), ""); + stack.push(std::make_tuple(graph, node, l+1)); + + os << 'p' << node << "[" + << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, " + << "label=\"cudaGraph-L" << l+1 + << "\"];\n"; + } + else { + os << 'p' << node << "[label=\"" + << cuda_graph_node_type_to_string(type) + << "\"];\n"; + } + } + + // precede to parent + if(parent != nullptr) { + std::unordered_set successors; + for(const auto& p : edges) { + successors.insert(p.first); + } + for(auto node : nodes) { + if(successors.find(node) == successors.end()) { + os << 'p' << node << " -> " << 'p' << parent << ";\n"; + } + } + } + + // set the previous level + pl = l; + } + + for(int i=0; i<=pl; i++) { + os << "}\n"; + } +} + +// ---------------------------------------------------------------------------- +// cudaGraph class +// ---------------------------------------------------------------------------- + +// class: cudaGraph +class cudaGraph : public CustomGraphBase { + + friend class cudaNode; + friend class cudaTask; + friend class cudaFlowCapturerBase; + friend class cudaFlowCapturer; + friend class cudaFlow; + friend class cudaCapturingBase; + friend class cudaSequentialCapturing; + friend class cudaRoundRobinCapturing; + friend class Taskflow; + friend class Executor; + + public: + + cudaGraph() = default; + ~cudaGraph(); + + cudaGraph(const cudaGraph&) = delete; + cudaGraph(cudaGraph&&); + + cudaGraph& operator = (const cudaGraph&) = delete; + cudaGraph& operator = (cudaGraph&&); + + template + cudaNode* emplace_back(ArgsT&&...); + + bool empty() const; + + void clear(); + void dump(std::ostream&, const void*, const std::string&) const override final; + + private: + + cudaGraph_t _native_handle {nullptr}; + + // TODO: nvcc complains deleter of unique_ptr + //std::vector> _nodes; + std::vector _nodes; +}; + +// ---------------------------------------------------------------------------- +// cudaNode class +// ---------------------------------------------------------------------------- + +// class: cudaNode +// each create_native_node is wrapped in a function to call at runtime +// in order 
to work with gpu context +class cudaNode { + + friend class cudaGraph; + friend class cudaTask; + friend class cudaFlow; + friend class cudaFlowCapturer; + friend class cudaFlowCapturerBase; + friend class cudaCapturingBase; + friend class cudaSequentialCapturing; + friend class cudaRoundRobinCapturing; + friend class cudaGreedyCapturing; + friend class Taskflow; + friend class Executor; + + // Empty handle + struct Empty { + }; + + // Host handle + struct Host { + + template + Host(C&&); + + std::function func; + + static void callback(void*); + }; + + // Memset handle + struct Memset { + }; + + // Memcpy handle + struct Memcpy { + }; + + // Kernel handle + struct Kernel { + + template + Kernel(F&& f); + + void* func {nullptr}; + }; + + // Subflow handle + struct Subflow { + cudaGraph graph; + }; + + // Capture + struct Capture { + + template + Capture(C&&); + + std::function work; + + cudaEvent_t event; + size_t level; + size_t lid; + size_t idx; + }; + + using handle_t = std::variant< + Empty, + Host, + Memset, + Memcpy, + Kernel, + Subflow, + Capture + >; + + constexpr static auto STATE_VISITED = 0x1; + + public: + + // variant index + constexpr static auto EMPTY = get_index_v; + constexpr static auto HOST = get_index_v; + constexpr static auto MEMSET = get_index_v; + constexpr static auto MEMCPY = get_index_v; + constexpr static auto KERNEL = get_index_v; + constexpr static auto SUBFLOW = get_index_v; + constexpr static auto CAPTURE = get_index_v; + + cudaNode() = delete; + + template + cudaNode(cudaGraph&, ArgsT&&...); + + private: + + cudaGraph& _graph; + + std::string _name; + + handle_t _handle; + + cudaGraphNode_t _native_handle {nullptr}; + + std::vector _successors; + std::vector _dependents; + + void _precede(cudaNode*); + //void _set_state(int); + //void _unset_state(int); + //void _clear_state(); + //bool _has_state(int) const; +}; + +// ---------------------------------------------------------------------------- +// cudaNode definitions +// ---------------------------------------------------------------------------- + +// Host handle constructor +template +cudaNode::Host::Host(C&& c) : func {std::forward(c)} { +} + +// Host callback +inline void cudaNode::Host::callback(void* data) { + static_cast(data)->func(); +}; + +// Kernel handle constructor +template +cudaNode::Kernel::Kernel(F&& f) : + func {std::forward(f)} { +} + +// Capture handle constructor +template +cudaNode::Capture::Capture(C&& work) : + work {std::forward(work)} { +} + +// Constructor +template +cudaNode::cudaNode(cudaGraph& graph, ArgsT&&... 
args) : + _graph {graph}, + _handle {std::forward(args)...} { +} + +// Procedure: _precede +inline void cudaNode::_precede(cudaNode* v) { + + _successors.push_back(v); + v->_dependents.push_back(this); + + // capture node doesn't have the native graph yet + if(_handle.index() != cudaNode::CAPTURE) { + TF_CHECK_CUDA( + ::cudaGraphAddDependencies( + _graph._native_handle, &_native_handle, &v->_native_handle, 1 + ), + "failed to add a preceding link ", this, "->", v + ); + } +} + +//// Procedure: _set_state +//inline void cudaNode::_set_state(int flag) { +// _state |= flag; +//} +// +//// Procedure: _unset_state +//inline void cudaNode::_unset_state(int flag) { +// _state &= ~flag; +//} +// +//// Procedure: _clear_state +//inline void cudaNode::_clear_state() { +// _state = 0; +//} +// +//// Function: _has_state +//inline bool cudaNode::_has_state(int flag) const { +// return _state & flag; +//} + +// ---------------------------------------------------------------------------- +// cudaGraph definitions +// ---------------------------------------------------------------------------- + +// Destructor +inline cudaGraph::~cudaGraph() { + clear(); + assert(_native_handle == nullptr); +} + +// Move constructor +inline cudaGraph::cudaGraph(cudaGraph&& g) : + _native_handle {g._native_handle}, + _nodes {std::move(g._nodes)} { + + g._native_handle = nullptr; + + assert(g._nodes.empty()); +} + +// Move assignment +inline cudaGraph& cudaGraph::operator = (cudaGraph&& rhs) { + + clear(); + + // lhs + _native_handle = rhs._native_handle; + _nodes = std::move(rhs._nodes); + + assert(rhs._nodes.empty()); + + // rhs + rhs._native_handle = nullptr; + + return *this; +} + +// Function: empty +inline bool cudaGraph::empty() const { + return _nodes.empty(); +} + +// Procedure: clear +inline void cudaGraph::clear() { + for(auto n : _nodes) { + delete n; + } + _nodes.clear(); +} + +// Function: emplace_back +template +cudaNode* cudaGraph::emplace_back(ArgsT&&... 
args) { + //auto node = std::make_unique(std::forward(args)...); + //_nodes.emplace_back(std::move(node)); + //return _nodes.back().get(); + // TODO: object pool + + auto node = new cudaNode(std::forward(args)...); + _nodes.push_back(node); + return node; +} + +// Procedure: dump the graph to a DOT format +inline void cudaGraph::dump( + std::ostream& os, const void* root, const std::string& root_name +) const { + + // recursive dump with stack + std::stack> stack; + stack.push(std::make_tuple(this, nullptr, 1)); + + int pl = 0; + + while(!stack.empty()) { + + auto [graph, parent, l] = stack.top(); + stack.pop(); + + for(int i=0; i_name.empty()) os << 'p' << parent; + else os << parent->_name; + os << "\";\n" << "color=\"purple\"\n"; + } + + for(auto& v : graph->_nodes) { + + os << 'p' << v << "[label=\""; + if(v->_name.empty()) { + os << 'p' << v << "\""; + } + else { + os << v->_name << "\""; + } + + switch(v->_handle.index()) { + case cudaNode::KERNEL: + os << " style=\"filled\"" + << " color=\"white\" fillcolor=\"black\"" + << " fontcolor=\"white\"" + << " shape=\"box3d\""; + break; + + case cudaNode::SUBFLOW: + stack.push(std::make_tuple( + &std::get(v->_handle).graph, v, l+1) + ); + os << " style=\"filled\"" + << " color=\"black\" fillcolor=\"purple\"" + << " fontcolor=\"white\"" + << " shape=\"folder\""; + break; + + default: + break; + } + + os << "];\n"; + + for(const auto s : v->_successors) { + os << 'p' << v << " -> " << 'p' << s << ";\n"; + } + + if(v->_successors.size() == 0) { + if(parent == nullptr) { + if(root) { + os << 'p' << v << " -> p" << root << ";\n"; + } + } + else { + os << 'p' << v << " -> p" << parent << ";\n"; + } + } + } + + // set the previous level + pl = l; + } + + for(int i=0; iN*sizeof(T) bytes of memory +on the given device @c d and returns a pointer to the starting address of +the device memory. +*/ +template +T* cuda_malloc_device(size_t N, int d) { + cudaScopedDevice ctx(d); + T* ptr {nullptr}; + TF_CHECK_CUDA( + cudaMalloc(&ptr, N*sizeof(T)), + "failed to allocate memory (", N*sizeof(T), "bytes) on device ", d + ) + return ptr; +} + +/** +@brief allocates memory on the current device associated with the caller + +The function calls cuda_malloc_device from the current device associated +with the caller. +*/ +template +T* cuda_malloc_device(size_t N) { + return cuda_malloc_device(N, cuda_get_device()); +} + +/** +@brief allocates shared memory for holding @c N elements of type @c T + +The function calls @c cudaMallocManaged to allocate N*sizeof(T) bytes +of memory and returns a pointer to the starting address of the shared memory. +*/ +template +T* cuda_malloc_shared(size_t N) { + T* ptr {nullptr}; + TF_CHECK_CUDA( + cudaMallocManaged(&ptr, N*sizeof(T)), + "failed to allocate shared memory (", N*sizeof(T), "bytes)" + ) + return ptr; +} + +/** +@brief frees memory on the GPU device + +@tparam T pointer type +@param ptr device pointer to memory to free +@param d device context identifier + +This methods call @c cudaFree to free the memory space pointed to by @c ptr +using the given device context. +*/ +template +void cuda_free(T* ptr, int d) { + cudaScopedDevice ctx(d); + TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d); +} + +/** +@brief frees memory on the GPU device + +@tparam T pointer type +@param ptr device pointer to memory to free + +This methods call @c cudaFree to free the memory space pointed to by @c ptr +using the current device context of the caller. 
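
For example, the allocation, copy, and free helpers in this header can be
combined as follows. This is a minimal illustrative sketch (the buffer size,
host data, and the explicitly created stream are arbitrary choices, not part
of the library's documentation):

@code{.cpp}
std::vector<float> host(1024, 1.0f);

// allocate 1024 floats on the device currently associated with the caller
float* dptr = tf::cuda_malloc_device<float>(1024);

cudaStream_t stream;
cudaStreamCreate(&stream);

// copy host -> device asynchronously, then wait for completion
tf::cuda_memcpy_async(stream, dptr, host.data(), 1024*sizeof(float));
cudaStreamSynchronize(stream);

// release the device memory using the caller's current device context
tf::cuda_free(dptr);
cudaStreamDestroy(stream);
@endcode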
+*/ +template +void cuda_free(T* ptr) { + cuda_free(ptr, cuda_get_device()); +} + +/** +@brief copies data between host and device asynchronously through a stream + +@param stream stream identifier +@param dst destination memory address +@param src source memory address +@param count size in bytes to copy + +The method calls @c cudaMemcpyAsync with the given @c stream +using @c cudaMemcpyDefault to infer the memory space of the source and +the destination pointers. The memory areas may not overlap. +*/ +inline void cuda_memcpy_async( + cudaStream_t stream, void* dst, const void* src, size_t count +) { + TF_CHECK_CUDA( + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream), + "failed to perform cudaMemcpyAsync" + ); +} + +/** +@brief initializes or sets GPU memory to the given value byte by byte + +@param stream stream identifier +@param devPtr pointer to GPU mempry +@param value value to set for each byte of the specified memory +@param count size in bytes to set + +The method calls @c cudaMemsetAsync with the given @c stream +to fill the first @c count bytes of the memory area pointed to by @c devPtr +with the constant byte value @c value. +*/ +inline void cuda_memset_async( + cudaStream_t stream, void* devPtr, int value, size_t count +){ + TF_CHECK_CUDA( + cudaMemsetAsync(devPtr, value, count, stream), + "failed to perform cudaMemsetAsync" + ); +} + +// ---------------------------------------------------------------------------- +// Shared Memory +// ---------------------------------------------------------------------------- +// +// Because dynamically sized shared memory arrays are declared "extern", +// we can't templatize them directly. To get around this, we declare a +// simple wrapper struct that will declare the extern array with a different +// name depending on the type. This avoids compiler errors about duplicate +// definitions. +// +// To use dynamically allocated shared memory in a templatized __global__ or +// __device__ function, just replace code like this: +// +// template +// __global__ void +// foo( T* g_idata, T* g_odata) +// { +// // Shared mem size is determined by the host app at run time +// extern __shared__ T sdata[]; +// ... +// doStuff(sdata); +// ... +// } +// +// With this: +// +// template +// __global__ void +// foo( T* g_idata, T* g_odata) +// { +// // Shared mem size is determined by the host app at run time +// cudaSharedMemory smem; +// T* sdata = smem.get(); +// ... +// doStuff(sdata); +// ... +// } +// ---------------------------------------------------------------------------- + +// This is the un-specialized struct. Note that we prevent instantiation of this +// struct by putting an undefined symbol in the function body so it won't compile. +/** +@private +*/ +template +struct cudaSharedMemory +{ + // Ensure that we won't compile any un-specialized types + __device__ T *get() + { + extern __device__ void error(void); + error(); + return NULL; + } +}; + +// Following are the specializations for the following types. +// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double +// One could also specialize it for user-defined types. 
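
// As an illustrative sketch (not part of the original header), the same
// pattern extends to a user-defined, trivially copyable type; the names
// MyVec and s_myvec below are placeholders:
//
//   struct MyVec { float x, y, z; };
//
//   template <>
//   struct cudaSharedMemory<MyVec>
//   {
//     __device__ MyVec *get()
//     {
//       extern __shared__ MyVec s_myvec[];
//       return s_myvec;
//     }
//   };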
+ +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ int *get() + { + extern __shared__ int s_int[]; + return s_int; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ unsigned int *get() + { + extern __shared__ unsigned int s_uint[]; + return s_uint; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ char *get() + { + extern __shared__ char s_char[]; + return s_char; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ unsigned char *get() + { + extern __shared__ unsigned char s_uchar[]; + return s_uchar; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ short *get() + { + extern __shared__ short s_short[]; + return s_short; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ unsigned short *get() + { + extern __shared__ unsigned short s_ushort[]; + return s_ushort; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ long *get() + { + extern __shared__ long s_long[]; + return s_long; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ unsigned long *get() + { + extern __shared__ unsigned long s_ulong[]; + return s_ulong; + } +}; + +//template <> +//struct cudaSharedMemory +//{ +// __device__ size_t *get() +// { +// extern __shared__ size_t s_sizet[]; +// return s_sizet; +// } +//}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ bool *get() + { + extern __shared__ bool s_bool[]; + return s_bool; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ float *get() + { + extern __shared__ float s_float[]; + return s_float; + } +}; + +/** +@private +*/ +template <> +struct cudaSharedMemory +{ + __device__ double *get() + { + extern __shared__ double s_double[]; + return s_double; + } +}; + +} // end of namespace tf ----------------------------------------------------- + + + + + + diff --git a/taskflow/cuda/cuda_optimizer.hpp b/taskflow/cuda/cuda_optimizer.hpp new file mode 100644 index 0000000..bb3ea6d --- /dev/null +++ b/taskflow/cuda/cuda_optimizer.hpp @@ -0,0 +1,638 @@ +#pragma once + +#include "cuda_graph.hpp" + +/** +@file cuda_optimizer.hpp +@brief %cudaFlow capturing algorithms include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaCapturingBase +// ---------------------------------------------------------------------------- + +/** +@private + +@brief class to provide helper common methods for optimization algorithms +*/ +class cudaCapturingBase { + + protected: + + std::vector _toposort(cudaGraph&); + std::vector> _levelize(cudaGraph&); +}; + +// Function: _toposort +inline std::vector cudaCapturingBase::_toposort(cudaGraph& graph) { + + std::vector res; + std::queue bfs; + + res.reserve(graph._nodes.size()); + + // insert the first level of nodes into the queue + for(auto u : graph._nodes) { + + auto& hu = std::get(u->_handle); + hu.level = u->_dependents.size(); + + if(hu.level == 0) { + bfs.push(u); + } + } + + // levelize the graph using bfs + while(!bfs.empty()) { + + auto u = bfs.front(); + bfs.pop(); + + res.push_back(u); + + auto& hu = std::get(u->_handle); + + for(auto v : u->_successors) { + auto& hv = std::get(v->_handle); + if(--hv.level == 0) { + bfs.push(v); + } + } + } + + return res; +} + +// Function: _levelize +inline std::vector> +cudaCapturingBase::_levelize(cudaGraph& graph) { + + std::queue bfs; + + 
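  // The BFS below reuses each Capture handle's 'level' field twice: it is
  // first initialized to the node's in-degree (the number of dependents)
  // and, once that counter drops to zero, overwritten with the node's actual
  // level; max_level tracks the deepest level seen so that the level graph
  // can be sized afterwards.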
size_t max_level = 0; + + // insert the first level of nodes into the queue + for(auto u : graph._nodes) { + + auto& hu = std::get(u->_handle); + hu.level = u->_dependents.size(); + + if(hu.level == 0) { + bfs.push(u); + } + } + + // levelize the graph using bfs + while(!bfs.empty()) { + + auto u = bfs.front(); + bfs.pop(); + + auto& hu = std::get(u->_handle); + + for(auto v : u->_successors) { + auto& hv = std::get(v->_handle); + if(--hv.level == 0) { + hv.level = hu.level + 1; + if(hv.level > max_level) { + max_level = hv.level; + } + bfs.push(v); + } + } + } + + // set level_graph and each node's idx + std::vector> level_graph(max_level+1); + for(auto u : graph._nodes) { + auto& hu = std::get(u->_handle); + hu.lid = level_graph[hu.level].size(); + level_graph[hu.level].emplace_back(u); + + //for(auto s : u->_successors) { + // assert(hu.level < std::get(s->_handle).level); + //} + } + + return level_graph; +} + +// ---------------------------------------------------------------------------- +// class definition: cudaSequentialCapturing +// ---------------------------------------------------------------------------- + +/** +@class cudaSequentialCapturing + +@brief class to capture the described graph into a native cudaGraph + using a single stream + +A sequential capturing algorithm finds a topological order of +the described graph and captures dependent GPU tasks using a single stream. +All GPU tasks run sequentially without breaking inter dependencies. +*/ +class cudaSequentialCapturing : public cudaCapturingBase { + + friend class cudaFlowCapturer; + + public: + + /** + @brief constructs a sequential optimizer + */ + cudaSequentialCapturing() = default; + + private: + + cudaGraph_t _optimize(cudaGraph& graph); +}; + +inline cudaGraph_t cudaSequentialCapturing::_optimize(cudaGraph& graph) { + // acquire per-thread stream and turn it into capture mode + // we must use ThreadLocal mode to avoid clashing with CUDA global states + cudaScopedPerThreadStream stream; + + cudaGraph_t native_g; + + TF_CHECK_CUDA( + cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal), + "failed to turn stream into per-thread capture mode" + ); + + auto ordered = _toposort(graph); + for(auto& node : ordered) { + std::get(node->_handle).work(stream); + } + + TF_CHECK_CUDA( + cudaStreamEndCapture(stream, &native_g), "failed to end capture" + ); + + return native_g; +} + +// ---------------------------------------------------------------------------- +// class definition: cudaRoundRobinCapturing +// ---------------------------------------------------------------------------- + +/** +@class cudaRoundRobinCapturing + +@brief class to capture the described graph into a native cudaGraph + using a greedy round-robin algorithm on a fixed number of streams + +A round-robin capturing algorithm levelizes the user-described graph +and assign streams to nodes in a round-robin order level by level. 
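
A minimal sketch of the interface declared below, covering construction and
stream-count control only (how the optimizer is attached to a
tf::cudaFlowCapturer is outside the scope of this example):

@code{.cpp}
tf::cudaRoundRobinCapturing rr;       // uses 4 streams by default
tf::cudaRoundRobinCapturing rr8(8);   // or pick the number of streams explicitly

rr8.num_streams(2);                   // change the stream count; 0 throws
size_t n = rr8.num_streams();         // n == 2
@endcode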
+*/ +class cudaRoundRobinCapturing : public cudaCapturingBase { + + friend class cudaFlowCapturer; + + public: + + /** + @brief constructs a round-robin optimizer with 4 streams by default + */ + cudaRoundRobinCapturing(); + + /** + @brief constructs a round-robin optimizer with the given number of streams + */ + cudaRoundRobinCapturing(size_t num_streams); + + /** + @brief queries the number of streams used by the optimizer + */ + size_t num_streams() const; + + /** + @brief sets the number of streams used by the optimizer + */ + void num_streams(size_t n); + + private: + + size_t _num_streams {4}; + + cudaGraph_t _optimize(cudaGraph& graph); + + void _reset(std::vector>& graph); + +}; + +// Constructor +inline cudaRoundRobinCapturing::cudaRoundRobinCapturing(size_t num_streams) : + _num_streams {num_streams} { + + if(num_streams == 0) { + TF_THROW("number of streams must be at least one"); + } +} + +// Function: num_streams +inline size_t cudaRoundRobinCapturing::num_streams() const { + return _num_streams; +} + +// Procedure: num_streams +inline void cudaRoundRobinCapturing::num_streams(size_t n) { + if(n == 0) { + TF_THROW("number of streams must be at least one"); + } + _num_streams = n; +} + +inline void cudaRoundRobinCapturing::_reset(std::vector>& graph) { + //level == global id + //idx == stream id we want to skip + size_t id{0}; + for(auto& each_level: graph) { + for(auto& node: each_level) { + auto& hn = std::get(node->_handle); + hn.level = id++; + hn.idx = _num_streams; + hn.event = nullptr; + } + } +} + +// Function: _optimize +inline cudaGraph_t cudaRoundRobinCapturing::_optimize(cudaGraph& graph) { + + // levelize the graph + auto levelized = _levelize(graph); + + // initialize the data structure + _reset(levelized); + + // begin to capture + std::vector streams(_num_streams); + + TF_CHECK_CUDA( + cudaStreamBeginCapture(streams[0], cudaStreamCaptureModeThreadLocal), + "failed to turn stream into per-thread capture mode" + ); + + // reserve space for scoped events + std::vector events; + events.reserve((_num_streams >> 1) + levelized.size()); + + // fork + cudaEvent_t fork_event = events.emplace_back(); + TF_CHECK_CUDA( + cudaEventRecord(fork_event, streams[0]), "faid to record fork" + ); + + for(size_t i = 1; i < streams.size(); ++i) { + TF_CHECK_CUDA( + cudaStreamWaitEvent(streams[i], fork_event, 0), "failed to wait on fork" + ); + } + + // assign streams to levelized nodes in a round-robin manner + for(auto& each_level: levelized) { + for(auto& node: each_level) { + auto& hn = std::get(node->_handle); + size_t sid = hn.lid % _num_streams; + + //wait events + cudaNode* wait_node{nullptr}; + for(auto& pn: node->_dependents) { + auto& phn = std::get(pn->_handle); + size_t psid = phn.lid % _num_streams; + + //level == global id + //idx == stream id we want to skip + if(psid == hn.idx) { + if(wait_node == nullptr || std::get(wait_node->_handle).level < phn.level) { + wait_node = pn; + } + } + else if(psid != sid) { + TF_CHECK_CUDA( + cudaStreamWaitEvent(streams[sid], phn.event, 0), + "failed to wait on node's stream" + ); + } + } + + if(wait_node != nullptr) { + assert(std::get(wait_node->_handle).event); + TF_CHECK_CUDA( + cudaStreamWaitEvent( + streams[sid], + std::get(wait_node->_handle).event, + 0 + ), "failed to wait on node's stream" + ); + } + + //capture + hn.work(streams[sid]); + + //create/record stream + for(auto& sn: node->_successors) { + auto& shn = std::get(sn->_handle); + size_t ssid = shn.lid % _num_streams; + if(ssid != sid) { + if(!hn.event) { + hn.event = 
events.emplace_back(); + TF_CHECK_CUDA( + cudaEventRecord(hn.event, streams[sid]), "failed to record node's stream" + ); + } + //idx == stream id we want to skip + shn.idx = sid; + } + } + } + } + + // join + for(size_t i=1; i<_num_streams; ++i) { + cudaEvent_t join_event = events.emplace_back(); + TF_CHECK_CUDA( + cudaEventRecord(join_event, streams[i]), "failed to record join" + ); + TF_CHECK_CUDA( + cudaStreamWaitEvent(streams[0], join_event), "failed to wait on join" + ); + } + + cudaGraph_t native_g; + + TF_CHECK_CUDA( + cudaStreamEndCapture(streams[0], &native_g), "failed to end capture" + ); + + //tf::cuda_dump_graph(std::cout, native_g); + //std::cout << '\n'; + + return native_g; +} + + +/*class cudaGreedyCapturing: public cudaCapturingBase { + + friend class cudaFlowCapturer; + + public: + + cudaGreedyCapturing(); + + cudaGreedyCapturing(size_t num_stream); + + private: + + size_t _num_streams{4}; + + cudaGraph_t _optimize(cudaGraph& graph); +}; + +inline cudaGreedyCapturing::cudaGreedyCapturing(size_t num_streams): + _num_streams {num_streams} { + + if(num_streams == 0) { + TF_THROW("number of streams must be at least one"); + } +} + +inline cudaGraph_t cudaGreedyCapturing::_optimize(cudaGraph& graph) { + // levelize the graph + auto level_graph = _levelize(graph); + + // begin to capture + std::vector streams(_num_streams); + + TF_CHECK_CUDA( + cudaStreamBeginCapture(streams[0], cudaStreamCaptureModeThreadLocal), + "failed to turn stream into per-thread capture mode" + ); + + // reserve space for scoped events + std::vector events; + events.reserve((_num_streams >> 1) + level_graph.size()); + + // fork + cudaEvent_t fork_event = events.emplace_back(); + TF_CHECK_CUDA( + cudaEventRecord(fork_event, streams[0]), "faid to record fork" + ); + + for(size_t i = 1; i < streams.size(); ++i) { + TF_CHECK_CUDA( + cudaStreamWaitEvent(streams[i], fork_event, 0), "failed to wait on fork" + ); + } + + //assign sid to each node + std::vector assign(streams.size()); + std::vector prev_assign(streams.size()); + std::queue remains;; + size_t counter{0}; + + //first level (we don't have any predecessors in the firset level) + for(auto& node : level_graph[0]) { + auto& hn = std::get(node->_handle); + + hn.work(streams[counter]); + hn.sid = counter; + prev_assign[hn.sid] = node; + if(++counter == streams.size()) { + counter = 0; + } + } + + //other levels + for(size_t l = 1; l < level_graph.size(); ++l) { + for(size_t n = 0; n < level_graph[l].size(); ++n) { + + auto& node = level_graph[l][n]; + auto& hn = std::get(node->_handle); + auto& preds = node->_dependents; + + //1. try to assign the same stream as one of parent nodes' + //maybe assigned by 2 in advance + if(hn.sid == -1) { + for(auto& pn: preds) { + auto& phn = std::get(pn->_handle); + //TODO:ignore cross-level? (currently no, since we still need to create events for cross-level dependencies) + //TODO::we may not choose the most suitable stream if the node has multiple parents + if(assign[phn.sid] == nullptr) { + hn.sid = phn.sid; + assign[hn.sid] = node; + //sgraph[hn.sid].emplace_back(node); + + ++counter; + break; + } + } + } + + if(hn.sid == -1) { + //2. if 1. failed + //try to find idle stream + //however, the idle stream may be 'booked' by incoming nodes + //we then skip such case to avoid the cost of creating event + //in other words, we let future nodes use 1. 
as much as possible + for(size_t s = 0; s < streams.size(); ++s) { + if(assign[s] == nullptr) { + bool is_assigned{false}; + + if(prev_assign[s] != nullptr) { + auto& pan = prev_assign[s]; + auto& pahn = std::get(pan->_handle); + + for(auto& sn: pan->_successors) { + auto& shn = std::get(sn->_handle); + if(shn.level - pahn.level == 1 && shn.sid == -1) { + shn.sid = s; + assign[s] = sn; + ++counter; + is_assigned = true; + break; + } + } + } + + if(!is_assigned) { + hn.sid = s; + assign[s] = node; + ++counter; + } + + break; + } + } + } + + //3. if 1. and 2. failed + //emplace_back to remains + if(hn.sid == -1) { + remains.push(node); + } + + if(counter == streams.size()) { + //insert remaining nodes into remains + for(size_t i = n + 1; i < level_graph[l].size(); ++i) { + auto& ln = level_graph[l][i]; + auto& lhn = std::get(ln->_handle); + if(lhn.sid == -1) { + remains.push(ln); + } + } + + prev_assign = std::move(assign); + assign.resize(streams.size()); + counter = 0; + + break; + } + } + + + while(!remains.empty()) { + //r1. same as 1 + auto& node = remains.front(); + auto& hn = std::get(node->_handle); + auto& preds = node->_dependents; + remains.pop(); + + for(auto& pn: preds) { + auto& phn = std::get(pn->_handle); + //TODO: ignore cross-level? + if(assign[phn.sid] == nullptr) { + hn.sid = phn.sid; + assign[hn.sid] = node; + prev_assign[hn.sid] = node; + //sgraph[hn.sid].emplace_back(node); + ++counter; + } + } + + //if r1 failed + //r2. + if(hn.sid == -1) { + for(size_t s = 0; s < streams.size(); ++s) { + if(assign[s] == nullptr) { + hn.sid = s; + assign[hn.sid] = node; + prev_assign[hn.sid] = node; + //sgraph[hn.sid].emplace_back(node); + ++counter; + } + } + } + + if(counter == streams.size()) { + assign.clear(); + assign.resize(streams.size()); + counter = 0; + } + } + + //reset + counter = 0; + assign.clear(); + assign.resize(streams.size()); + } + + //add events and work + for(auto& level_nodes: level_graph) { + for(auto& node: level_nodes) { + auto& hn = std::get(node->_handle); + + //wait + for(auto& pn: node->_dependents) { + auto& phn = std::get(pn->_handle); + if(phn.sid != hn.sid) { + TF_CHECK_CUDA( + cudaStreamWaitEvent(streams[hn.sid], phn.event), "failed to wait on the node" + ); + } + } + + //capture + hn.work(streams[hn.sid]); + + //create event + for(auto& sn : node->_successors) { + auto& shn = std::get(sn->_handle); + if(hn.sid != shn.sid && hn.event == nullptr) { + hn.event = events.emplace_back(); + TF_CHECK_CUDA( + cudaEventRecord(hn.event, streams[hn.sid]), "failed to record" + ); + } + } + } + } + + + // join + for(size_t i=1; i<_num_streams; ++i) { + cudaEvent_t join_event = events.emplace_back(); + TF_CHECK_CUDA( + cudaEventRecord(join_event, streams[i]), "failed to record join" + ); + TF_CHECK_CUDA( + cudaStreamWaitEvent(streams[0], join_event), "failed to wait on join" + ); + } + + cudaGraph_t native_g; + + TF_CHECK_CUDA( + cudaStreamEndCapture(streams[0], &native_g), "failed to end capture" + ); + + //tf::cuda_dump_graph(std::cout, native_g); + + return native_g; +} */ + +} // end of namespace tf ----------------------------------------------------- + diff --git a/taskflow/cuda/cuda_pool.hpp b/taskflow/cuda/cuda_pool.hpp new file mode 100644 index 0000000..a8d15ab --- /dev/null +++ b/taskflow/cuda/cuda_pool.hpp @@ -0,0 +1,182 @@ +#pragma once + +#include "cuda_error.hpp" + +namespace tf { + +/** +@brief per-thread object pool to manage CUDA device object + +@tparam H object type +@tparam C function object to create a library object +@tparam D function 
object to delete a library object + +A CUDA device object has a lifetime associated with a device, +for example, @c cudaStream_t, @c cublasHandle_t, etc. +Creating a device object is typically expensive (e.g., 10-200 ms) +and destroying it may trigger implicit device synchronization. +For applications tha intensively make use of device objects, +it is desirable to reuse them as much as possible. + +There exists an one-to-one relationship between CUDA devices in CUDA Runtime API +and CUcontexts in the CUDA Driver API within a process. +The specific context which the CUDA Runtime API uses for a device +is called the device's primary context. +From the perspective of the CUDA Runtime API, +a device and its primary context are synonymous. + +We design the device object pool in a decentralized fashion by keeping +(1) a global pool to keep track of potentially usable objects and +(2) a per-thread pool to footprint objects with shared ownership. +The global pool does not own the object and therefore does not destruct any of them. +The per-thread pool keeps the footprints of objects with shared ownership +and will destruct them if the thread holds the last reference count after it joins. +The motivation of this decentralized control is to avoid device objects +from being destroyed while the context had been destroyed due to driver shutdown. + +*/ +template +class cudaPerThreadDeviceObjectPool { + + public: + + /** + @brief structure to store a context object + */ + struct Object { + + int device; + H value; + + Object(int); + ~Object(); + + Object(const Object&) = delete; + Object(Object&&) = delete; + }; + + private: + + // Master thread hold the storage to the pool. + // Due to some ordering, cuda context may be destroyed when the master + // program thread destroys the cuda object. + // Therefore, we use a decentralized approach to let child thread + // destroy cuda objects while the master thread only keeps a weak reference + // to those objects for reuse. 
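
  // Illustrative usage sketch of the enclosing pool (StreamCreator and
  // StreamDeleter are placeholder functors; the per-thread stream and event
  // pools in cuda_stream.hpp are the real instantiations of this template):
  //
  //   cudaPerThreadDeviceObjectPool<cudaStream_t, StreamCreator, StreamDeleter> pool;
  //   auto obj = pool.acquire(0);          // shared ownership of a stream on device 0
  //   cudaStreamSynchronize(obj->value);   // use the wrapped native handle
  //   pool.release(std::move(obj));        // keep a footprint for later reuse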
+ struct cudaGlobalDeviceObjectPool { + + std::shared_ptr acquire(int); + void release(int, std::weak_ptr); + + std::mutex mutex; + std::unordered_map>> pool; + }; + + public: + + /** + @brief default constructor + */ + cudaPerThreadDeviceObjectPool() = default; + + /** + @brief acquires a device object with shared ownership + */ + std::shared_ptr acquire(int); + + /** + @brief releases a device object with moved ownership + */ + void release(std::shared_ptr&&); + + /** + @brief queries the number of device objects with shared ownership + */ + size_t footprint_size() const; + + private: + + inline static cudaGlobalDeviceObjectPool _shared_pool; + + std::unordered_set> _footprint; +}; + +// ---------------------------------------------------------------------------- +// cudaPerThreadDeviceObject::cudaHanale definition +// ---------------------------------------------------------------------------- + +template +cudaPerThreadDeviceObjectPool::Object::Object(int d) : + device {d} { + cudaScopedDevice ctx(device); + value = C{}(); +} + +template +cudaPerThreadDeviceObjectPool::Object::~Object() { + cudaScopedDevice ctx(device); + D{}(value); +} + +// ---------------------------------------------------------------------------- +// cudaPerThreadDeviceObject::cudaHanaldePool definition +// ---------------------------------------------------------------------------- + +template +std::shared_ptr::Object> +cudaPerThreadDeviceObjectPool::cudaGlobalDeviceObjectPool::acquire(int d) { + std::scoped_lock lock(mutex); + if(auto itr = pool.find(d); itr != pool.end()) { + while(!itr->second.empty()) { + auto sptr = itr->second.back().lock(); + itr->second.pop_back(); + if(sptr) { + return sptr; + } + } + } + return nullptr; +} + +template +void cudaPerThreadDeviceObjectPool::cudaGlobalDeviceObjectPool::release( + int d, std::weak_ptr ptr +) { + std::scoped_lock lock(mutex); + pool[d].push_back(ptr); +} + +// ---------------------------------------------------------------------------- +// cudaPerThreadDeviceObject definition +// ---------------------------------------------------------------------------- + +template +std::shared_ptr::Object> +cudaPerThreadDeviceObjectPool::acquire(int d) { + + auto ptr = _shared_pool.acquire(d); + + if(!ptr) { + ptr = std::make_shared(d); + } + + return ptr; +} + +template +void cudaPerThreadDeviceObjectPool::release( + std::shared_ptr&& ptr +) { + _shared_pool.release(ptr->device, ptr); + _footprint.insert(std::move(ptr)); +} + +template +size_t cudaPerThreadDeviceObjectPool::footprint_size() const { + return _footprint.size(); +} + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/taskflow/cuda/cuda_stream.hpp b/taskflow/cuda/cuda_stream.hpp new file mode 100644 index 0000000..3e80b48 --- /dev/null +++ b/taskflow/cuda/cuda_stream.hpp @@ -0,0 +1,286 @@ +#pragma once + +#include "cuda_pool.hpp" + +/** +@file cuda_stream.hpp +@brief CUDA stream utilities include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaStreamCreator and cudaStreamDeleter for per-thread stream pool +// ---------------------------------------------------------------------------- + +/** @private */ +struct cudaStreamCreator { + + /** + @brief operator to create a CUDA stream + */ + cudaStream_t operator () () const { + cudaStream_t stream; + TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a CUDA stream"); + return stream; + } +}; + +/** @private */ +struct cudaStreamDeleter { + 
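  // Note: unlike cudaStreamCreator above, the destroy call is not wrapped in
  // TF_CHECK_CUDA, presumably because the deleter may run during thread or
  // program teardown when the CUDA context is already gone (see the design
  // notes in cuda_pool.hpp); this rationale is an editorial inference rather
  // than a statement by the original author.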
+ /** + @brief operator to destroy a CUDA stream + */ + void operator () (cudaStream_t stream) const { + cudaStreamDestroy(stream); + } +}; + +/** +@brief alias of per-thread stream pool type + */ +using cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool< + cudaStream_t, cudaStreamCreator, cudaStreamDeleter +>; + +/** +@brief acquires the per-thread cuda stream pool +*/ +inline cudaPerThreadStreamPool& cuda_per_thread_stream_pool() { + thread_local cudaPerThreadStreamPool pool; + return pool; +} + +// ---------------------------------------------------------------------------- +// cudaScopedPerThreadStream definition +// ---------------------------------------------------------------------------- + +/** +@brief class that provides RAII-styled guard of stream acquisition + +Sample usage: + +@code{.cpp} +{ + tf::cudaScopedPerThreadStream stream(1); // acquires a stream on device 1 + + // use stream as a normal cuda stream (cudaStream_t) + cudaStreamWaitEvent(stream, ...); + +} // leaving the scope releases the stream back to the pool on device 1 +@endcode + +The scoped per-thread stream is primarily used by tf::Executor to execute +CUDA tasks (e.g., tf::cudaFlow, tf::cudaFlowCapturer). + +%cudaScopedPerThreadStream is non-copyable. +*/ +class cudaScopedPerThreadStream { + + public: + + /** + @brief constructs a scoped stream under the given device + + The constructor acquires a stream from a per-thread stream pool. + + @param device device context of the requested stream + */ + explicit cudaScopedPerThreadStream(int device) : + _ptr {cuda_per_thread_stream_pool().acquire(device)} { + } + + /** + @brief constructs a scoped stream under the current device. + + The constructor acquires a stream from a per-thread stream pool. + */ + cudaScopedPerThreadStream() : + _ptr {cuda_per_thread_stream_pool().acquire(cuda_get_device())} { + } + + /** + @brief destructs the scoped stream guard + + The destructor releases the stream to the per-thread stream pool. 
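
A released stream is kept alive in the releasing thread's footprint and
advertised to the shared pool, so a later acquisition on the same device
normally reuses the very same @c cudaStream_t rather than creating a new
one. For example (illustrative only):

@code{.cpp}
{
  tf::cudaScopedPerThreadStream s;   // acquires (or lazily creates) a stream
  cudaStreamSynchronize(s);
}                                    // s is returned to the pool here
tf::cudaScopedPerThreadStream t;     // typically reuses the stream released above
@endcode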
+ */ + ~cudaScopedPerThreadStream() { + if(_ptr) { + cuda_per_thread_stream_pool().release(std::move(_ptr)); + } + } + + /** + @brief implicit conversion to the native CUDA stream (cudaStream_t) + */ + operator cudaStream_t () const { + return _ptr->value; + } + + /** + @brief disabled copy constructor + */ + cudaScopedPerThreadStream(const cudaScopedPerThreadStream&) = delete; + + /** + @brief default move constructor + */ + cudaScopedPerThreadStream(cudaScopedPerThreadStream&&) = default; + + /** + @brief disabled copy assignment + */ + cudaScopedPerThreadStream& operator = (const cudaScopedPerThreadStream&) = delete; + + /** + @brief default move assignment + */ + cudaScopedPerThreadStream& operator = (cudaScopedPerThreadStream&&) = delete; + + private: + + std::shared_ptr _ptr; + +}; + +// ---------------------------------------------------------------------------- +// cudaStreamCreator and cudaStreamDeleter for per-thread event pool +// ---------------------------------------------------------------------------- + +/** @private */ +struct cudaEventCreator { + + /** + @brief operator to create a CUDA event + */ + cudaEvent_t operator () () const { + cudaEvent_t event; + TF_CHECK_CUDA(cudaEventCreate(&event), "failed to create a CUDA event"); + return event; + } +}; + +/** @private */ +struct cudaEventDeleter { + + /** + @brief operator to destroy a CUDA event + */ + void operator () (cudaEvent_t event) const { + cudaEventDestroy(event); + } +}; + +/** +@brief alias of per-thread event pool type + */ +using cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool< + cudaEvent_t, cudaEventCreator, cudaEventDeleter +>; + +/** +@brief per-thread cuda event pool +*/ +inline cudaPerThreadEventPool& cuda_per_thread_event_pool() { + thread_local cudaPerThreadEventPool pool; + return pool; +} + +// ---------------------------------------------------------------------------- +// cudaScopedPerThreadEvent definition +// ---------------------------------------------------------------------------- + +/** +@brief class that provides RAII-styled guard of event acquisition + +Sample usage: + +@code{.cpp} +{ + tf::cudaScopedPerThreadEvent event(1); // acquires a event on device 1 + + // use event as a normal cuda event (cudaEvent_t) + cudaStreamWaitEvent(stream, event); + +} // leaving the scope releases the event back to the pool on device 1 +@endcode + +The scoped per-thread event is primarily used by tf::Executor to execute +CUDA tasks (e.g., tf::cudaFlow, tf::cudaFlowCapturer). + +%cudaScopedPerThreadEvent is non-copyable. +*/ +class cudaScopedPerThreadEvent { + + public: + + /** + @brief constructs a scoped event under the given device + + The constructor acquires a event from a per-thread event pool. + + @param device device context of the requested event + */ + explicit cudaScopedPerThreadEvent(int device) : + _ptr {cuda_per_thread_event_pool().acquire(device)} { + } + + /** + @brief constructs a scoped event under the current device. + + The constructor acquires a event from a per-thread event pool. + */ + cudaScopedPerThreadEvent() : + _ptr {cuda_per_thread_event_pool().acquire(cuda_get_device())} { + } + + /** + @brief destructs the scoped event guard + + The destructor releases the event to the per-thread event pool. 
+ */ + ~cudaScopedPerThreadEvent() { + if(_ptr) { + cuda_per_thread_event_pool().release(std::move(_ptr)); + } + } + + /** + @brief implicit conversion to the native CUDA event (cudaEvent_t) + */ + operator cudaEvent_t () const { + return _ptr->value; + } + + /** + @brief disabled copy constructor + */ + cudaScopedPerThreadEvent(const cudaScopedPerThreadEvent&) = delete; + + /** + @brief default move constructor + */ + cudaScopedPerThreadEvent(cudaScopedPerThreadEvent&&) = default; + + /** + @brief disabled copy assignment + */ + cudaScopedPerThreadEvent& operator = (const cudaScopedPerThreadEvent&) = delete; + + /** + @brief default move assignment + */ + cudaScopedPerThreadEvent& operator = (cudaScopedPerThreadEvent&&) = delete; + + private: + + std::shared_ptr _ptr; + +}; + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/taskflow/cuda/cuda_task.hpp b/taskflow/cuda/cuda_task.hpp new file mode 100644 index 0000000..200ccae --- /dev/null +++ b/taskflow/cuda/cuda_task.hpp @@ -0,0 +1,227 @@ +#pragma once + +#include "cuda_graph.hpp" + +/** +@file cuda_task.hpp +@brief cudaTask include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// cudaTask Types +// ---------------------------------------------------------------------------- + +/** +@enum cudaTaskType + +@brief enumeration of all %cudaTask types +*/ +enum class cudaTaskType : int { + EMPTY = 0, + HOST, + MEMSET, + MEMCPY, + KERNEL, + SUBFLOW, + CAPTURE, + UNDEFINED +}; + +/** +@brief convert a cuda_task type to a human-readable string +*/ +constexpr const char* to_string(cudaTaskType type) { + switch(type) { + case cudaTaskType::EMPTY: return "empty"; + case cudaTaskType::HOST: return "host"; + case cudaTaskType::MEMSET: return "memset"; + case cudaTaskType::MEMCPY: return "memcpy"; + case cudaTaskType::KERNEL: return "kernel"; + case cudaTaskType::SUBFLOW: return "subflow"; + case cudaTaskType::CAPTURE: return "capture"; + default: return "undefined"; + } +} + +// ---------------------------------------------------------------------------- +// cudaTask +// ---------------------------------------------------------------------------- + +/** +@class cudaTask + +@brief handle to a node of the internal CUDA graph +*/ +class cudaTask { + + friend class cudaFlow; + friend class cudaFlowCapturer; + friend class cudaFlowCapturerBase; + + friend std::ostream& operator << (std::ostream&, const cudaTask&); + + public: + + /** + @brief constructs an empty cudaTask + */ + cudaTask() = default; + + /** + @brief copy-constructs a cudaTask + */ + cudaTask(const cudaTask&) = default; + + /** + @brief copy-assigns a cudaTask + */ + cudaTask& operator = (const cudaTask&) = default; + + /** + @brief adds precedence links from this to other tasks + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template + cudaTask& precede(Ts&&... tasks); + + /** + @brief adds precedence links from other tasks to this + + @tparam Ts parameter pack + + @param tasks one or multiple tasks + + @return @c *this + */ + template + cudaTask& succeed(Ts&&... 
tasks); + + /** + @brief assigns a name to the task + + @param name a @std_string acceptable string + + @return @c *this + */ + cudaTask& name(const std::string& name); + + /** + @brief queries the name of the task + */ + const std::string& name() const; + + /** + @brief queries the number of successors + */ + size_t num_successors() const; + + /** + @brief queries if the task is associated with a cudaNode + */ + bool empty() const; + + /** + @brief queries the task type + */ + cudaTaskType type() const; + + /** + @brief dumps the task through an output stream + + @tparam T output stream type with insertion operator (<<) defined + @param ostream an output stream target + */ + template + void dump(T& ostream) const; + + private: + + cudaTask(cudaNode*); + + cudaNode* _node {nullptr}; +}; + +// Constructor +inline cudaTask::cudaTask(cudaNode* node) : _node {node} { +} + +// Function: precede +template +cudaTask& cudaTask::precede(Ts&&... tasks) { + (_node->_precede(tasks._node), ...); + return *this; +} + +// Function: succeed +template +cudaTask& cudaTask::succeed(Ts&&... tasks) { + (tasks._node->_precede(_node), ...); + return *this; +} + +// Function: empty +inline bool cudaTask::empty() const { + return _node == nullptr; +} + +// Function: name +inline cudaTask& cudaTask::name(const std::string& name) { + _node->_name = name; + return *this; +} + +// Function: name +inline const std::string& cudaTask::name() const { + return _node->_name; +} + +// Function: num_successors +inline size_t cudaTask::num_successors() const { + return _node->_successors.size(); +} + +// Function: type +inline cudaTaskType cudaTask::type() const { + switch(_node->_handle.index()) { + case cudaNode::EMPTY: return cudaTaskType::HOST; + case cudaNode::MEMSET: return cudaTaskType::MEMSET; + case cudaNode::MEMCPY: return cudaTaskType::MEMCPY; + case cudaNode::KERNEL: return cudaTaskType::KERNEL; + case cudaNode::SUBFLOW: return cudaTaskType::SUBFLOW; + case cudaNode::CAPTURE: return cudaTaskType::CAPTURE; + default: return cudaTaskType::UNDEFINED; + } +} + +// Procedure: dump +template +void cudaTask::dump(T& os) const { + os << "cudaTask "; + if(_node->_name.empty()) os << _node; + else os << _node->_name; + os << " [type=" << to_string(type()) << ']'; +} + +// ---------------------------------------------------------------------------- +// global ostream +// ---------------------------------------------------------------------------- + +/** +@brief overload of ostream inserter operator for cudaTask +*/ +inline std::ostream& operator << (std::ostream& os, const cudaTask& ct) { + ct.dump(os); + return os; +} + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/taskflow/cudaflow.hpp b/taskflow/cudaflow.hpp new file mode 100644 index 0000000..2724052 --- /dev/null +++ b/taskflow/cudaflow.hpp @@ -0,0 +1,14 @@ +#pragma once + +// taskflow.hpp +// ^ +// | +// cudaflow.hpp + +#include "taskflow.hpp" +#include "cuda/cuda_flow.hpp" + +/** +@file cudaflow.hpp +@brief main cudaFlow include file +*/ diff --git a/taskflow/dsl/connection.hpp b/taskflow/dsl/connection.hpp new file mode 100644 index 0000000..e4dad72 --- /dev/null +++ b/taskflow/dsl/connection.hpp @@ -0,0 +1,53 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "../core/flow_builder.hpp" +#include "task_trait.hpp" +#include "tuple_utils.hpp" +#include "type_list.hpp" + +namespace tf { +namespace dsl { +template class Connection { + using FROMs = typename TaskTrait::TaskList; + 
using TOs = typename TaskTrait::TaskList; + +public: + using FromTaskList = Unique_t>; + using ToTaskList = Unique_t>; +}; + +template > struct Chain; + +template struct Chainvoid, OUT> { + using From = F; + using type = OUT; +}; + +template +struct ChainT, OUT> { +private: + using To = typename Chain::From; + +public: + using From = F; + using type = typename Chain< + T, typename OUT::template appendTo>>::type; +}; + +template struct OneToOneLink { + template struct InstanceType { + constexpr void build(TasksCB &tasksCb) { + constexpr size_t TasksCBSize = std::tuple_size::value; + constexpr size_t FromTaskIndex = + TupleElementByF_v::template apply>; + constexpr size_t ToTaskIndex = + TupleElementByF_v::template apply>; + static_assert(FromTaskIndex < TasksCBSize && ToTaskIndex < TasksCBSize, + "fatal: not find TaskCb in TasksCB"); + std::get(tasksCb).task_.precede( + std::get(tasksCb).task_); + } + }; +}; +} // namespace dsl +}; // namespace tf diff --git a/taskflow/dsl/dsl.hpp b/taskflow/dsl/dsl.hpp new file mode 100644 index 0000000..e4130e8 --- /dev/null +++ b/taskflow/dsl/dsl.hpp @@ -0,0 +1,13 @@ +// TaskflowDSL is an experimental project that leverages C++17 to +// provide a dedicated interface for expressive taskflow programming +// +// Created by netcan: https://github.com/netcan + +#pragma once + +#include "dsl/task_dsl.hpp" + +namespace tf { + + +} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/dsl/meta_macro.hpp b/taskflow/dsl/meta_macro.hpp new file mode 100644 index 0000000..758bf68 --- /dev/null +++ b/taskflow/dsl/meta_macro.hpp @@ -0,0 +1,72 @@ +// 2020/08/30 - Created by netcan: https://github.com/netcan +// ref https://github.com/Erlkoenig90/map-macro/ +#pragma once +#ifdef _MSC_VER +#define TF_EMPTY() +#define TF_GET_ARG_COUNT_(...) \ + TF_PASTE(TF_GET_ARG_COUNT_I(__VA_ARGS__, 64, 63, 62, 61, 60, 59, 58, 57, 56, \ + 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, \ + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \ + 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, \ + 6, 5, 4, 3, 2, 1, 0, ), \ + TF_EMPTY()) + +#else +#define TF_GET_ARG_COUNT_(...) \ + TF_GET_ARG_COUNT_I(__VA_ARGS__, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, \ + 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, \ + 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, \ + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, \ + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ) +#endif + +#define TF_GET_ARG_COUNT(...) TF_GET_ARG_COUNT_(__dummy__, ##__VA_ARGS__) +#define TF_GET_ARG_COUNT_I( \ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, \ + e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, \ + e32, e33, e34, e35, e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, \ + e47, e48, e49, e50, e51, e52, e53, e54, e55, e56, e57, e58, e59, e60, e61, \ + e62, e63, e64, size, ...) \ + size + +#define TF_GET_FIRST(a, ...) a +#define TF_GET_SECOND(a, b, ...) b +#define TF_CONCATE(x, y) x##y +#define TF_PASTE(x, y) TF_CONCATE(x, y) + +#define TF_EVAL0(...) __VA_ARGS__ +#define TF_EVAL1(...) TF_EVAL0(TF_EVAL0(TF_EVAL0(__VA_ARGS__))) +#define TF_EVAL2(...) TF_EVAL1(TF_EVAL1(TF_EVAL1(__VA_ARGS__))) +#define TF_EVAL3(...) TF_EVAL2(TF_EVAL2(TF_EVAL2(__VA_ARGS__))) +#define TF_EVAL4(...) TF_EVAL3(TF_EVAL3(TF_EVAL3(__VA_ARGS__))) +#define TF_EVAL5(...) 
TF_EVAL4(TF_EVAL4(TF_EVAL4(__VA_ARGS__))) + +#ifdef _MSC_VER +// MSVC needs more evaluations +#define TF_EVAL6(...) TF_EVAL5(TF_EVAL5(TF_EVAL5(__VA_ARGS__))) +#define TF_EVAL(...) TF_EVAL6(TF_EVAL6(__VA_ARGS__)) +#else +#define TF_EVAL(...) TF_EVAL5(__VA_ARGS__) +#endif + +#define TF_MAP_END(...) +#define TF_MAP_OUT + +#define EMPTY() +#define DEFER(id) id EMPTY() + +#define TF_MAP_GET_END2() 0, TF_MAP_END +#define TF_MAP_GET_END1(...) TF_MAP_GET_END2 +#define TF_MAP_GET_END(...) TF_MAP_GET_END1 +#define TF_MAP_NEXT0(test, next, ...) next TF_MAP_OUT +#define TF_MAP_NEXT1(test, next) DEFER(TF_MAP_NEXT0)(test, next, 0) +#define TF_MAP_NEXT(test, next) TF_MAP_NEXT1(TF_MAP_GET_END test, next) + +#define TF_MAP0(f, x, peek, ...) \ + f(x) DEFER(TF_MAP_NEXT(peek, TF_MAP1))(f, peek, __VA_ARGS__) +#define TF_MAP1(f, x, peek, ...) \ + f(x) DEFER(TF_MAP_NEXT(peek, TF_MAP0))(f, peek, __VA_ARGS__) + +#define TF_MAP(f, ...) \ + TF_EVAL(TF_MAP1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) diff --git a/taskflow/dsl/task_analyzer.hpp b/taskflow/dsl/task_analyzer.hpp new file mode 100644 index 0000000..295c50b --- /dev/null +++ b/taskflow/dsl/task_analyzer.hpp @@ -0,0 +1,40 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "connection.hpp" +#include "type_list.hpp" +#include + +namespace tf { +namespace dsl { +template class TaskAnalyzer { + template + struct BuildOneToOneLink; + + template + struct BuildOneToOneLink, Ts> { + using type = Concat_t::type...>; + }; + + template + struct BuildOneToOneLink, + std::enable_if_t>> { + using type = TypeList...>; + }; + + template class OneToOneLinkSetF { + using FromTaskList = typename Link::FromTaskList; + using ToTaskList = typename Link::ToTaskList; + + public: + using type = typename BuildOneToOneLink::type; + }; + +public: + using AllTasks = Unique_t< + Concat_t>; + using OneToOneLinkSet = + Unique_t, OneToOneLinkSetF>>>; +}; + +} // namespace dsl +} // namespace tf diff --git a/taskflow/dsl/task_dsl.hpp b/taskflow/dsl/task_dsl.hpp new file mode 100644 index 0000000..9b362cf --- /dev/null +++ b/taskflow/dsl/task_dsl.hpp @@ -0,0 +1,104 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "../core/flow_builder.hpp" +#include "meta_macro.hpp" +#include "task_analyzer.hpp" +#include "task_trait.hpp" + +namespace tf { +namespace dsl { +struct EmptyContext {}; +template class TaskDsl { + using Links = Unique_t::type...>>>; + using Analyzer = typename Links::template exportTo; + + using AllTasks = typename Analyzer::AllTasks; + + template struct TaskCbWithContext { + using type = TaskCb; + }; + using TasksCB = + typename Map_t::template exportTo; + + using OneToOneLinkSet = typename Analyzer::OneToOneLinkSet; + template struct OneToOneLinkInstanceType { + using type = typename OneToOneLink::template InstanceType; + }; + using OneToOneLinkInstances = + typename Map_t::template exportTo; + +public: + constexpr TaskDsl(FlowBuilder &flow_builder, const CONTEXT &context = {}) { + build_tasks_cb(flow_builder, context, + std::make_index_sequence{}); + build_links(std::make_index_sequence{}); + } + + template Task &get_task() { + constexpr size_t TasksCBSize = std::tuple_size::value; + constexpr size_t TaskIndex = + TupleElementByF_v::template apply>; + static_assert(TaskIndex < TasksCBSize, "fatal: not find TaskCb in TasksCB"); + return std::get(tasksCb_).task_; + } + +private: + template + void build_tasks_cb(FlowBuilder &flow_builder, const CONTEXT &context, + std::index_sequence) { + auto _ = {0, 
(std::get(tasksCb_).build(flow_builder, context), 0)...}; + (void)_; + } + + template void build_links(std::index_sequence) { + auto _ = {0, (std::get(links_).build(tasksCb_), 0)...}; + (void)_; + } + +private: + TasksCB tasksCb_; + OneToOneLinkInstances links_; +}; + +template +constexpr TaskDsl taskDsl(FlowBuilder &flow_builder, + CONTEXT &&context = {}) { + return {flow_builder, context}; +} + +} // namespace dsl +} // namespace tf + +/////////////////////////////////////////////////////////////////////////////// +#define TF_CHAIN(link) , link->void +#define TF_CONTEXT_1(name) tf::dsl::EmptyContext +#define TF_CONTEXT_2(name, context) context +#define TF_CAPTURE_THIS_1 +#define TF_CAPTURE_THIS_2 *this + +/////////////////////////////////////////////////////////////////////////////// +// make_task(TASK_NAME, { return a action lambda }) +#define make_task(name, ...) \ + struct TF_GET_FIRST name : tf::dsl::TaskSignature, \ + TF_PASTE(TF_CONTEXT_, TF_GET_ARG_COUNT name) \ + name { \ + using _ContextType = TF_PASTE(TF_CONTEXT_, TF_GET_ARG_COUNT name) name; \ + TF_GET_FIRST name(const _ContextType &context) : _ContextType(context) {} \ + auto operator()() { \ + return [TF_PASTE(TF_CAPTURE_THIS_, TF_GET_ARG_COUNT name)] __VA_ARGS__; \ + } \ + } + +// some_tasks(A, B, C) means SomeTask +#define some_tasks(...) auto (*)(tf::dsl::SomeTask<__VA_ARGS__>) +// same as some_tasks +#define fork_tasks(...) some_tasks(__VA_ARGS__) +// same as some_tasks +#define merge_tasks(...) some_tasks(__VA_ARGS__) +// task(A) means a task A +#define task(Task) auto (*)(Task) +// taskbuild(...) build a task dsl graph +#define build_taskflow(...) tf::dsl::taskDsl + diff --git a/taskflow/dsl/task_trait.hpp b/taskflow/dsl/task_trait.hpp new file mode 100644 index 0000000..bc8eeb6 --- /dev/null +++ b/taskflow/dsl/task_trait.hpp @@ -0,0 +1,46 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include "../core/flow_builder.hpp" +#include "../core/task.hpp" +#include "type_list.hpp" +#include + +namespace tf { +namespace dsl { +struct TaskSignature {}; + +template struct TaskCb { + using TaskType = TASK; + void build(FlowBuilder &build, const CONTEXT &context) { + task_ = build.emplace(TaskType{context}()); + } + + Task task_; +}; + +template struct IsTask { + template struct apply { + constexpr static bool value = + std::is_same::value; + }; +}; + +template struct TaskTrait; + +template struct SomeTask { + using TaskList = + Unique_t::TaskList...>>>; +}; + +// a task self +template +struct TaskTrait< + TASK, std::enable_if_t::value>> { + using TaskList = TypeList; +}; + +template struct TaskTrait> { + using TaskList = typename SomeTask::TaskList; +}; +} // namespace dsl +} // namespace tf diff --git a/taskflow/dsl/tuple_utils.hpp b/taskflow/dsl/tuple_utils.hpp new file mode 100644 index 0000000..633ba0e --- /dev/null +++ b/taskflow/dsl/tuple_utils.hpp @@ -0,0 +1,43 @@ +// 2020/08/28 - Created by netcan: https://github.com/netcan +#pragma once +#include +#include + +namespace tf { +namespace dsl { +namespace detail { +// get tuple element index by f, if not exists then index >= tuple_size +template class F, typename = void> +struct TupleElementByF { + constexpr static size_t Index = 0; +}; + +template