From 95a323aaa148edb53f9308fd82d0d08638fdb0a4 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 9 May 2019 21:29:16 -0700
Subject: [PATCH] [codegen] heterogeneous build for c++ (#3144)

* heterogeneous build for c++

* merge relay buildmodule to codegen build

* use module split

* use target_host

* remove sse3

* retrigger ci
---
 include/tvm/build_module.h                  |  29 +++++
 src/codegen/build_module.cc                 | 126 ++++++++++++++-----
 src/relay/backend/build_module.cc           |  51 +-------
 tests/cpp/build_module_test.cc              | 132 ++++++++++++++++++++
 tests/python/relay/test_cpp_build_module.py |   2 +-
 5 files changed, 262 insertions(+), 78 deletions(-)

diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 334fe169ad41..208f086f86c0 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -371,6 +371,35 @@ TVM_DLL runtime::Module build(const Array<LoweredFunc>& funcs,
                               const Target& target_host,
                               const BuildConfig& config);
 
+/*!
+ * \brief Build device and host modules from a map of target to a list of
+ * lowered functions. This function is used for heterogeneous builds.
+ * \param input The map from target to a list of lowered functions.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<Target, Array<LoweredFunc>>& input,
+                              const Target& target_host,
+                              const BuildConfig& config);
+
+/*!
+ * \brief Build device and host modules from a map of target string to a list
+ * of lowered functions. This function is used for heterogeneous builds.
+ * \param input The map from target string to a list of lowered functions.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<std::string, Array<LoweredFunc>>& input,
+                              const Target& target_host,
+                              const BuildConfig& config);
+
 class GenericFuncNode;
 
 /*!
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 01ebcacf6180..57e300fafec2 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -428,20 +428,19 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
                                                 const Target& target_host,
                                                 const BuildConfig& config) {
   std::unordered_set<std::string> all_names;
-  for (const auto &x : funcs) {
-    CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name;
+  for (const auto& x : funcs) {
+    CHECK(all_names.count(x->name) == 0)
+        << "Duplicate function name " << x->name;
     all_names.insert(x->name);
   }
 
-  auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
-
   Array<LoweredFunc> fhost;
   Array<LoweredFunc> fdevice;
 
   for (const auto& x : funcs) {
     CHECK(ir::VerifyMemory(x, target->device_type))
-        << "Direct host side access to device memory is detected in " << x->func_name()
-        << ". Did you forget to bind?";
+        << "Direct host side access to device memory is detected in "
+        << x->func_name() << ". Did you forget to bind?";
 
     if (x->func_type == kMixedFunc) {
       auto func = x;
@@ -450,6 +449,7 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
       }
       func = ir::ThreadSync(func, "shared");
+      func = ir::ThreadSync(func, "warp");
       func = ir::LowerThreadAllreduce(func, target->thread_warp_size);
       auto fsplits = ir::SplitHostDevice(func);
       fhost.push_back(fsplits[0]);
@@ -465,12 +465,32 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
     }
   }
 
+  for (size_t i = 0; i < fdevice.size(); i++) {
+    auto warp_size = target->thread_warp_size;
+    auto func = fdevice[i];
+    func = ir::LowerWarpMemory(fdevice[i], warp_size);
+    fdevice.Set(i, func);
+  }
+
   auto keys = target->keys();
-  bool target_is_gpu =
-    std::find(keys.begin(), keys.end(), "gpu") != keys.end();
+  bool target_is_gpu = std::find(keys.begin(), keys.end(), "gpu") != keys.end();
   if (target_is_gpu && fdevice.size() == 0) {
-    LOG(WARNING) << "Specified target " + target->str() +
-        " but cannot find device code. Did you forget to bind?";
+    LOG(WARNING) << "Specified target "
+                 << target->str()
+                 << " but cannot find device code. Did you forget to bind?";
+  }
+
+  for (size_t i = 0; i < fdevice.size(); ++i) {
+    auto func = fdevice[i];
+    func = ir::LowerIntrin(func, target->target_name);
+    fdevice.Set(i, func);
+  }
+
+  if (target->device_type == target::llvm()->device_type &&
+      target_host == target) {
+    CHECK(fdevice.empty()) << "No device code should be generated when target "
+                           << "and target_host are both llvm targets."
+                           << "\n";
   }
 
   for (size_t i = 0; i < fhost.size(); ++i) {
@@ -480,41 +500,91 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
     fhost.Set(i, func);
   }
 
-
-  for (size_t i = 0; i < fdevice.size(); ++i) {
-    auto func = fdevice[i];
-    func = ir::LowerIntrin(func, target->target_name);
-    fdevice.Set(i, func);
-  }
-
   for (size_t i = 0; i < fhost.size(); ++i) {
     auto func = fhost[i];
-    func = ir::LowerIntrin(func, target_host_val->target_name);
+    func = ir::LowerIntrin(func, target_host->target_name);
     func = ir::CombineContextCall(func);
     fhost.Set(i, func);
   }
   return {fhost, fdevice};
 }
 
-runtime::Module build(const Array<LoweredFunc>& funcs,
-                      const Target& target,
+// Create a runtime module for a specific device (target). A null module is
+// returned when there is no device code to build.
+runtime::Module DeviceBuild(const Array<LoweredFunc>& fdevice,
+                            const Target& target) {
+  if (!fdevice.empty()) {
+    return codegen::Build(fdevice, target->str());
+  } else {
+    return runtime::Module(nullptr);
+  }
+}
+
+// Build for heterogeneous execution.
+runtime::Module build(const Map<Target, Array<LoweredFunc>>& inputs,
                       const Target& target_host,
                       const BuildConfig& config) {
-  auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
-  auto host_dev_funcs = split_dev_host_funcs(funcs, target, target_host, config);
-  auto& fhost = host_dev_funcs[0];
-  auto& fdevice = host_dev_funcs[1];
+  Array<LoweredFunc> fhost_all;
+  std::vector<runtime::Module> device_modules;
+
+  Target target_host_val = target_host;
+  if (!target_host.defined()) {
+    for (const auto& it : inputs) {
+      if (it.first->device_type == kDLCPU) {
+        target_host_val = it.first;
+        break;
+      }
+    }
+  }
 
-  auto mhost = codegen::Build(fhost, target_host_val->str());
+  if (!target_host_val.defined()) {
+    target_host_val = DefaultTargetHost(target_host_val);
+  }
 
-  if (fdevice.size() > 0) {
-    auto mdev = codegen::Build(fdevice, target->str());
-    mhost.Import(mdev);
+  for (const auto& it : inputs) {
+    auto host_dev_funcs =
+        split_dev_host_funcs(it.second, it.first, target_host_val, config);
+    auto& fhost = host_dev_funcs[0];
+    auto& fdevice = host_dev_funcs[1];
+    // Get the module for a certain target.
+    runtime::Module mdev = DeviceBuild(fdevice, it.first);
+    for (const auto& it : fhost) {
+      fhost_all.push_back(it);
+    }
+    device_modules.push_back(mdev);
   }
 
+  runtime::Module mhost = codegen::Build(fhost_all, target_host_val->str());
+  // Import all device modules.
+  for (const auto& it : device_modules) {
+    if (it.operator->()) {
+      mhost.Import(it);
+    }
+  }
   return mhost;
 }
 
+// Build for heterogeneous execution when target is a string.
+runtime::Module build(const Map<std::string, Array<LoweredFunc>>& inputs,
+                      const Target& target_host,
+                      const BuildConfig& config) {
+  Map<Target, Array<LoweredFunc>> updated_input;
+  for (const auto& it : inputs) {
+    auto target = Target::create(it.first);
+    updated_input.Set(target, it.second);
+  }
+  return build(updated_input, target_host, config);
+}
+
+// Build for homogeneous execution.
+runtime::Module build(const Array<LoweredFunc>& funcs,
+                      const Target& target,
+                      const Target& target_host,
+                      const BuildConfig& config) {
+  Map<Target, Array<LoweredFunc>> inputs = {{target, funcs}};
+  return build(inputs, target_host, config);
+}
+
 BuildConfig build_config() {
   return BuildConfig(make_node<BuildConfigNode>());
 }
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 564715c00d90..08a88d53350f 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -601,52 +601,6 @@ class RelayBuildModule : public runtime::ModuleNode {
     }
     return func;
   }
-  /*!
-   * \brief Build module given lowered functions for each target
-   *
-   * \param lowered_funcs target_str -> Array<LoweredFunc> map
-   * \param targets Targets map
-   * \param cfg Building configuration
-   */
-  void BuildModule(const Map<std::string, Array<LoweredFunc> >& lowered_funcs,
-                   const Map& targets,
-                   const BuildConfig& cfg) {
-    auto target_host = Target::create(cfg_.fallback_device);
-    for (const auto& kv : lowered_funcs) {
-      std::unordered_set<std::string> fname_set;
-      for (auto f : kv.second) {
-        if (fname_set.count(f->name)) {
-          LOG(FATAL) << "Duplicate function name "
-                     << f->name;
-        }
-        fname_set.insert(f->name);
-      }
-    }
-    std::unordered_map<std::string, Target> target_map;
-    for (const auto& kv : lowered_funcs) {
-      target_map[kv.first] = Target::create(kv.first);
-    }
-    Array<LoweredFunc> fhost_all;
-    std::vector<runtime::Module> device_module;
-    for (const auto& kv : lowered_funcs) {
-      auto target = target_map[kv.first];
-      auto host_dev_funcs = split_dev_host_funcs(kv.second, target, target_host, cfg);
-      for (auto f : host_dev_funcs[0]) {
-        fhost_all.push_back(f);
-      }
-      if (host_dev_funcs[1].size()) {
-        auto mdev = codegen::Build(host_dev_funcs[1], target->str());
-        device_module.push_back(mdev);
-      }
-    }
-
-    auto mhost = codegen::Build(fhost_all, target_host->str());
-
-    for (auto mdev : device_module) {
-      mhost.Import(mdev);
-    }
-    ret_.mod = mhost;
-  }
 
   /*!
    * \brief Build relay function to runtime module
@@ -686,9 +640,8 @@ class RelayBuildModule : public runtime::ModuleNode {
     ret_.graph_json = graph_codegen_->GetJSON();
     ret_.params = graph_codegen_->GetParams();
 
-    BuildModule(graph_codegen_->GetLoweredFunc(),
-                device_target,
-                tvm_cfg_);
+    auto target_host = Target::create(target_host_);
+    ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host, tvm_cfg_);
   }
 
  protected:
diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
index 734e457d3bf1..393714d8f636 100644
--- a/tests/cpp/build_module_test.cc
+++ b/tests/cpp/build_module_test.cc
@@ -19,10 +19,14 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
+#include <topi/cuda/injective.h>
 #include <tvm/build_module.h>
 #include <tvm/operation.h>
 #include <tvm/tvm.h>
+#include <cmath>
+#include <string>
+
 TEST(BuildModule, Basic) {
   using namespace tvm;
   auto n = var("n");
@@ -56,6 +60,134 @@ TEST(BuildModule, Basic) {
   CHECK_EQ(mali_target->str(),
            "opencl -model=Mali-T860MP4@800Mhz -device=mali");
 }
 
+TEST(BuildModule, Heterogeneous) {
+  /* The test network is as follows, where the element-wise add and sub ops
+   * are assigned to the GPU and CPU, respectively:
+   *
+   *            A    B
+   *             \  /
+   *         elemwise_add  (gpu)
+   *                \
+   *                copy      C
+   *                   \     /
+   *                 elemwise_sub  (cpu)
+   */
+
+  using namespace tvm;
+  const runtime::PackedFunc* pf = runtime::Registry::Get("module._Enabled");
+  bool enabled = (*pf)("cuda");
+  if (!enabled) {
+    LOG(INFO) << "Skip heterogeneous test because cuda is not enabled."
+              << "\n";
+    return;
+  }
+
+  auto target_llvm = target::llvm();
+  auto target_cuda = target::cuda();
+
+  // The shape of input tensors.
+  const int n = 4;
+  Array<Expr> shape{n};
+
+  auto A = placeholder(shape, Float(32), "A");
+  auto B = placeholder(shape, Float(32), "B");
+  auto C = placeholder(shape, Float(32), "C");
+
+  auto elemwise_add = compute(A->shape, [&A, &B](Expr i) {
+    return A[i] + B[i];
+  }, "elemwise_add");
+
+  auto copy = placeholder(shape, Float(32), "__copy");
+  auto elemwise_sub = compute(C->shape, [&copy, &C](Expr i) {
+    return copy[i] - C[i];
+  }, "elemwise_sub");
+
+  auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add});
+  auto s2 = create_schedule({elemwise_sub->op});
+
+  auto config = build_config();
+  auto args1 = Array<Tensor>({A, B, elemwise_add});
+  auto args2 = Array<Tensor>({copy, C, elemwise_sub});
+
+  std::unordered_map<Tensor, Buffer> binds;
+  auto lowered_s1 = lower(s1, args1, "elemwise_add", binds, config);
+  auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
+  Map<Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
+                                            {target_llvm, lowered_s2}};
+  auto module = build(inputs, Target(), config);
+
+  // Assertion for build.
+  CHECK_EQ(module->imports().size(), 1);
+
+  // Execute the graph and check the correctness.
+  // Setup graph json.
+  std::string json =
+      "{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, "
+      "{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": "
+      "\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": "
+      "\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": "
+      "\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": "
+      "{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": "
+      "\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": "
+      "\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", "
+      "\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", "
+      "\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], "
+      "\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], "
+      "\"heads\": [[5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", [3, "
+      "4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], "
+      "[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": "
+      "[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", "
+      "[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", "
+      "\"float32\"]]}}";
+
+  // Setup inputs.
+  auto a_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto b_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto c_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+
+  auto pa = (float*)a_val.ToDLPack()->dl_tensor.data;
+  auto pb = (float*)b_val.ToDLPack()->dl_tensor.data;
+  auto pc = (float*)c_val.ToDLPack()->dl_tensor.data;
+
+  // Assign values.
+  for (int i = 0; i < n; i++) {
+    pa[i] = i;
+    pb[i] = i + 1.0;
+    pc[i] = i - 1.0;
+  }
+
+  // Initialize graph runtime.
+  int cpu_dev_ty = static_cast<int>(kDLCPU);
+  int cpu_dev_id = 0;
+  int gpu_dev_ty = static_cast<int>(kDLGPU);
+  int gpu_dev_id = 0;
+
+  const runtime::PackedFunc* graph_runtime =
+      tvm::runtime::Registry::Get("tvm.graph_runtime.create");
+  runtime::Module mod = (*graph_runtime)(
+      json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id);
+
+  PackedFunc set_input = mod.GetFunction("set_input", false);
+  PackedFunc run = mod.GetFunction("run", false);
+  PackedFunc get_output = mod.GetFunction("get_output", false);
+  set_input("A", a_val);
+  set_input("B", b_val);
+  set_input("C", c_val);
+
+  run();
+  tvm::runtime::NDArray out = get_output(0);
+  float* p_out = (float*)out.ToDLPack()->dl_tensor.data;
+
+  // Check correctness.
+  for (int i = 0; i < n; ++i) {
+    CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5);
+  }
+}
 
 int main(int argc, char ** argv) {
   testing::InitGoogleTest(&argc, argv);
diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py
index c69d877d3b09..b94f57d77286 100644
--- a/tests/python/relay/test_cpp_build_module.py
+++ b/tests/python/relay/test_cpp_build_module.py
@@ -89,7 +89,7 @@ def test_build():
         tgt: tgt
     }
     m_bld.set_opt_level(3)
-    m_bld.build(func, targets, "llvm -mcpu=sse3", params=params)
+    m_bld.build(func, targets, "llvm", params=params)
     g_json = m_bld.get_json()
     mmod = m_bld.get_module()
     params = m_bld.get_params()
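
Usage note (editorial addition, not part of the patch): the sketch below illustrates how the new heterogeneous build overload declared in include/tvm/build_module.h might be called from C++. It mirrors TEST(BuildModule, Heterogeneous) above and assumes the lowered functions were produced by lower() as in that test; the helper name BuildHeterogeneousModule and its parameter names are illustrative only.

// A minimal sketch, assuming cuda_funcs/llvm_funcs were obtained via lower()
// exactly as in TEST(BuildModule, Heterogeneous).
#include <tvm/build_module.h>

tvm::runtime::Module BuildHeterogeneousModule(
    const tvm::Array<tvm::LoweredFunc>& cuda_funcs,    // e.g. lowered elemwise_add
    const tvm::Array<tvm::LoweredFunc>& llvm_funcs) {  // e.g. lowered elemwise_sub
  // Map each target to the functions that should be compiled for it.
  tvm::Map<tvm::Target, tvm::Array<tvm::LoweredFunc>> inputs = {
      {tvm::target::cuda(), cuda_funcs},
      {tvm::target::llvm(), llvm_funcs}};
  // Passing Target() lets the host target default to the CPU (llvm) entry,
  // following the target_host selection logic added in src/codegen/build_module.cc.
  return tvm::build(inputs, tvm::Target(), tvm::build_config());
}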