From 82a55464304889c91dd0670c6e21adc3e883703a Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 14 Jun 2019 15:18:14 -0700 Subject: [PATCH] [Relay][VM] Add AllocTensor instruction and better instruction printer (#3306) * Update vm print & add AllocTensor instruction * patch * fix invoke packed * update cmake * tweak move * update invoke_closure * lint * add doc * tweak --- CMakeLists.txt | 1 + include/tvm/runtime/vm.h | 36 ++++-- src/relay/backend/vm/compiler.cc | 75 ++---------- src/runtime/vm/vm.cc | 202 ++++++++++++++++++++----------- 4 files changed, 170 insertions(+), 144 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a140c597a89f..80b121477631 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,6 +226,7 @@ add_library(tvm_runtime_static STATIC ${RUNTIME_SRCS}) if(USE_RELAY_DEBUG) message(STATUS "Building Relay in debug mode...") set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG") +else() set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "NDEBUG") endif(USE_RELAY_DEBUG) diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 8911ad499e4c..028a5ff9d1ad 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -56,13 +56,14 @@ enum class Opcode { InvokeClosure = 3U, InvokePacked = 4U, AllocTensor = 5U, - AllocDatatype = 6U, - AllocClosure = 7U, - GetField = 8U, - If = 9U, - Select = 10U, - LoadConst = 11U, - Goto = 12U + AllocTensorReg = 6U, + AllocDatatype = 7U, + AllocClosure = 8U, + GetField = 9U, + If = 10U, + Select = 11U, + LoadConst = 12U, + Goto = 13U }; /*! \brief A single virtual machine instruction. @@ -83,11 +84,19 @@ struct Instruction { union { struct /* AllocTensor Operands */ { + /*! \brief The number of dimensions. */ + uint32_t ndim; + /*! \brief The shape of tensor. */ + int64_t* shape; + /*! \brief The datatype of tensor to be allocated. */ + DLDataType dtype; + } alloc_tensor; + struct /* AllocTensorReg Operands */ { /*! \brief The register to read the shape out of. */ RegName shape_register; /*! \brief The datatype of tensor to be allocated. */ DLDataType dtype; - }; + } alloc_tensor_reg; struct /* InvokeClosure Operands */ { /*! \brief The register containing the closure. */ RegName closure; @@ -192,13 +201,20 @@ struct Instruction { */ static Instruction InvokePacked(Index packed_index, Index arity, Index output_size, const std::vector& args); - /*! \brief Construct an allocate tensor instruction. + /*! \brief Construct an allocate tensor instruction with constant shape. + * \param shape The shape of the tensor. + * \param dtype The dtype of the tensor. + * \param dst The destination register. + * \return The allocate tensor instruction. + */ + static Instruction AllocTensor(std::vector shape, DLDataType dtype, RegName dst); + /*! \brief Construct an allocate tensor instruction with register. * \param shape_register The register containing the shape. * \param dtype The dtype of the tensor. * \param dst The destination register. * \return The allocate tensor instruction. */ - static Instruction AllocTensor(RegName shape_register, DLDataType dtype, RegName dst); + static Instruction AllocTensorReg(RegName shape_register, DLDataType dtype, RegName dst); /*! \brief Construct an allocate datatype instruction. * \param tag The datatype tag. * \param num_fields The number of fields for the datatype. diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 9b4ab6b8f6c8..3e41ce717e71 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -103,13 +103,6 @@ struct ConstantPool : ExprVisitor { } } - void AddConstantTensorShape(TensorType expr, NDArray value) { - auto it = this->const_tensor_shape_map.find(expr); - if (it == this->const_tensor_shape_map.end()) { - this->const_tensor_shape_map.insert({expr, std::make_pair(index++, value)}); - } - } - void VisitExpr_(const ConstantNode* const_node) { auto konst = GetRef(const_node); auto it = this->const_map.find(konst); @@ -117,48 +110,6 @@ struct ConstantPool : ExprVisitor { this->const_map.insert({konst, index++}); } } - - NDArray GetTensorConstant(const TensorTypeNode* ttype) { - std::vector shapes; - for (auto sh : ttype->shape) { - shapes.push_back(Downcast(sh)->value); - } - int64_t s = shapes.size(); - DLContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - auto shape_tensor = NDArray::Empty({s}, Type2TVMType(Int(64)), cpu_ctx); - int64_t* dims = static_cast(shape_tensor->data); - for (size_t i = 0; i < shapes.size(); ++i) { - dims[i] = shapes[i]; - } - return shape_tensor; - } - - void VisitExpr_(const CallNode* call_node) { - for (auto arg : call_node->args) { - this->VisitExpr(arg); - } - - Expr op = call_node->op; - auto func_node = op.as(); - if (func_node) { - auto ret_type = call_node->checked_type(); - if (const TensorTypeNode* ttype = ret_type.as()) { - auto shape = GetTensorConstant(ttype); - auto tensor_type = GetRef(ttype); - AddConstantTensorShape(tensor_type, shape); - } else if (const TupleTypeNode* ttype = ret_type.as()) { - for (size_t i = 0; i < ttype->fields.size(); ++i) { - auto f = ttype->fields[i]; - auto f_type = f.as(); - auto shape = GetTensorConstant(f_type); - auto tensor_type = GetRef(f_type); - AddConstantTensorShape(tensor_type, shape); - } - } - } - } }; std::tuple LayoutConstantPool(const Module& module) { @@ -206,6 +157,7 @@ struct VMCompiler : ExprFunctor { switch (instr.op) { case Opcode::AllocDatatype: case Opcode::AllocTensor: + case Opcode::AllocTensorReg: case Opcode::GetField: case Opcode::LoadConst: case Opcode::Select: @@ -259,14 +211,14 @@ struct VMCompiler : ExprFunctor { void VisitExpr_(const MatchNode* match_node) { auto match = GetRef(match_node); - LOG(FATAL) << "translation of match nodes to the VM is " - << "currently unsupported" << std::endl; + LOG(FATAL) << "translation of match nodes to the VM is" + << "currently unsupported"; } void VisitExpr_(const LetNode* let_node) { - DLOG(INFO) << let_node->value << std::endl; + DLOG(INFO) << let_node->value; this->VisitExpr(let_node->value); - DLOG(INFO) << this->last_register << std::endl; + DLOG(INFO) << this->last_register; var_register_map.insert({let_node->var, this->last_register}); this->VisitExpr(let_node->body); } @@ -327,18 +279,13 @@ struct VMCompiler : ExprFunctor { } Instruction AllocTensorFromType(const TensorTypeNode* ttype) { - DataType dtype = ttype->dtype; - TVMType dltype = Type2TVMType(dtype); - + TVMType dltype = Type2TVMType(ttype->dtype); auto tensor_type = GetRef(ttype); - auto it = this->context->const_tensor_shape_map.find(tensor_type); - if (it == this->context->const_tensor_shape_map.end()) { - DLOG(INFO) << "Can not find constant shape for " << tensor_type; - } else { - Emit(Instruction::LoadConst(it->second.first, NewRegister())); + std::vector shape; + for (auto dim : tensor_type->shape) { + shape.push_back(Downcast(dim)->value); } - - return Instruction::AllocTensor(last_register, dltype, NewRegister()); + return Instruction::AllocTensor(shape, dltype, NewRegister()); } void EmitInvokePrimitive(const Function& func, @@ -532,7 +479,7 @@ void PopulatePackedFuncMap(const std::vector& lowered_funcs, } VMFunction CompileFunc(VMCompilerContext* context, const GlobalVar& var, const Function& func) { - DLOG(INFO) << "CompileFunc: " << var << std::endl << AsText(func, false) << std::endl; + DLOG(INFO) << "CompileFunc: " << var << std::endl << AsText(func, false); size_t params = func->params.size(); VMCompiler compiler(context); compiler.Compile(func); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 6f9190e8907a..5ba20982e90f 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -67,8 +67,14 @@ Instruction::Instruction(const Instruction& instr) { this->result = instr.result; return; case Opcode::AllocTensor: - this->shape_register = instr.shape_register; - this->dtype = instr.dtype; + this->alloc_tensor.ndim = instr.alloc_tensor.ndim; + this->alloc_tensor.shape = Duplicate(instr.alloc_tensor.shape, + instr.alloc_tensor.ndim); + this->alloc_tensor.dtype = instr.alloc_tensor.dtype; + return; + case Opcode::AllocTensorReg: + this->alloc_tensor_reg.shape_register = instr.alloc_tensor_reg.shape_register; + this->alloc_tensor_reg.dtype = instr.alloc_tensor_reg.dtype; return; case Opcode::AllocDatatype: this->constructor_tag = instr.constructor_tag; @@ -142,8 +148,14 @@ Instruction& Instruction::operator=(const Instruction& instr) { this->result = instr.result; return *this; case Opcode::AllocTensor: - this->shape_register = instr.shape_register; - this->dtype = instr.dtype; + this->alloc_tensor.ndim = instr.alloc_tensor.ndim; + this->alloc_tensor.shape = Duplicate(instr.alloc_tensor.shape, + instr.alloc_tensor.ndim); + this->alloc_tensor.dtype = instr.alloc_tensor.dtype; + return *this; + case Opcode::AllocTensorReg: + this->alloc_tensor_reg.shape_register = instr.alloc_tensor_reg.shape_register; + this->alloc_tensor_reg.dtype = instr.alloc_tensor_reg.dtype; return *this; case Opcode::AllocDatatype: this->constructor_tag = instr.constructor_tag; @@ -203,12 +215,15 @@ Instruction::~Instruction() { case Opcode::Move: case Opcode::Select: case Opcode::Ret: - case Opcode::AllocTensor: + case Opcode::AllocTensorReg: case Opcode::If: case Opcode::LoadConst: case Opcode::GetField: case Opcode::Goto: return; + case Opcode::AllocTensor: + delete this->alloc_tensor.shape; + return; case Opcode::AllocDatatype: delete this->datatype_fields; return; @@ -226,8 +241,7 @@ Instruction::~Instruction() { return; default: std::ostringstream out; - LOG(FATAL) << "Invalid instruction " << static_cast(this->op) - << "\n"; + LOG(FATAL) << "Invalid instruction " << static_cast(this->op); } } @@ -252,12 +266,25 @@ Instruction Instruction::InvokePacked(Index packed_index, Index arity, Index out return instr; } -Instruction Instruction::AllocTensor(RegName shape_register, DLDataType dtype, Index dst) { +Instruction Instruction::AllocTensor(std::vector shape, DLDataType dtype, Index dst) { Instruction instr; instr.op = Opcode::AllocTensor; instr.dst = dst; - instr.shape_register = shape_register; - instr.dtype = dtype; + instr.alloc_tensor.ndim = shape.size(); + instr.alloc_tensor.shape = new int64_t[shape.size()]; + for (size_t i = 0; i < shape.size(); ++i) { + instr.alloc_tensor.shape[i] = shape[i]; + } + instr.alloc_tensor.dtype = dtype; + return instr; +} + +Instruction Instruction::AllocTensorReg(RegName shape_register, DLDataType dtype, Index dst) { + Instruction instr; + instr.op = Opcode::AllocTensorReg; + instr.dst = dst; + instr.alloc_tensor_reg.shape_register = shape_register; + instr.alloc_tensor_reg.dtype = dtype; return instr; } @@ -381,85 +408,92 @@ void DLDatatypePrint(std::ostream& os, const DLDataType& dtype) { break; } - os << dtype.bits; - if (dtype.lanes != 0) { - os << "[" << dtype.lanes << "]"; + os << int(dtype.bits); + if (dtype.lanes != 1) { + os << "x" << dtype.lanes; } } +template +std::string StrJoin(T* items, int offset, int cnt, std::string delim = ",") { + if (cnt == 0) { + return ""; + } + std::ostringstream oss; + oss << items[offset]; + for (int i = 1; i < cnt; ++i) { + oss << delim << items[offset + i]; + } + return oss.str(); +} + void InstructionPrint(std::ostream& os, const Instruction& instr) { switch (instr.op) { case Opcode::Move: { - os << "move " << instr.from << " " << instr.dst; + os << "move $" << instr.dst << " $" << instr.from; break; } case Opcode::Ret: { - os << "ret " << instr.result; + os << "ret $" << instr.result; break; } case Opcode::InvokePacked: { - os << "invoke_packed "; - os << instr.packed_index; - os << " " << instr.arity; - os << "("; - for (Index i = 0; i < instr.arity; ++i) { - os << instr.packed_args[i] << ","; - } - os << ")"; - os << " " << instr.output_size; + os << "invoke_packed PackedFunc[" << instr.packed_index << "](in: $" + << StrJoin(instr.packed_args, 0, instr.arity - instr.output_size, ",$") + << ", out: $" + << StrJoin(instr.packed_args, instr.arity - instr.output_size, + instr.output_size, ",$") + << ")"; break; } case Opcode::AllocTensor: { - os << "alloc_tensor "; - os << instr.dst << " "; - os << instr.shape_register << " "; - DLDatatypePrint(os, instr.dtype); + os << "alloc_tensor $" << instr.dst << " [" + << StrJoin(instr.alloc_tensor.shape, 0, instr.alloc_tensor.ndim) + << "] "; + DLDatatypePrint(os, instr.alloc_tensor.dtype); + break; + } + case Opcode::AllocTensorReg: { + os << "alloc_tensor_reg $" << instr.dst << " $" + << instr.alloc_tensor_reg.shape_register << " "; + DLDatatypePrint(os, instr.alloc_tensor_reg.dtype); break; } case Opcode::AllocDatatype: { - os << "alloc_data "; - os << instr.dst << " "; - os << instr.constructor_tag << " "; - os << instr.num_fields; + os << "alloc_data $" << instr.dst << " tag(" << instr.constructor_tag << ") [$" + << StrJoin(instr.datatype_fields, 0, instr.num_fields, ",$") << "]"; break; } case Opcode::AllocClosure: { - os << "alloc_closure "; - os << instr.dst << " "; - os << instr.clo_index << " "; - os << instr.num_freevar << "("; - for (Index i = 0; i < instr.num_freevar; ++i) { - os << instr.free_vars[i] << ","; - } - os << ")"; + os << "alloc_closure $" << instr.dst << " VMFunc[" << instr.clo_index + << "]($" << StrJoin(instr.free_vars, 0, instr.num_freevar, ",$") + << ")"; break; } case Opcode::If: { - os << "if " - << "$" << instr.if_cond << " " << instr.true_offset << " " << instr.false_offset; + os << "if " << "$" << instr.if_cond << " " << instr.true_offset << " " + << instr.false_offset; break; } case Opcode::Invoke: { - os << "invoke " - << "$" << instr.dst << " " << instr.func_index << " " << instr.num_args << "("; - for (Index i = 0; i < instr.num_args; ++i) { - os << instr.invoke_args_registers[i] << ","; - } - os << ")"; + os << "invoke $" << instr.dst << " VMFunc[" << instr.func_index << "]($" + << StrJoin(instr.invoke_args_registers, 0, instr.num_args, ",$") + << ")"; break; } case Opcode::InvokeClosure: { - os << "invoke_closure " - << "$" << instr.dst << " " << instr.closure << " " << instr.closure_args_num << "()"; + os << "invoke_closure $" << instr.dst << " $" << instr.closure << "($" + << StrJoin(instr.closure_args, 0, instr.closure_args_num, ",$") + << ")"; break; } case Opcode::LoadConst: { - os << "load_const " - << "$" << instr.dst << " " << instr.const_index; + os << "load_const $" << instr.dst << " Const[" << instr.const_index << "]"; break; } case Opcode::GetField: { - os << "get_field " << instr.dst << " " << instr.object << " " << instr.field_index; + os << "get_field $" << instr.dst << " $" << instr.object << "[" + << instr.field_index << "]"; break; } case Opcode::Goto: { @@ -467,8 +501,8 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { break; } case Opcode::Select: { - os << "select " << instr.dst << " " << instr.select_cond << " " << instr.select_op1 << " " - << instr.select_op2; + os << "select $" << instr.dst << " $" << instr.select_cond << " $" + << instr.select_op1 << " $" << instr.select_op2; break; } default: @@ -513,48 +547,64 @@ Index VirtualMachine::PopFrame() { } void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { - DLOG(INFO) << "===================\nInvoking global " << func.name << " " << args.size() - << std::endl; + DLOG(INFO) << "Invoking global " << func.name << " " << args.size(); PushFrame(func.params, this->pc + 1, func); for (size_t i = 0; i < args.size(); ++i) { WriteRegister(i, args[i]); } - DLOG(INFO) << "func.params= " << func.params << std::endl; + DLOG(INFO) << "func.params= " << func.params; code = func.instructions.data(); pc = 0; } Object VirtualMachine::Invoke(const VMFunction& func, const std::vector& args) { - DLOG(INFO) << "Executing Function: " << std::endl << func << std::endl; + DLOG(INFO) << "Executing Function: " << std::endl << func; InvokeGlobal(func, args); Run(); auto alloc = MemoryManager::Global()->GetAllocator(ctxs[0]); - DLOG(INFO) << "Memory used: " << alloc->UsedMemory() << " B\n"; + DLOG(INFO) << "Memory used: " << alloc->UsedMemory() << " B"; return return_register; } Object VirtualMachine::Invoke(const std::string& name, const std::vector& args) { auto func_index = this->global_map_[name]; - DLOG(INFO) << "Invoke Global " << name << " at index " << func_index << std::endl; + DLOG(INFO) << "Invoke Global " << name << " at index " << func_index; return Invoke(this->functions[func_index], args); } void InvokePacked(const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) { - std::vector values(arg_count); - std::vector codes(arg_count); - runtime::TVMArgsSetter setter(values.data(), codes.data()); + size_t arity = 0; + for (Index i = 0; i < arg_count; i++) { + if (args[i].ptr_->tag == ObjectTag::kDatatype) { + arity += args[i].AsDatatype()->fields.size(); + } else { + ++arity; + } + } + std::vector values(arity); + std::vector codes(arity); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + int idx = 0; for (Index i = 0; i < arg_count; i++) { - NDArray data = ToNDArray(args[i]); - setter(i, data); + if (args[i].ptr_->tag == ObjectTag::kDatatype) { + auto dt_cell = args[i].AsDatatype(); + for (auto obj : dt_cell->fields) { + NDArray data = ToNDArray(obj); + setter(idx++, data); + } + } else { + NDArray data = ToNDArray(args[i]); + setter(idx++, data); + } } TVMRetValue rv; - func.CallPacked(TVMArgs(values.data(), codes.data(), arg_count), &rv); + func.CallPacked(TVMArgs(values.data(), codes.data(), arity), &rv); } void VirtualMachine::Init(const std::vector& ctxs) { this->ctxs = ctxs; } @@ -574,7 +624,7 @@ void VirtualMachine::Run() { while (true) { main_loop: auto const& instr = this->code[this->pc]; - DLOG(INFO) << "\nExecuting(" << pc << "): "; + DLOG(INFO) << "Executing(" << pc << "): "; #if USE_RELAY_DEBUG InstructionPrint(std::cout, instr); #endif // USE_RELAY_DEBUG @@ -669,11 +719,23 @@ void VirtualMachine::Run() { goto main_loop; } case Opcode::AllocTensor: { + auto shape = std::vector(instr.alloc_tensor.ndim); + for (uint i = 0; i < instr.alloc_tensor.ndim; ++i) { + shape[i] = instr.alloc_tensor.shape[i]; + } + auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]); + auto data = allocator->Empty(shape, instr.alloc_tensor.dtype, ctxs[0]); + auto obj = Object::Tensor(data); + WriteRegister(instr.dst, obj); + pc++; + goto main_loop; + } + case Opcode::AllocTensorReg: { DLContext cpu_ctx; cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; - auto shape_tensor_obj = ReadRegister(instr.shape_register); + auto shape_tensor_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); NDArray shape_tensor = ToNDArray(shape_tensor_obj).CopyTo(cpu_ctx); int64_t* dims = static_cast(shape_tensor->data); @@ -681,7 +743,7 @@ void VirtualMachine::Run() { auto shape = std::vector(shape_tensor->shape[0]); shape.assign(dims, dims + num_dims); auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]); - auto data = allocator->Empty(shape, instr.dtype, ctxs[0]); + auto data = allocator->Empty(shape, instr.alloc_tensor_reg.dtype, ctxs[0]); auto obj = Object::Tensor(data); WriteRegister(instr.dst, obj); pc++;