From b9c0a86ceb0c9cd6fbdc414a01f64bbb10249216 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Thu, 9 Dec 2021 00:19:12 +0800 Subject: [PATCH 1/4] nn.Graph reuse eager lbn without create duplicate variable op --- oneflow/core/framework/nn_graph.cpp | 4 ++++ .../op_interpreter/lazy_op_interpreter.cpp | 18 +++++++++++++----- oneflow/core/framework/tensor_name_scope.cpp | 5 +++++ oneflow/core/framework/tensor_name_scope.h | 3 +++ 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/oneflow/core/framework/nn_graph.cpp b/oneflow/core/framework/nn_graph.cpp index e27d2098ce4..404262b29fd 100644 --- a/oneflow/core/framework/nn_graph.cpp +++ b/oneflow/core/framework/nn_graph.cpp @@ -22,6 +22,7 @@ limitations under the License. #include "oneflow/core/framework/instructions_builder.h" #include "oneflow/core/framework/multi_client_session_context.h" #include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/framework/tensor_name_scope.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/graph/op_graph.h" #include "oneflow/core/job/compiler.h" @@ -253,6 +254,9 @@ Maybe NNGraph::CompileAndInitRuntime() { // TODO(chengcheng): CHECK job valid for each rank. JUST(CreateAndRegisterNewVariableOpInJobPass()); + // NOTE(chengcheng): TensorNameScope need to be cleared after current graph build. + one::TensorNameScope::Global()->Clear(); + // NOTE(chengcheng): Global need be clear before GlobalJobDescScope construct. if (Global::Get() != nullptr) { Global::Delete(); } diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index 14799a83f6c..7b0b473c0a3 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -252,6 +252,19 @@ Maybe LazyInterpreter::ApplyImpl(const FeedVariableOpExpr& op_expr, const const std::shared_ptr& input_tensor = inputs.at(0); CHECK_OR_RETURN(input_tensor->is_eager()); + // Check outputs num and setup output tensor properties. + CHECK_EQ_OR_RETURN(outputs->size(), 1); + CHECK_EQ_OR_RETURN(op_expr.output_size(), 1); + CHECK_OR_RETURN(!(*outputs)[0]); + + const std::string& opt_lbn = TensorNameScope::Global()->Lookup(input_tensor); + if (!opt_lbn.empty()) { + // NOTE(chengcheng): This eager tensor has been feed as variable op before, so we just use the + // lbn, and will NOT create duplicate variable op again. + (*outputs)[0] = input_tensor; + return Maybe::Ok(); + } + std::shared_ptr scope = JUST(NewScopeWithParallelDescByTensor(input_tensor)); OperatorConf op_conf; @@ -288,11 +301,6 @@ Maybe LazyInterpreter::ApplyImpl(const FeedVariableOpExpr& op_expr, const int64_t parallel_desc_sym_id = JUST(scope->GetParallelDescSymbolId(op_conf)); auto blob_parallel_desc = JUST(GetSymbol(parallel_desc_sym_id)); - // Check outputs num and setup output tensor properties. - CHECK_EQ_OR_RETURN(outputs->size(), 1); - CHECK_EQ_OR_RETURN(op_expr.output_size(), 1); - CHECK_OR_RETURN(!(*outputs)[0]); - const std::string obn = "out"; // NOTE(chengcheng): obn is NOT op_expr.indexed_obns (*outputs)[0] = JUST(BuildTensor(op_attr, obn, blob_parallel_desc, /* is_lazy= */ true, /* is_local */ input_tensor->is_local())); diff --git a/oneflow/core/framework/tensor_name_scope.cpp b/oneflow/core/framework/tensor_name_scope.cpp index ff0c484cb81..1a841cdd76a 100644 --- a/oneflow/core/framework/tensor_name_scope.cpp +++ b/oneflow/core/framework/tensor_name_scope.cpp @@ -42,5 +42,10 @@ void TensorNameScope::Record(const std::shared_ptr& tensor, const std::s tensor_names_[key] = name; } +void TensorNameScope::Clear() { + std::lock_guard lock(mutex_); + tensor_names_.clear(); +} + } // namespace one } // namespace oneflow diff --git a/oneflow/core/framework/tensor_name_scope.h b/oneflow/core/framework/tensor_name_scope.h index b9bfad9b8eb..e512a90682f 100644 --- a/oneflow/core/framework/tensor_name_scope.h +++ b/oneflow/core/framework/tensor_name_scope.h @@ -31,6 +31,9 @@ class TensorNameScope { void Record(const std::shared_ptr& tensor, const std::string& name); + // NOTE(chengcheng): TensorNameScope need to be cleared after current graph build. + void Clear(); + private: TensorNameScope() : default_tensor_name_("") {} virtual ~TensorNameScope() = default; From 75ea0bdb6ff205fea5e2a391816a4e6ba44685d7 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Thu, 9 Dec 2021 00:44:17 +0800 Subject: [PATCH 2/4] Add log --- .../core/framework/op_interpreter/lazy_op_interpreter.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index 7b0b473c0a3..82eb84c3a35 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -252,6 +252,8 @@ Maybe LazyInterpreter::ApplyImpl(const FeedVariableOpExpr& op_expr, const const std::shared_ptr& input_tensor = inputs.at(0); CHECK_OR_RETURN(input_tensor->is_eager()); + auto infer_ctx = JUST(GetCurInferCtx()); + // Check outputs num and setup output tensor properties. CHECK_EQ_OR_RETURN(outputs->size(), 1); CHECK_EQ_OR_RETURN(op_expr.output_size(), 1); @@ -262,6 +264,9 @@ Maybe LazyInterpreter::ApplyImpl(const FeedVariableOpExpr& op_expr, const // NOTE(chengcheng): This eager tensor has been feed as variable op before, so we just use the // lbn, and will NOT create duplicate variable op again. (*outputs)[0] = input_tensor; + VLOG(2) << "Lazy nn.Graph name " << infer_ctx->job().job_conf().job_name() + << " try to add variable op name : \n: " << op_expr.op_name() + << " but it has been created as : " << opt_lbn << ". \n So we just reuse this tensor."; return Maybe::Ok(); } @@ -288,7 +293,6 @@ Maybe LazyInterpreter::ApplyImpl(const FeedVariableOpExpr& op_expr, const if (unlikely(l2 != 0.0)) { var_conf->mutable_regularizer()->mutable_l1_l2_conf()->set_l2(l2); } } - auto infer_ctx = JUST(GetCurInferCtx()); VLOG(2) << "Lazy nn.Graph name " << infer_ctx->job().job_conf().job_name() << " try to add op: \n: " << op_conf.DebugString() << std::endl; OpAttribute op_attr = *JUST(infer_ctx->AddAndInferConsistentOp(op_conf)); From 6e2309a85742a5714ec820197b43b77abe2e03c1 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Fri, 10 Dec 2021 11:42:32 +0800 Subject: [PATCH 3/4] refine note spell --- oneflow/core/framework/nn_graph.cpp | 2 +- oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp | 2 +- oneflow/core/framework/tensor_name_scope.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/oneflow/core/framework/nn_graph.cpp b/oneflow/core/framework/nn_graph.cpp index 404262b29fd..76c9ddf5720 100644 --- a/oneflow/core/framework/nn_graph.cpp +++ b/oneflow/core/framework/nn_graph.cpp @@ -254,7 +254,7 @@ Maybe NNGraph::CompileAndInitRuntime() { // TODO(chengcheng): CHECK job valid for each rank. JUST(CreateAndRegisterNewVariableOpInJobPass()); - // NOTE(chengcheng): TensorNameScope need to be cleared after current graph build. + // NOTE(chengcheng): TensorNameScope need to be cleared after current graph is built. one::TensorNameScope::Global()->Clear(); // NOTE(chengcheng): Global need be clear before GlobalJobDescScope construct. diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index 82eb84c3a35..f6886078e80 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -261,7 +261,7 @@ Maybe LazyInterpreter::ApplyImpl(const FeedVariableOpExpr& op_expr, const const std::string& opt_lbn = TensorNameScope::Global()->Lookup(input_tensor); if (!opt_lbn.empty()) { - // NOTE(chengcheng): This eager tensor has been feed as variable op before, so we just use the + // NOTE(chengcheng): This eager tensor has been fed as variable op before, so we just use the // lbn, and will NOT create duplicate variable op again. (*outputs)[0] = input_tensor; VLOG(2) << "Lazy nn.Graph name " << infer_ctx->job().job_conf().job_name() diff --git a/oneflow/core/framework/tensor_name_scope.h b/oneflow/core/framework/tensor_name_scope.h index e512a90682f..8299a7ebfa5 100644 --- a/oneflow/core/framework/tensor_name_scope.h +++ b/oneflow/core/framework/tensor_name_scope.h @@ -31,7 +31,7 @@ class TensorNameScope { void Record(const std::shared_ptr& tensor, const std::string& name); - // NOTE(chengcheng): TensorNameScope need to be cleared after current graph build. + // NOTE(chengcheng): TensorNameScope need to be cleared after current graph is built. void Clear(); private: From 14af15914ae036e066fe4688a31cb564571e580f Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Fri, 10 Dec 2021 19:14:49 +0800 Subject: [PATCH 4/4] Fix bug of LazyInterpret handle inplace eager tensor --- .../framework/op_interpreter/lazy_op_interpreter.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index f6886078e80..596d6049205 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -73,12 +73,8 @@ Maybe BuildTensor(const OpAttribute& op_attribute, const std::string& bn Maybe CheckTensorMatchAttr(const std::shared_ptr& tensor, const OpAttribute& op_attribute, const std::string& bn_in_op, const std::shared_ptr& parallel_desc, - const bool is_lazy, const bool is_local, const bool requires_grad, - const bool is_leaf) { - CHECK_EQ_OR_RETURN(tensor->is_lazy(), is_lazy); + const bool is_local) { CHECK_EQ_OR_RETURN(tensor->is_local(), is_local); - CHECK_EQ_OR_RETURN(tensor->requires_grad(), requires_grad); - CHECK_EQ_OR_RETURN(tensor->is_leaf(), is_leaf); CHECK_OR_RETURN(op_attribute.has_logical_blob_desc_signature()); const auto& blob_desc_sign_map = op_attribute.logical_blob_desc_signature().bn_in_op2blob_desc(); @@ -682,10 +678,7 @@ Maybe LazyInterpreter::ApplyImpl(const UserOpExpr& op_expr, const TensorTu JUST(BuildTensor(op_attr, obn, blob_parallel_desc, /* is_lazy= */ true, is_local)); } else { const std::shared_ptr& inplace_out = (*outputs)[i]; - JUST(CheckTensorMatchAttr(inplace_out, op_attr, obn, blob_parallel_desc, /* is_lazy= */ true, - is_local, - /* requires_grad */ false, - /* is_leaf */ true)); + JUST(CheckTensorMatchAttr(inplace_out, op_attr, obn, blob_parallel_desc, is_local)); } TensorNameScope::Global()->Record((*outputs)[i], GenLogicalBlobName(new_op_name, obn)); }