From 6730807958ed9c4e17454da3995d7a5373a306fa Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 15 Dec 2021 15:33:25 +0800 Subject: [PATCH 1/4] add pass debug --- oneflow/core/job/job_build_and_infer_ctx.cpp | 33 +++++++++++++++++-- .../job_rewriter/fuse_add_to_output_pass.cpp | 7 ++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index b9a8bdf0b4b..cfa53dac27e 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -965,8 +965,35 @@ Maybe LazyJobBuildAndInferCtx::Complete() { Global::Delete(); auto scope = std::make_unique(mut_job()->job_conf(), job_id()); JobPassCtx job_pass_ctx(GlobalJobDesc()); - auto DoPass = [&](const std::string& pass_name) -> Maybe { - return JobPass4Name(pass_name)(mut_job(), &job_pass_ctx); + const auto& job_name = job().job_conf().job_name(); + auto LogJob = [&](const std::string& name_suffix) -> void { + std::string full_log_name = job_name + "-id" + std::to_string(job_id()) + "-" + name_suffix; + TeePersistentLogStream::Create(full_log_name)->Write(job()); + Global::New(job()); + Global::Get()->ToDotWithFilePath(full_log_name + ".dot"); + Global::Delete(); + }; + std::string debug_pass_name = GetStringFromEnv("ONEFLOW_DEBUG_PASS", ""); + auto NeedLogJob = [&] (const std::string& pass_name) -> bool { + if ("ALL" == debug_pass_name) { + return true; + } else if ( pass_name == debug_pass_name) { + return true; + } else { + return false; + } + }; + auto DoPass = [&](const std::string& pass_name, int32_t cnt = 0) -> Maybe { + if (unlikely(NeedLogJob(pass_name))) { + std::string cnt_str = cnt > 0 ? std::to_string(cnt) : ""; + LogJob(pass_name + cnt_str + "-before"); + } + JUST(JobPass4Name(pass_name)(mut_job(), &job_pass_ctx)); + if (unlikely(NeedLogJob(pass_name))) { + std::string cnt_str = cnt > 0 ? std::to_string(cnt) : ""; + LogJob(pass_name + cnt_str + "-after"); + } + return Maybe::Ok(); }; if (Global::Get()->enable_debug_mode() @@ -1007,7 +1034,7 @@ Maybe LazyJobBuildAndInferCtx::Complete() { JUST(DoPass("FuseAddToOutputPass")); // run this pass again to fuse ops created in the first run. // TODO(guoran): loop multiple times inside the pass - JUST(DoPass("FuseAddToOutputPass")); + JUST(DoPass("FuseAddToOutputPass", 1)); JUST(DoPass("IndexedSlicesOptimizerRewritePass")); JUST(DoPass("SplitSparseSoftmaxCrossEntropyOpPass")); JUST(DoPass("DoParallelCastBeforeWideningTypeCast")); diff --git a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp index f1cf8a443c9..a3c64b22501 100644 --- a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp +++ b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp @@ -73,6 +73,8 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ if (user_op_conf.has_input("_add_to_output", 0)) { return false; } return true; }; + + // Save all op's ctrl in op name in a set. HashSet ctrl_in_op_names; op_graph.ForEachNode([&](const OpNode* op_node) { for (const std::string& ctrl_in_op_name : op_node->op().op_conf().ctrl_in_op_name()) { @@ -80,6 +82,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ } }); + // Whether 2 ops has path between them. auto IsReachable = op_graph.MakePredicatorIsOpNameDataOrCtrlReachable(); std::vector delete_ops; op_graph.ForEachNode([&](const OpNode* op_node) { @@ -121,10 +124,13 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ for (const OpEdge* out_edge : op_node->out_edges()) { const OpNode* consumer = out_edge->dst_node(); const std::string& consumer_op_name = consumer->op().op_name(); + // remeber add_n op's consumer ops if (op_name2op_conf.find(consumer_op_name) == op_name2op_conf.end()) { op_name2op_conf[consumer_op_name] = consumer->op().op_conf(); } + // for each input of add_n op's consumer for (const std::string& ibn : consumer->op().input_bns()) { + // If the input is add_n op's out, find the consum edge, then consume new_add_to_op if (consumer->op().BnInOp2Lbi(ibn) == out) { OperatorConf& consumer_op_conf = op_name2op_conf.at(consumer_op_name); const auto& new_val = GenLogicalBlobName(*sum_lbi); @@ -133,6 +139,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ } } } + // add this add_n op to remove list delete_ops.emplace_back(op_conf); }); job_builder->DelOps(delete_ops); From be578af853b785497fb3aeddf2cfcc2dbe06f46d Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 15 Dec 2021 22:51:01 +0800 Subject: [PATCH 2/4] debug pass --- oneflow/core/job/job_build_and_infer_ctx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index cfa53dac27e..9feb8288ec4 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -967,7 +967,7 @@ Maybe LazyJobBuildAndInferCtx::Complete() { JobPassCtx job_pass_ctx(GlobalJobDesc()); const auto& job_name = job().job_conf().job_name(); auto LogJob = [&](const std::string& name_suffix) -> void { - std::string full_log_name = job_name + "-id" + std::to_string(job_id()) + "-" + name_suffix; + std::string full_log_name = job_name + "-job_id_" + std::to_string(job_id()) + "-" + name_suffix; TeePersistentLogStream::Create(full_log_name)->Write(job()); Global::New(job()); Global::Get()->ToDotWithFilePath(full_log_name + ".dot"); From 2071002eeb3f26d01e9d1eb87e12556f9084ac26 Mon Sep 17 00:00:00 2001 From: strint Date: Sun, 19 Dec 2021 22:19:53 +0800 Subject: [PATCH 3/4] refine comment of fuse add pass --- oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp index a3c64b22501..13f08773177 100644 --- a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp +++ b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp @@ -82,7 +82,6 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ } }); - // Whether 2 ops has path between them. auto IsReachable = op_graph.MakePredicatorIsOpNameDataOrCtrlReachable(); std::vector delete_ops; op_graph.ForEachNode([&](const OpNode* op_node) { @@ -116,6 +115,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ } else { return; } + // Make a new_add_to_op to fuse add_n into this op. OperatorConf new_add_to_op_conf = add_to_node->op().op_conf(); *(*(new_add_to_op_conf.mutable_user_conf()->mutable_input()))["_add_to_output"] .mutable_s() @@ -124,13 +124,11 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ for (const OpEdge* out_edge : op_node->out_edges()) { const OpNode* consumer = out_edge->dst_node(); const std::string& consumer_op_name = consumer->op().op_name(); - // remeber add_n op's consumer ops if (op_name2op_conf.find(consumer_op_name) == op_name2op_conf.end()) { op_name2op_conf[consumer_op_name] = consumer->op().op_conf(); } - // for each input of add_n op's consumer + // Make add_n op's consumer to consume the new_add_to_op for (const std::string& ibn : consumer->op().input_bns()) { - // If the input is add_n op's out, find the consum edge, then consume new_add_to_op if (consumer->op().BnInOp2Lbi(ibn) == out) { OperatorConf& consumer_op_conf = op_name2op_conf.at(consumer_op_name); const auto& new_val = GenLogicalBlobName(*sum_lbi); @@ -139,7 +137,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ } } } - // add this add_n op to remove list + // Add the add_n op to removing list delete_ops.emplace_back(op_conf); }); job_builder->DelOps(delete_ops); From 4226902610e1d9dd4d0192830d2b0e5a5afd1b34 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Fri, 24 Dec 2021 04:02:20 +0000 Subject: [PATCH 4/4] auto format by CI --- oneflow/core/job/job_build_and_infer_ctx.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 387a537c3a7..7c5c7dc0df4 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -967,17 +967,18 @@ Maybe LazyJobBuildAndInferCtx::Complete() { JobPassCtx job_pass_ctx(GlobalJobDesc()); const auto& job_name = job().job_conf().job_name(); auto LogJob = [&](const std::string& name_suffix) -> void { - std::string full_log_name = job_name + "-job_id_" + std::to_string(job_id()) + "-" + name_suffix; + std::string full_log_name = + job_name + "-job_id_" + std::to_string(job_id()) + "-" + name_suffix; TeePersistentLogStream::Create(full_log_name)->Write(job()); Global::New(job()); Global::Get()->ToDotWithFilePath(full_log_name + ".dot"); Global::Delete(); }; std::string debug_pass_name = GetStringFromEnv("ONEFLOW_DEBUG_PASS", ""); - auto NeedLogJob = [&] (const std::string& pass_name) -> bool { + auto NeedLogJob = [&](const std::string& pass_name) -> bool { if ("ALL" == debug_pass_name) { return true; - } else if ( pass_name == debug_pass_name) { + } else if (pass_name == debug_pass_name) { return true; } else { return false;