Skip to content

Commit

Permalink
Feat/zero mix with mp (#8036)
Browse files Browse the repository at this point in the history
* add zero limit

* add debug

* add mix zero test

* refactor zero api

* zero test with mp

* add 2d test

* add zero nd

* add nd zero

* add sbp cast

* test passed soft limit consumer

* refine size api

* zero use stage 2

* add limit consumer api

* add new api

* refine zero s select

* fix index out of range

* rm zero limit on device type

* zero test with activation checkpointing

* add identity when dp sequence len is 1

* move to base with master

* fix

* fix

* fix

* add test

* debug bad case

* refine test for eager and graph boxing

* test case ready

* simplify

* refine test

* fix buff size

* fix conflict

* refine zero nd

* refine

* add full test

* revert change

* refine split check

* fix typo

* rm log

* split long func

* restore test

* Update optimizer_placement_optimization_pass.cpp

* auto format by CI

* auto format by CI

* fix static check

* add tips for zero api change

* auto format by CI

Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Jun 10, 2022
1 parent 6aa2416 commit 95240c2
Show file tree
Hide file tree
Showing 11 changed files with 497 additions and 129 deletions.
3 changes: 1 addition & 2 deletions docs/source/graph.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@ Base class for running neural networks in Static Graph Mode.

.. autoclass:: oneflow.nn.graph.graph_config.GraphConfig
:members: enable_amp,
enable_zero,
allow_fuse_model_update_ops,
allow_fuse_add_to_output,
allow_fuse_cast_scale,
set_gradient_accumulation_steps,
set_zero_redundancy_optimizer_mode,
set_zero_redundancy_optimizer_min_size_after_split,
enable_cudnn_conv_heuristic_search_algo,
:member-order: bysource

Expand Down
3 changes: 3 additions & 0 deletions oneflow/core/job/eager_nccl_comm_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ void CreateNcclComm(ncclComm_t* comm, const int dev, const std::string& key,
<< ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank
<< ", key = {" << key << "}\n";
OF_NCCL_CHECK(ncclCommInitRank(comm, device_vec.size(), nccl_unique_id, rank));
VLOG(2) << " EagerNcclCommMgr::ncclCommInitRank succeed device_vec.size() = " << device_vec.size()
<< ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank
<< ", key = {" << key << "}\n";
}

} // namespace
Expand Down
6 changes: 6 additions & 0 deletions oneflow/core/job/job_build_and_infer_ctx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -997,13 +997,19 @@ Maybe<void> LazyJobBuildAndInferCtx::Complete() {
}
};
int32_t pass_cnt = 0;
const int64_t prev_v = FLAGS_v;
auto DoPass = [&](const std::string& pass_name, int32_t cnt = 0) -> Maybe<void> {
VLOG(1) << job_name << " is compiling with pass"
<< " pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name
<< (cnt > 0 ? std::to_string(cnt) : "");
if (unlikely(NeedLogJob(pass_name))) {
std::string cnt_str = cnt > 0 ? std::to_string(cnt) : "";
LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-before");
FLAGS_v = 3;
}
JUST(JobPass4Name(pass_name)(mut_job(), &job_pass_ctx));
if (unlikely(NeedLogJob(pass_name))) {
FLAGS_v = prev_v;
std::string cnt_str = cnt > 0 ? std::to_string(cnt) : "";
LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-after");
}
Expand Down
1 change: 1 addition & 0 deletions oneflow/core/job/job_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License.
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/container_util.h"
#include "oneflow/core/job/job.pb.h"
#include "oneflow/core/job/sbp_parallel.pb.h"
#include "oneflow/core/operator/operator.h"

namespace oneflow {
Expand Down
1 change: 1 addition & 0 deletions oneflow/core/job/job_conf.proto
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ message JobConfigProto {
optional bool enable_gradients_stats_aggregation = 106 [default = true];
optional string optimizer_placement_optimization_mode = 107;
optional int64 optimizer_placement_optimization_threshold = 108 [default = 1024];
optional int64 optimizer_placement_optimization_shard_restore_level = 110 [default = 2];

optional QatConfig qat_config = 109;

Expand Down
Loading

0 comments on commit 95240c2

Please sign in to comment.