Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plan separation compile #9920

Merged
merged 218 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from 70 commits
Commits
Show all changes
218 commits
Select commit Hold shift + click to select a range
560a511
implement RankTaskGraph
lixinqi Sep 19, 2022
51034f0
RankCompiler
lixinqi Sep 19, 2022
3736494
fix compiler complaints
lixinqi Sep 20, 2022
5807882
CompTaskNode::ConsumeFakeRegsts
lixinqi Sep 20, 2022
c07ea5e
TransportTaskProto::lbi
lixinqi Sep 20, 2022
1fd10ef
makes sure all ranks know all var_op_names
lixinqi Sep 22, 2022
74c96df
RankTaskGraph::ForEachDutyRank
lixinqi Sep 22, 2022
e89143b
PortableCtrlEdge
lixinqi Sep 23, 2022
1b10509
compile in MultiThreadLoop
lixinqi Sep 23, 2022
44bf12b
CompileMode
lixinqi Sep 23, 2022
bd50bc7
rebuild new_task_id_ before ProduceRegst
lixinqi Sep 26, 2022
7853956
RankTaskGraph::InitRegstDescsConsumers()
lixinqi Sep 26, 2022
b725318
PlanUtil::GenReachableTaskPairs
lixinqi Sep 27, 2022
45bc629
disable checking consumer_task_regst_desc_id_size
lixinqi Sep 27, 2022
3c4ea9d
TaskNode::InitConsumedRegstsFromProto
lixinqi Sep 27, 2022
9880ba4
remove RegstDesc::InitConsumersFromProto
lixinqi Sep 27, 2022
20175fc
refactor CompTaskNode::ConsumeFakeRegstsIf
lixinqi Sep 27, 2022
fbff274
refactor CompTaskNode::ConsumeFakeRegsts
lixinqi Sep 27, 2022
ede3cd2
remove Plan::fake_consumed_regst_desc_id
lixinqi Sep 27, 2022
3ba45e5
revert part of code in job/plan_util.cpp
lixinqi Sep 28, 2022
2e9ab1a
refacotr ParallelDesc::TryGetParallelId
lixinqi Sep 28, 2022
93a7947
cut boxing_task_graph by rank
lixinqi Sep 29, 2022
818d14d
make sure TaskIdGenerator::Generator is thread safe
lixinqi Oct 8, 2022
8ca22bf
atomic<int64_t> mem_block_id
lixinqi Oct 8, 2022
2adbb13
chunk id add lock
strint Oct 8, 2022
ccf9bea
get chunk proto with lock
strint Oct 9, 2022
2c577df
create chunk with lock
strint Oct 9, 2022
fa49459
mutable std::mutex
lixinqi Oct 9, 2022
a4e67b0
Rank task graph merge master (#9440)
strint Nov 22, 2022
7d69c25
fix conflict
strint Nov 22, 2022
d4782a7
auto format by CI
oneflow-ci-bot Nov 22, 2022
eb76987
fix conflict
strint Nov 22, 2022
bb9e65e
Merge branch 'rank_task_graph' of https://github.com/Oneflow-Inc/onef…
strint Nov 22, 2022
6b575fc
fix conflict
strint Nov 22, 2022
13ba2ac
auto format by CI
oneflow-ci-bot Nov 22, 2022
92face0
fix conflict
strint Nov 22, 2022
1b2edca
fix
strint Nov 22, 2022
3910af6
address pr comments
lixinqi Nov 24, 2022
a37f9f8
fix bug
strint Dec 13, 2022
132a8a7
Rank task graph fix (#9749)
strint Feb 28, 2023
f1352e6
rm useless
strint Feb 28, 2023
6b13581
fix muti thread merge bug
strint Feb 28, 2023
4f11a23
merge master
strint Feb 28, 2023
75e024b
fix seg fault
strint Mar 1, 2023
c11c934
refine fx test
strint Mar 1, 2023
a8cbe2b
refine master merge
strint Mar 1, 2023
402b4e2
Merge branch 'master' into plan_sep_compile_merge
strint Mar 1, 2023
5b82586
auto format by CI
oneflow-ci-bot Mar 1, 2023
e62ea6a
Merge branch 'master' into plan_sep_compile_merge
strint Mar 3, 2023
d617f0e
config straighten alg with env var
strint Mar 3, 2023
81a1f57
auto format by CI
oneflow-ci-bot Mar 3, 2023
8b9c084
refine deallocator
strint Mar 6, 2023
48ed476
index search O(log(n)) to O(1)
strint Mar 7, 2023
16a1268
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Mar 7, 2023
799c943
fix static check
strint Mar 7, 2023
d93b7a0
with to guard
strint Mar 7, 2023
2c142d0
refine thread mgr
strint Mar 7, 2023
b56afc0
rm debug code
strint Mar 9, 2023
1cb6a31
Merge branch 'master' into plan_sep_compile_merge
strint Mar 10, 2023
d0611eb
Update oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_…
strint Mar 13, 2023
5e23757
Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into …
strint Mar 15, 2023
0b1cbc9
Update oneflow/core/graph/compute_task_node.h
strint Mar 15, 2023
f78359e
refine code and comments
strint Mar 15, 2023
f3661e2
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Mar 15, 2023
717f72a
add comment
strint Mar 15, 2023
34baacd
Update oneflow/core/job/plan_util.cpp
strint Mar 15, 2023
0786e36
Update oneflow/core/job/plan_util.h
strint Mar 15, 2023
3fc7fe8
comment out debug test
strint Mar 15, 2023
c360988
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Mar 15, 2023
8faa985
address review
strint Mar 16, 2023
83aca5b
add to do of reachable_cb_pairs
strint Mar 16, 2023
8e0a201
rm debug code
strint Mar 16, 2023
0a14449
fix unnecessary cuda context
strint Mar 19, 2023
711d192
fix zombie process at compile error
strint Mar 20, 2023
3f929cc
fix singnal
strint Mar 20, 2023
7d71e88
debug 3d error
strint Mar 21, 2023
57a0259
fix SetForceInplaceMemBlock
strint Mar 23, 2023
52df355
add comment
strint Mar 23, 2023
9cdc0fd
log rank plan
strint Mar 23, 2023
c1b8dfe
debug 3d ac bug (#10031)
strint Mar 24, 2023
2871fa6
merge master
strint Mar 27, 2023
b8f8b08
Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into …
strint Mar 28, 2023
7ca3f5c
fix intrupt cannot exit
strint Mar 20, 2023
5597fe8
fix env cannot exit
strint Mar 28, 2023
b92ce13
add comment
strint Mar 28, 2023
74335c7
debug lazy run
strint Mar 28, 2023
3030a0f
revert change
strint Mar 29, 2023
1c09796
Merge branch 'fix_proc_not_exit' into add_debug_of_actor_v1
strint Mar 29, 2023
bda427d
add coll boxing log
strint Mar 29, 2023
be6c271
refine log
strint Mar 29, 2023
b2ec5fd
refine log
strint Mar 29, 2023
37dd7ce
merge updtream
strint Mar 29, 2023
564e262
refine thread manage
strint Mar 29, 2023
10603f1
add get range index of balance_spliter
strint Mar 29, 2023
c69f405
Merge branch 'spe4_balanced_split_get_idx' of https://github.com/Onef…
strint Mar 29, 2023
5990e3b
open ap test
strint Mar 29, 2023
fb4bfd7
format id pairs
strint Mar 29, 2023
d177b63
add new id pairst
strint Mar 29, 2023
0be303e
rm useless include
strint Mar 29, 2023
50ec139
add sep compile test
strint Mar 29, 2023
a442869
Update test_alexnet_auto_parallel.py
strint Mar 30, 2023
118784f
add sep compile unittest
strint Mar 30, 2023
9dba71e
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Mar 30, 2023
1d7b196
rm lazy run debug log
strint Apr 3, 2023
e74fd7e
Merge branch 'sep3_refin_thread' into spe4_balanced_split_get_idx
strint Apr 3, 2023
a18bee5
merge upstream
strint Apr 3, 2023
6259dfd
merge and fix
strint Apr 3, 2023
6d56de6
Merge branch 'master' into sep3_refin_thread
strint Apr 3, 2023
b8d3033
Update thread.cpp
strint Apr 3, 2023
6fc41b8
Merge branch 'sep3_refin_thread' of https://github.com/Oneflow-Inc/on…
strint Apr 3, 2023
e749fd4
address review
strint Apr 3, 2023
e24c9d8
merge master
strint Apr 3, 2023
f1c2078
remove mem_chain merge
chengtbf Apr 10, 2023
9f6d21e
TaskNode::order_in_chain
chengtbf Apr 10, 2023
51efb80
Merge branch 'master' into dev_cc_order_in_chain
chengtbf Apr 10, 2023
2c73e74
Update oneflow/core/graph/task_graph.cpp
chengtbf Apr 11, 2023
e4d30df
Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into …
chengtbf Apr 11, 2023
677ece4
refine task proto id
chengtbf Apr 11, 2023
ec7d071
Merge branch 'master' into dev_cc_order_in_chain
mergify[bot] Apr 11, 2023
95dd077
merge upstream
strint Apr 11, 2023
b4b1c37
auto format by CI
oneflow-ci-bot Apr 11, 2023
8dd52e9
comment mode and fix merge
strint Apr 11, 2023
973fa46
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Apr 11, 2023
fe31a50
fix merge
strint Apr 12, 2023
81311e1
auto format by CI
oneflow-ci-bot Apr 12, 2023
891d5b3
Update actor.cpp
strint Apr 12, 2023
e9d9c5b
rm collective boxing in seperation compile (#10112)
strint Apr 12, 2023
e3833cd
revert to master test
strint Apr 12, 2023
5cc147b
auto format by CI
oneflow-ci-bot Apr 12, 2023
8731999
Revert "rm collective boxing in seperation compile" (#10113)
strint Apr 12, 2023
cf8cf13
comment out this test
strint Apr 12, 2023
62dee17
add task to/from proto
strint Apr 12, 2023
db84632
Update boxing_task_graph.proto
strint Apr 12, 2023
49c8d18
Update task_edge.proto
strint Apr 12, 2023
041a4b3
Update task_graph_rebuild_ctx.cpp
strint Apr 12, 2023
3e08944
Update task_graph_rebuild_ctx.h
strint Apr 12, 2023
e0cf92b
Update transport_task_node.cpp
strint Apr 12, 2023
108891e
merge upstream
strint Apr 12, 2023
5f312a6
Merge branch 'sep0_task_proto' of https://github.com/Oneflow-Inc/onef…
strint Apr 12, 2023
08aa237
auto format by CI
oneflow-ci-bot Apr 12, 2023
db5fdc6
fix merge
strint Apr 12, 2023
ddffa4a
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Apr 12, 2023
5a7f554
Plan sep compile merge rm collective boxing (#10114)
strint Apr 12, 2023
008239e
support infer desc choose method
strint Apr 13, 2023
311cb7a
merge upstream
strint Apr 13, 2023
87dae49
rm unrelated (#10126)
strint Apr 13, 2023
be2987d
refine comment
strint Apr 13, 2023
9acbc79
rm useless
strint Apr 14, 2023
9e7f0ec
Merge branch 'sep2_custom_blobdesc_infer' of https://github.com/Onefl…
strint Apr 14, 2023
0c4ff96
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Apr 14, 2023
b0332fb
Update collective_boxing_task_node.cpp
strint Apr 14, 2023
111673c
Update task_node.cpp
strint Apr 14, 2023
9d75b96
Update tick_compute_task_node.cpp
strint Apr 14, 2023
34b3133
add comsume fake regst
strint Apr 14, 2023
acff92c
fix typo
strint Apr 14, 2023
bfcebd3
Merge branch 'sep3_fake_regst' into plan_sep_compile_merge
strint Apr 14, 2023
aabc9c7
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Apr 14, 2023
17203d0
add task factory to create new task node
strint Apr 14, 2023
4377368
Merge branch 'sep0_task_proto' into sep2_custom_blobdesc_infer
strint Apr 14, 2023
88f4297
add infer from ndsbp
strint Apr 14, 2023
0e7b8ed
Merge branch 'sep2_custom_blobdesc_infer' into sep3_fake_regst
strint Apr 14, 2023
993efb8
Merge branch 'sep3_fake_regst' into plan_sep_compile_merge
strint Apr 14, 2023
8f5e6f7
merge upstream
strint Apr 14, 2023
dbae209
rm duplicated
strint Apr 14, 2023
266c388
rm useless
strint Apr 14, 2023
9f952bd
Merge branch 'sep0_task_proto' into sep2_custom_blobdesc_infer
strint Apr 14, 2023
a9ad100
Merge branch 'sep2_custom_blobdesc_infer' into sep3_fake_regst
strint Apr 14, 2023
3921231
Merge branch 'sep3_fake_regst' into plan_sep_compile_merge
strint Apr 14, 2023
7514cbe
fix merge
strint Apr 14, 2023
5805ea5
rm useless
strint Apr 14, 2023
e8b3053
rm useless
strint Apr 14, 2023
d7b7594
add rank compiler
strint Apr 14, 2023
95f59a3
merge upstream
strint Apr 14, 2023
2c4c3e2
fix merge
strint Apr 15, 2023
09154d0
add rank compiler
strint Apr 15, 2023
fd2c2a3
merge upstream
strint Apr 15, 2023
1936fdc
rm debug code
strint Apr 15, 2023
f211abb
rm lazy.h
strint Apr 15, 2023
b0c7ad0
rm useless change
strint Apr 15, 2023
298aea4
rm useless
strint Apr 15, 2023
5fd0870
rm deallocate ctx (#10143)
strint Apr 15, 2023
7df10e7
Remove debug compile mode (#10145)
strint Apr 15, 2023
410c71f
merge master
strint May 11, 2023
e9a20de
auto format by CI
oneflow-ci-bot May 11, 2023
57499b7
fix merge
strint May 12, 2023
d169bdd
Merge branch 'sep4_rank_task_graph' of https://github.com/Oneflow-Inc…
strint May 12, 2023
c956597
Merge branch 'master' into sep4_rank_task_graph
strint May 12, 2023
c2039df
fix licence
strint May 12, 2023
09293c4
Merge branch 'sep4_rank_task_graph' of https://github.com/Oneflow-Inc…
strint May 12, 2023
4586502
move CreateOpAttributeRef
strint May 20, 2023
45c1ca4
Merge branch 'master' into sep4_rank_task_graph
strint May 21, 2023
bded88f
fix NeedBoxing for NDSBP
strint May 24, 2023
af8303e
Merge branch 'sep4_rank_task_graph' of https://github.com/Oneflow-Inc…
strint May 24, 2023
adb1b08
address review
strint May 27, 2023
429aa14
Merge branch 'master' into sep4_rank_task_graph
strint Jun 3, 2023
a916407
fix static check
strint Jun 3, 2023
c00c942
merge upstream
strint Jun 3, 2023
76f0157
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Jun 3, 2023
cc91242
Merge branch 'master' into plan_sep_compile_merge
strint Jun 4, 2023
70c2192
fix conflict
strint Jun 4, 2023
105244d
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Jun 4, 2023
184289f
pass test
strint Jun 4, 2023
7efa97a
auto format by CI
oneflow-ci-bot Jun 4, 2023
9a8499f
revert test
strint Jun 4, 2023
bfcd7d7
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Jun 4, 2023
a5b0251
refine
strint Jun 4, 2023
b98e25f
auto format by CI
oneflow-ci-bot Jun 4, 2023
6d4e214
fix static check
strint Jun 5, 2023
3d677f3
Merge branch 'plan_sep_compile_merge' of https://github.com/Oneflow-I…
strint Jun 5, 2023
6ae7714
Update test_graph_separate_compile.py
strint Jun 7, 2023
a1ed190
fix test
strint Jun 12, 2023
40e9160
auto format by CI
oneflow-ci-bot Jun 12, 2023
47a64f3
Merge branch 'master' into plan_sep_compile_merge
strint Jun 13, 2023
48c73cf
refine case
strint Jun 13, 2023
d9d4771
rm test
strint Jun 13, 2023
35aae5f
refine test
strint Jun 13, 2023
e6c539b
refine test
strint Jun 13, 2023
03d8f0a
refine test
strint Jun 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ endif()

message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")

set(COMPILER_VERSION_ERROR_MSG "At least gcc 9, clang 5 or Apple clang 12 is supported.")
set(COMPILER_VERSION_ERROR_MSG
"At least gcc 9, clang 5 or Apple clang 12 is supported. Current version ${CMAKE_CXX_COMPILER_VERSION}."
)

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 9)
Expand Down Expand Up @@ -151,6 +153,10 @@ if(BUILD_PROFILER)
add_definitions(-DOF_ENABLE_PROFILER)
endif()

if(BUILD_DEBUG_LAZY_RUNTIME)
add_definitions(-DOF_DEBUG_LAZY_RUNTIME)
endif()

if(OF_SOFTMAX_USE_FAST_MATH)
add_definitions(-DOF_SOFTMAX_USE_FAST_MATH)
endif()
Expand Down
29 changes: 29 additions & 0 deletions oneflow/core/common/async_deallocate_context.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/async_deallocate_context.h"
#include "oneflow/core/thread/thread_pool.h"

namespace oneflow {

// Builds the private thread pool with exactly one worker thread; Dispatch() hands its
// work items to this pool.
AsyncDeallocateContext::AsyncDeallocateContext()
    : thread_pool_(std::make_unique<ThreadPool>(1)) {}

// Defined out of line so that ThreadPool is a complete type where the pool is destroyed.
AsyncDeallocateContext::~AsyncDeallocateContext() = default;

void AsyncDeallocateContext::Dispatch(std::function<void()> Handle) {
thread_pool_->AddWork(Handle);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

支持在一个独立线程里释放大对象

Copy link
Contributor

@leaves-zwx leaves-zwx Mar 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

需要实现一个新的类吗?直接调用全局 thread pool 是不是也是一样的?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

同问,这里的 AsyncDeallocateContext 感觉意义不是很大, 直接使用全局的 thread pool 会有什么问题呢?

Copy link
Contributor Author

@strint strint Apr 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里是为了临时启用一个线程,做异步的大对象释放(开销明显),主线程可以省掉大对象析构的时间。

这个线程在做对象释放时,主线程可能在驱动 Global thread pool 去做通信。所以不适合复用。

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

主线程可能在驱动 Global thread pool 去做通信

什么情况下 Global thread pool 会去做通信呢

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

主线程可能在驱动 Global thread pool 去做通信

什么情况下 Global thread pool 会去做通信呢

// Separation plan compilation is done by:
// a. Master broadcasts the job (or logical graph) to all workers, so that all ranks use the same job.
// b. Master compiles the BoxingTaskGraph and broadcasts it to all workers. The BoxingTaskGraph needs to be
// built on the master rank.
// c. Each rank compiles its related task nodes with RankCompiler. RankCompiler compiles with the
// BoxingTaskGraph and the job.
// d. Master CollectiveBoxingPlan and then broadcast to all the workers.

比如 a d 里面的同步数据的操作,调用了 MultiThreadLoop

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// b. Master compiles the BoxingTaskGraph and broadcasts it to all workers. The BoxingTaskGraph needs to be
// built on the master rank.

b 里面不会触发数据同步 和 MultiThreadLoop 吗?

Copy link
Contributor Author

@strint strint Apr 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

b 里面不会触发数据同步 和 MultiThreadLoop 吗?

也会,通信都用的 MultiThreadLoop

}

} // namespace oneflow
41 changes: 41 additions & 0 deletions oneflow/core/common/async_deallocate_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_COMMON_ASYNC_DEALLOCATE_CONTEXT_H_
#define ONEFLOW_CORE_COMMON_ASYNC_DEALLOCATE_CONTEXT_H_

#include "oneflow/core/common/deallocate_context.h"
#include "oneflow/core/common/util.h"

namespace oneflow {

class ThreadPool;

// Deallocation context that releases large objects on a separate thread: every handle passed
// to Dispatch() is queued on a privately owned thread pool instead of being run inline.
class AsyncDeallocateContext final : public DeallocateContext {
 public:
  AsyncDeallocateContext();
  // Declared here and defined in the .cpp file, where ThreadPool is a complete type.
  ~AsyncDeallocateContext() override;
  OF_DISALLOW_COPY_AND_MOVE(AsyncDeallocateContext);

  // Queues `Handle` on the private thread pool.
  void Dispatch(std::function<void()> Handle) override;

 private:
  // The pool is never shared with anyone else, so exclusive ownership is sufficient.
  std::unique_ptr<ThreadPool> thread_pool_;
};

} // namespace oneflow

#endif // ONEFLOW_CORE_COMMON_ASYNC_DEALLOCATE_CONTEXT_H_
13 changes: 13 additions & 0 deletions oneflow/core/common/balanced_splitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ BalancedSplitter::BalancedSplitter(int64_t total_num, int64_t split_num) {
base_part_size_ = total_num / split_num;
base_begin_idx_ = total_num % split_num;
split_num_ = split_num;
CHECK_EQ(this->total_num(), total_num);
}

// Returns the total number of elements covered by the splitter, recovered as the end
// position of the last range (index split_num_ - 1).
int64_t BalancedSplitter::total_num() const { return At(split_num_ - 1).end(); }

Range BalancedSplitter::At(int64_t idx) const {
CHECK_LT(idx, split_num_);
int64_t left_bound = -1;
Expand All @@ -46,4 +49,14 @@ Range BalancedSplitter::At(int64_t first_idx, int64_t last_idx) const {
return Range(first_range.begin(), last_range.end());
}

int64_t BalancedSplitter::GetRangIndex(int64_t value) const {
CHECK_GE(value, 0);
CHECK_LT(value, total_num());
int64_t base_size = (base_part_size_ + 1) * base_begin_idx_;
if (value < base_size) {
return value / (base_part_size_ + 1);
} else {
return base_begin_idx_ + (value - base_size) / base_part_size_;
}
}
} // namespace oneflow
4 changes: 4 additions & 0 deletions oneflow/core/common/balanced_splitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ class BalancedSplitter final {
Range At(int64_t idx) const;
Range At(int64_t first_idx, int64_t last_idx) const;

// Get the index of the range that contains `value`.
int64_t GetRangIndex(int64_t value) const;
int64_t total_num() const;

private:
int64_t base_part_size_;
int64_t base_begin_idx_;
Expand Down
13 changes: 13 additions & 0 deletions oneflow/core/common/balanced_splitter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,17 @@ TEST(BalancedSplitter, split_2_to_3_part) {
ASSERT_TRUE(splitter.At(2) == Range(2, 2));
}

// Checks that GetRangIndex() inverts At(): every value inside the i-th range must map back
// to index i, and the splitter must report the total it was constructed with.
TEST(BalancedSplitter, GetRangeIndex) {
  // Use the splitter's own signed 64-bit type so no comparison mixes signed and unsigned.
  const int64_t total_num = 937;
  const int64_t split_num = 11;
  BalancedSplitter bs(total_num, split_num);
  // ASSERT_EQ prints both operands on failure, unlike ASSERT_TRUE(a == b).
  ASSERT_EQ(bs.total_num(), total_num);
  for (int64_t i = 0; i < split_num; ++i) {
    Range range = bs.At(i);
    for (int64_t value = range.begin(); value < range.end(); ++value) {
      ASSERT_EQ(bs.GetRangIndex(value), i);
    }
  }
}

} // namespace oneflow
54 changes: 54 additions & 0 deletions oneflow/core/common/deallocate_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_COMMON_DEALLOCATE_CONTEXT_H_
#define ONEFLOW_CORE_COMMON_DEALLOCATE_CONTEXT_H_

#include <functional>
#include <memory>

namespace oneflow {

class DeallocateContext {
public:
DeallocateContext() = default;
virtual ~DeallocateContext() = default;

// Support customizing the Dispatch method for releasing a pointer, such as releasing it in a
// separate thread. Note that T also needs to be a smart pointer.
template<typename T>
void Deallocate(std::shared_ptr<T>&& ptr) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

单独的多线程释放,可以单独一个 PR(后置优化 PR)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

单独的多线程释放,可以单独一个 PR(后置优化 PR)

done

std::shared_ptr<T> data = ptr;
ptr.reset();
// reset shared_ptr inside data by customized Dispatch
Dispatch([data] { const_cast<std::shared_ptr<T>*>(&data)->reset(); });
// reset data
data.reset();
}

virtual void Dispatch(std::function<void()> Handle) = 0;
};

class NaiveDeallocateContext final : public DeallocateContext {
public:
NaiveDeallocateContext() = default;
~NaiveDeallocateContext() = default;

void Dispatch(std::function<void()> Handle) { Handle(); }
};

} // namespace oneflow

#endif // ONEFLOW_CORE_COMMON_DEALLOCATE_CONTEXT_H_
11 changes: 11 additions & 0 deletions oneflow/core/common/env_var/env_var.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ int64_t ThreadLocalEnvInteger();

DEFINE_THREAD_LOCAL_ENV_INTEGER(ONEFLOW_THRAED_LOCAL_CACHED_SIZE, 128 * 1024);

// Primary template for string-valued environment variables. It is only declared here;
// each variable gets its own specialization from DEFINE_THREAD_LOCAL_ENV_STRING below.
template<typename env_var>
const std::string& ThreadLocalEnvString();

// Defines an empty tag struct named `env_var` and specializes ThreadLocalEnvString for it.
// The specialization reads the environment variable whose name is the stringized `env_var`
// through GetStringFromEnv, using `default_value` as the fallback argument, stores the result
// in a function-local thread_local std::string, and returns a reference to that string.
#define DEFINE_THREAD_LOCAL_ENV_STRING(env_var, default_value) \
struct env_var {}; \
template<> \
inline const std::string& ThreadLocalEnvString<env_var>() { \
thread_local std::string value = GetStringFromEnv(OF_PP_STRINGIZE(env_var), default_value); \
return value; \
}

} // namespace oneflow

#endif // ONEFLOW_CORE_COMMON_ENV_VAR_ENV_VAR_H_
30 changes: 30 additions & 0 deletions oneflow/core/common/env_var/lazy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_COMMON_ENV_VAR_LAZY_H_
#define ONEFLOW_CORE_COMMON_ENV_VAR_LAZY_H_

#include <string>
#include "oneflow/core/common/env_var/env_var.h"

namespace oneflow {

// Default compilation mode and default number of threads during compilation.
DEFINE_THREAD_LOCAL_ENV_STRING(ONEFLOW_LAZY_COMPILE_MODE, "naive");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里应该把合法的 "mode" str 罗列出来

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里应该把合法的 "mode" str 罗列出来

done

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

既然: rank_per_iter 和 rank_per_thread 是用于中间调试用的 mode,那么最终应该改成一个 bool 环境变量:

ENABLE_ONEFLOW_LAZY_COMPILE_PER_RANK 之类的环境变量,开启或者关闭

DEFINE_THREAD_LOCAL_ENV_INTEGER(ONEFLOW_LAZY_COMPILE_RPC_THREAD_NUM, 16);

} // namespace oneflow

#endif // ONEFLOW_CORE_COMMON_ENV_VAR_LAZY_H_
41 changes: 41 additions & 0 deletions oneflow/core/common/id_pairs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_COMMON_ID_PAIRS_H_
#define ONEFLOW_CORE_COMMON_ID_PAIRS_H_

#include "oneflow/core/common/id_pairs.pb.h"
#include <unordered_set>
#include <utility>

namespace oneflow {

// Appends every (first, second) pair from `pairs` to the repeated `int64_pair` field of
// `proto`. Entries already present in `proto` are kept.
// Declared `inline` because this function is defined in a header: without it, including the
// header from more than one translation unit yields multiple-definition link errors.
inline void InitIdPairs(const std::unordered_set<std::pair<int64_t, int64_t>>& pairs,
                        IdPairs* proto) {
  for (const auto& pair : pairs) {
    auto* proto_pair = proto->mutable_int64_pair()->Add();
    proto_pair->set_first(pair.first);
    proto_pair->set_second(pair.second);
  }
}

// Inserts every pair stored in `id_pairs` into `pairs`. Pairs already present in the set are
// left as they are, since the set keeps a single copy of each element.
// Declared `inline` because this function is defined in a header: without it, including the
// header from more than one translation unit yields multiple-definition link errors.
inline void MergeIdPairs(const IdPairs& id_pairs,
                         std::unordered_set<std::pair<int64_t, int64_t>>* pairs) {
  for (const auto& pair : id_pairs.int64_pair()) {
    // emplace constructs the std::pair in place; no temporary make_pair is needed.
    pairs->emplace(pair.first(), pair.second());
  }
}

} // namespace oneflow

#endif // ONEFLOW_CORE_COMMON_ID_PAIRS_H_
11 changes: 11 additions & 0 deletions oneflow/core/common/id_pairs.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
syntax = "proto2";
package oneflow;

// An ordered pair of 64-bit signed integers; both fields must be set.
message Int64Pair {
required int64 first = 1;
required int64 second = 2;
}

// A list of Int64Pair entries; this is the message filled by InitIdPairs and read by
// MergeIdPairs in id_pairs.h.
message IdPairs {
repeated Int64Pair int64_pair = 1;
}
6 changes: 1 addition & 5 deletions oneflow/core/framework/instructions_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ Maybe<void> InstructionsBuilder::SoftSyncNNGraphBuffers(
namespace {

int64_t NewSymbolId() {
static std::atomic<int64_t> cnt(0);
static std::atomic<int64_t> cnt(1);
return cnt.fetch_add(1, std::memory_order_relaxed);
}

Expand All @@ -225,10 +225,6 @@ Maybe<Scope> InstructionsBuilder::GetScopeSymbol(const ScopeProto& scope_proto)
return Singleton<symbol::Storage<Scope>>::Get()->FindOrCreate(scope_proto, &NewSymbolId);
}

Maybe<OperatorConfSymbol> InstructionsBuilder::GetOpConfSymbol(const OperatorConf& op_conf) {
return Singleton<symbol::Storage<OperatorConfSymbol>>::Get()->FindOrCreate(op_conf, &NewSymbolId);
}

Maybe<Scope> InstructionsBuilder::BuildInitialScope(
int64_t session_id, const JobConfigProto& job_conf, const std::string& device_tag,
const std::vector<std::string>& machine_device_ids, const std::shared_ptr<Shape>& hierarchy,
Expand Down
Loading