Add check of placement constructor #6991
Changes from 7 commits
File 1 (file name not captured):
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include <algorithm>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/operators.h>
@@ -30,13 +31,26 @@ limitations under the License.
 #include "oneflow/core/job/placement.cfg.h"
 #include "oneflow/core/job/global_for.h"
 #include "oneflow/core/job/resource_desc.h"
+#ifdef WITH_CUDA
+#include <cuda.h>
+#endif  // WITH_CUDA

 namespace py = pybind11;

 namespace oneflow {

 namespace {

+int64_t GetGpuDeviceNum() {
+#ifndef WITH_CUDA
+  return 0;
+#else
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  return device_count;
+#endif
+}
+
 Maybe<Shape> MakeShape(const py::tuple& py_shape) {
   DimVector shape_dims{};
   for (const auto& dim : py_shape) { shape_dims.emplace_back(dim.cast<int64_t>()); }
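One note on the helper above: cudaGetDeviceCount returns a cudaError_t, which this version discards. A hedged sketch of a variant that folds a failed query into "zero devices" (illustrative only, not part of the diff; GetGpuDeviceNumOrZero is an invented name):

```cpp
#ifdef WITH_CUDA
#include <cuda_runtime.h>  // cudaGetDeviceCount is part of the CUDA runtime API

#include <cstdint>

// Hypothetical variant: treat any CUDA runtime error as "no visible devices",
// so a failed query yields 0 rather than whatever was left in device_count.
int64_t GetGpuDeviceNumOrZero() {
  int device_count = 0;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) { return 0; }
  return device_count;
}
#endif  // WITH_CUDA
```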
@@ -150,6 +164,12 @@ struct PlacementSymbolExportUtil {
     if (iter == device_tag2placement.end()) {
       int64_t node_size = GlobalProcessCtx::NodeSize();
       int64_t device_num = GlobalProcessCtx::NumOfProcessPerNode();
+      if (device_tag == "gpu") {

[review comment] Right.

+        const int64_t gpu_device_num = GetGpuDeviceNum();
+        CHECK_NE(gpu_device_num, 0)
+            << "Can't construct placement with \"cuda\" type because there is no CUDA device!";
+        device_num = std::min(device_num, gpu_device_num);
+      }
       std::vector<std::string> machine_device_ids;
       for (int64_t node_id = 0; node_id < node_size; ++node_id) {
         std::string device_name = std::to_string(node_id) + ":0-" + std::to_string(device_num - 1);
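To make the clamp concrete: with, say, 4 processes per node but only 2 visible GPUs, device_num becomes min(4, 2) = 2, so the per-node device range string built a few lines below is "0:0-1". A standalone sketch of that arithmetic (values invented for illustration):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>

int main() {
  // Mirrors the branch above: 4 processes per node, only 2 CUDA devices.
  int64_t device_num = 4;            // stand-in for GlobalProcessCtx::NumOfProcessPerNode()
  const int64_t gpu_device_num = 2;  // stand-in for GetGpuDeviceNum()
  device_num = std::min(device_num, gpu_device_num);

  // The per-node device range string constructed after the clamp.
  const int64_t node_id = 0;
  std::string device_name = std::to_string(node_id) + ":0-" + std::to_string(device_num - 1);
  std::cout << device_name << "\n";  // prints "0:0-1"
  return 0;
}
```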
File 2 (file name not captured):
@@ -13,22 +13,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include <algorithm>
 #include "oneflow/core/job/parallel_desc.h"
 #include "oneflow/core/job/placement.cfg.h"
 #include "oneflow/core/common/decorator.h"
 #include "oneflow/core/common/util.h"
+#include "oneflow/core/common/multi_client.h"
+#include "oneflow/core/common/cpp_attribute.h"
 #include "oneflow/core/job/global_for.h"
 #include "oneflow/core/job/id_manager.h"
 #include "oneflow/core/control/global_process_ctx.h"
 #include "oneflow/core/framework/parallel_conf_util.h"
 #include "oneflow/core/framework/instructions_builder.h"
 #include "oneflow/core/framework/device.h"
 #include "oneflow/core/vm/vm_util.h"
+#ifdef WITH_CUDA
+#include <cuda.h>
+#endif  // WITH_CUDA

 namespace oneflow {

 namespace {

+int64_t GetGpuDeviceNum() {
+#ifndef WITH_CUDA
+  return 0;
+#else
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  return device_count;
+#endif
+}
+
 using MachineId2DeviceIdList =
     std::shared_ptr<HashMap<int64_t, std::shared_ptr<std::vector<int64_t>>>>;
@@ -302,6 +318,34 @@ Maybe<void> ParallelDesc::CheckWithResourceDesc(const ResourceDesc& resource_des
   return Maybe<void>::Ok();
 }

+Maybe<void> ParallelDesc::CheckDeviceIdsIsValid() const {
+  if (likely(JUST(IsMultiClient()))) {
[review comment] Only multi-client mode performs this check.
[review comment] Is `likely` some newly added syntax? 😂
[review comment] `likely` is a macro defined in cpp_attribute.h. You can think of it as telling the compiler that this branch is the one taken most of the time, so it should optimize along it, which helps machine-instruction prefetch. (A sketch of such a macro appears after this hunk.)
[review comment] There is a corresponding `unlikely` as well.
+    const auto& sorted_dev_phy_ids_iter =
+        machine_id2sorted_dev_phy_ids_->find(GlobalProcessCtx::Rank());
+    if (sorted_dev_phy_ids_iter != machine_id2sorted_dev_phy_ids_->end()) {
+      for (int64_t dev_phy_id : *sorted_dev_phy_ids_iter->second) {
+        if (device_type_ == DeviceType::kCUDA) {
+          const int64_t gpu_device_num = GetGpuDeviceNum();
+          CHECK_NE_OR_RETURN(gpu_device_num, 0)
+              << "Placement with \"cuda\" type is invalid because there is no CUDA device!";
+          int64_t device_num = std::min(GlobalProcessCtx::NumOfProcessPerNode(), gpu_device_num);
+          CHECK_LT_OR_RETURN(dev_phy_id, device_num)
+              << "Placement is invalid because device id must be less than "
+              << (gpu_device_num < GlobalProcessCtx::NumOfProcessPerNode()
+                      ? "num of CUDA devices on node"
+                      : "num of process per node");
+        } else if (device_type_ == DeviceType::kCPU) {
+          CHECK_LT_OR_RETURN(dev_phy_id, GlobalProcessCtx::NumOfProcessPerNode())
+              << "Placement is invalid because device id must be less than num of process per node";
+        } else {
+          OF_UNIMPLEMENTED();
+        }
+      }
+    }
+  }
+  return Maybe<void>::Ok();
+}
+
 ParallelConf ParallelDesc::GetParallelIdOnlyParallelConf(int64_t parallel_id) const {
   ParallelConf parallel_conf;
   std::string rank = std::to_string(CHECK_JUST(MachineId4ParallelId(parallel_id)));
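As referenced in the review thread above, here is a minimal sketch of how a likely/unlikely macro pair is commonly defined on GCC/Clang. The real definitions live in oneflow/core/common/cpp_attribute.h and may differ; this is an illustration, not the actual file:

```cpp
// __builtin_expect tells the optimizer which branch to lay out as the hot
// path; on other compilers the macros degrade to a plain pass-through.
#if defined(__GNUC__) || defined(__clang__)
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif
```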
@@ -456,6 +500,11 @@ Maybe<Symbol<ParallelDesc>> RawTxtStringToPlacement(const std::string& parallel_
   return SymbolOf(ParallelDesc(parallel_conf));
 }

+Maybe<void> RawCheckDeviceIdsIsValid(Symbol<ParallelDesc> placement) {
+  JUST(placement->CheckDeviceIdsIsValid());
+  return Maybe<void>::Ok();
+}
+
 }  // namespace

 decltype(GetParallelId4CurrentProcessCtx) GetParallelId4CurrentProcessCtx =
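To see the rule that CheckDeviceIdsIsValid enforces in isolation, here is a self-contained sketch with the OneFlow plumbing replaced by plain values. All names here (DeviceIdsValidForCuda, the parameters) are invented; num_process_per_node and gpu_device_num stand in for GlobalProcessCtx::NumOfProcessPerNode() and GetGpuDeviceNum():

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Every device id in the placement must be below the effective bound,
// which is the smaller of processes-per-node and visible CUDA devices.
bool DeviceIdsValidForCuda(const std::vector<int64_t>& dev_phy_ids,
                           int64_t num_process_per_node, int64_t gpu_device_num) {
  if (gpu_device_num == 0) { return false; }  // no CUDA device at all
  const int64_t device_num = std::min(num_process_per_node, gpu_device_num);
  return std::all_of(dev_phy_ids.begin(), dev_phy_ids.end(),
                     [&](int64_t id) { return id < device_num; });
}

int main() {
  // 4 processes per node but only 2 visible GPUs: ids 0-1 pass, id 2 fails.
  std::cout << DeviceIdsValidForCuda({0, 1}, 4, 2) << "\n";  // prints 1
  std::cout << DeviceIdsValidForCuda({2}, 4, 2) << "\n";     // prints 0
  return 0;
}
```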
@@ -467,5 +516,7 @@ decltype(PlacementToString) PlacementToString = DECORATE(&RawPlacementToString,
 decltype(GetTensorDevice) GetTensorDevice = DECORATE(&RawGetTensorDevice, ThreadLocal);
 decltype(TxtStringToPlacement) TxtStringToPlacement =
     DECORATE(&RawTxtStringToPlacement, ThreadLocalCopiable);
+decltype(CheckDeviceIdsIsValid) CheckDeviceIdsIsValid =
+    DECORATE(&RawCheckDeviceIdsIsValid, ThreadLocal);

 }  // namespace oneflow
File 3 (file name not captured):
@@ -107,6 +107,7 @@ class ParallelDesc final {

   std::shared_ptr<cfg::ParallelConf> cfg_parallel_conf() const { return cfg_parallel_conf_; }
   bool TryGetParallelId(int64_t machine_id, int64_t device_id, int64_t* parallel_id) const;
+  Maybe<void> CheckDeviceIdsIsValid() const;

  private:
   friend Maybe<OFRecord> ParseMachineAndDeviceIdList(const ParallelConf& parallel_conf);
@@ -149,6 +150,7 @@ extern Maybe<Symbol<ParallelDesc>> (*ReplaceDeviceType)(Symbol<ParallelDesc>, De
 extern Maybe<std::string> (*PlacementToString)(Symbol<ParallelDesc> placement);
 extern Maybe<Symbol<Device>> (*GetTensorDevice)(Symbol<ParallelDesc> parallel_desc);
 extern Maybe<Symbol<ParallelDesc>> (*TxtStringToPlacement)(const std::string& parallel_conf_str);
+extern Maybe<void> (*CheckDeviceIdsIsValid)(Symbol<ParallelDesc> placement);

[review comment] According to 833#issuecomment-993179425, this interface is still kept; the suggestion is to run the check where the placement is used.
[review comment] Then let's revert to the previous design. If the check is definitely done where the placement is used, it really doesn't need to be added here, which saves ParallelDesc some extra complexity.
[review comment] Reverted to the previous design; the check is now done where the placement is used.
 inline bool operator==(const ParallelConf& lhs, const ParallelConf& rhs) {
   return ParallelDesc(lhs) == ParallelDesc(rhs);
[review comment] The placement checks are scattered all over the place, which feels like it adds a burden on users. Wasn't the earlier approach of checking at construction time better? That way correctness is guaranteed the moment a placement is created.
[review comment] Checking at construction time causes some problems in local code; the discussion is here: commnet-in-issue883.
[review comment] xinqi wants placement to be designed as a logical description: no physical devices are needed when it is created, and the physical devices it requires are only checked at run time. Put that way, it is indeed reasonable.
[review comment] The placement checks are scattered across too many places. Every consistent functor has to check the placement. I don't think that is a good design..... I lean toward checking at construction time.
[review comment] I also think it is reasonable: as configuration for the consistent layer, placement really should be a logical description.
[review comment] But once this logical description exceeds the actual physical limits, it cannot be used by any tensor. What is the point of its existing at all?
[review comment] Placement as a logical description is fine in itself. But this check must not be allowed to spread to every place a placement is used, unless we can think of another approach that both lets a placement be defined arbitrarily at definition time and still performs this check to guarantee that a placement is within the valid range when it is used.
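For what it's worth, the trade-off being debated can be made concrete with a toy model in which construction never validates and the check runs only at use time. Everything below, names and values alike, is invented for illustration and is not OneFlow code:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Toy model: Placement is a purely logical description, so constructing one
// never fails, even if it names devices that do not physically exist.
struct Placement {
  std::vector<int64_t> device_ids;
};

constexpr int64_t kVisibleDevices = 2;  // stand-in for the runtime device count

// The "use-time" check: callers invoke this before actually using a placement.
void CheckValid(const Placement& p) {
  for (int64_t id : p.device_ids) {
    if (id >= kVisibleDevices) { throw std::runtime_error("device id out of range"); }
  }
}

int main() {
  Placement logical{{0, 1, 7}};  // fine to create: it is only a description
  try {
    CheckValid(logical);         // fails only once something tries to use it
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";  // prints "device id out of range"
  }
  return 0;
}
```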