From bbd0d1d3b4b680ce280dabca9db3eb98d0b1e587 Mon Sep 17 00:00:00 2001
From: binbinHan
Date: Thu, 16 Dec 2021 18:56:11 +0800
Subject: [PATCH] Add check of placement constructor (#6991)

* add_check_of_placement_constructor

* move CheckDeviceIdsIsValid to runtime

* handle comment

* fix error

* fix error

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
---
 oneflow/api/python/framework/op_expr.cpp      |  1 +
 oneflow/api/python/functional/tensor_api.cpp  |  4 ++
 .../api/python/symbol/placement_symbol.cpp    | 20 ++++++++
 .../core/functional/impl/array_functor.cpp    |  2 +
 .../core/functional/impl/consistent_cast.cpp  |  2 +
 .../core/functional/impl/dataset_functor.cpp  |  1 +
 oneflow/core/functional/impl/math_functor.cpp |  3 ++
 .../core/functional/impl/random_functor.cpp   |  5 ++
 oneflow/core/job/parallel_desc.cpp            | 51 +++++++++++++++++++
 oneflow/core/job/parallel_desc.h              |  2 +
 10 files changed, 91 insertions(+)

diff --git a/oneflow/api/python/framework/op_expr.cpp b/oneflow/api/python/framework/op_expr.cpp
index 9a32ef35617..ae0ef563806 100644
--- a/oneflow/api/python/framework/op_expr.cpp
+++ b/oneflow/api/python/framework/op_expr.cpp
@@ -51,6 +51,7 @@ Maybe<py::object> Interpret(const one::OpExpr& op,
 Maybe<py::object> Interpret(const one::OpExpr& op, const Symbol<ParallelDesc>& placement,
                             const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple,
                             const AttrMap& attrs) {
+  JUST(CheckDeviceIdsIsValid(placement));
   CHECK_EQ_OR_RETURN(op.input_size(), 0)
       << " the op : " << op.op_type_name()
       << " is NOT source op with input_size = " << op.input_size();
diff --git a/oneflow/api/python/functional/tensor_api.cpp b/oneflow/api/python/functional/tensor_api.cpp
index 10c162dd182..c1b79e68b52 100644
--- a/oneflow/api/python/functional/tensor_api.cpp
+++ b/oneflow/api/python/functional/tensor_api.cpp
@@ -75,6 +75,7 @@ class ConsistentTensorWithDataFunctor {
                            const bool& requires_grad) const {
     // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now.
     LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
+    JUST(CheckDeviceIdsIsValid(placement));
 
     if (PyTensorCheck(data)) {
       // Throw warnings like pytorch.
@@ -107,6 +108,7 @@ class ConsistentTensorEmptyCtorFunctor {
   Maybe<Tensor> operator()(const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
     Shape shape(DimVector{0});
+    JUST(CheckDeviceIdsIsValid(placement));
     return ConsistentTensorWithShapeCtor(shape, placement, sbp_tuple);
   }
 };
@@ -148,6 +150,7 @@ class ConsistentTensorWithDataCtorFunctor {
  public:
   Maybe<Tensor> operator()(PyObject* data, const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     // Treat the single long as shape.
     if (PyLong_Check(data)) {
       int64_t size = PyLong_AsLongLong(data);
@@ -190,6 +193,7 @@ class ConsistentTensorWithShapeCtorFunctor {
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
     // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now.
     LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
+    JUST(CheckDeviceIdsIsValid(placement));
     return functional::ConsistentEmpty(shape, DType::Float(), placement, sbp_tuple);
   }
 };
diff --git a/oneflow/api/python/symbol/placement_symbol.cpp b/oneflow/api/python/symbol/placement_symbol.cpp
index 3547aeb91db..bec528df334 100644
--- a/oneflow/api/python/symbol/placement_symbol.cpp
+++ b/oneflow/api/python/symbol/placement_symbol.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include <algorithm>
 #include <pybind11/pybind11.h>
 #include <pybind11/operators.h>
 #include <pybind11/stl.h>
@@ -30,6 +31,9 @@ limitations under the License.
 #include "oneflow/core/job/placement.cfg.h"
 #include "oneflow/core/job/global_for.h"
 #include "oneflow/core/job/resource_desc.h"
+#ifdef WITH_CUDA
+#include <cuda_runtime.h>
+#endif  // WITH_CUDA
 
 namespace py = pybind11;
 
@@ -37,6 +41,16 @@
 namespace oneflow {
 
 namespace {
 
+int64_t GetGpuDeviceNum() {
+#ifndef WITH_CUDA
+  return 0;
+#else
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  return device_count;
+#endif
+}
+
 Maybe<Shape> MakeShape(const py::tuple& py_shape) {
   DimVector shape_dims{};
   for (const auto& dim : py_shape) { shape_dims.emplace_back(dim.cast<int64_t>()); }
@@ -150,6 +164,12 @@ struct PlacementSymbolExportUtil {
     if (iter == device_tag2placement.end()) {
       int64_t node_size = GlobalProcessCtx::NodeSize();
       int64_t device_num = GlobalProcessCtx::NumOfProcessPerNode();
+      if (device_tag == "gpu") {
+        const int64_t gpu_device_num = GetGpuDeviceNum();
+        CHECK_NE(gpu_device_num, 0)
+            << "Can\'t construct placement with \"cuda\" type because there is no CUDA device!";
+        device_num = std::min(device_num, gpu_device_num);
+      }
       std::vector<std::string> machine_device_ids;
       for (int64_t node_id = 0; node_id < node_size; ++node_id) {
         std::string device_name = std::to_string(node_id) + ":0-" + std::to_string(device_num - 1);
diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp
index 35778560f09..3e43f99428b 100644
--- a/oneflow/core/functional/impl/array_functor.cpp
+++ b/oneflow/core/functional/impl/array_functor.cpp
@@ -130,6 +130,7 @@ class ConsistentConstantFunctor {
   Maybe<Tensor> operator()(const Shape& shape, const Scalar& value, const Symbol<DType>& dtype,
                            const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     MutableAttrMap attrs;
     JUST(attrs.SetAttr("shape", shape));
     JUST(attrs.SetAttr("dtype", dtype->data_type()));
@@ -210,6 +211,7 @@ class ConsistentEmptyFunctor {
   Maybe<Tensor> operator()(const Shape& shape, const Symbol<DType>& dtype,
                            const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     MutableAttrMap attrs;
     JUST(attrs.SetAttr("shape", shape));
     JUST(attrs.SetAttr("dtype", dtype->data_type()));
diff --git a/oneflow/core/functional/impl/consistent_cast.cpp b/oneflow/core/functional/impl/consistent_cast.cpp
index 4d01a497ab4..79545a3fd2b 100644
--- a/oneflow/core/functional/impl/consistent_cast.cpp
+++ b/oneflow/core/functional/impl/consistent_cast.cpp
@@ -294,6 +294,7 @@ class LocalToConsistentFunctor {
                           Symbol<ParallelDesc> parallel_desc,
                           const std::vector<Symbol<cfg::SbpParallel>>& sbp_parallels,
                           const Shape& shape, const Symbol<DType>& dtype) const {
+    JUST(CheckDeviceIdsIsValid(parallel_desc));
     CHECK_OR_RETURN(x->is_local());
     std::shared_ptr<Tensor> input = x;
     // copy to right device first if input's device type is wrong
@@ -336,6 +337,7 @@ class ToConsistentFunctor {
                           Symbol<ParallelDesc> parallel_desc,
                           const std::vector<Symbol<cfg::SbpParallel>>& sbp_parallels,
                           const std::vector<Symbol<cfg::SbpParallel>>& grad_sbp_parallels) const {
+    JUST(CheckDeviceIdsIsValid(parallel_desc));
     std::shared_ptr<Tensor> tensor;
     if (x->is_consistent()) {
       tensor = JUST(ConsistentToConsistent(x, parallel_desc, sbp_parallels, grad_sbp_parallels));
diff --git a/oneflow/core/functional/impl/dataset_functor.cpp b/oneflow/core/functional/impl/dataset_functor.cpp
index 096f58d566d..5a347c958ef 100644
--- a/oneflow/core/functional/impl/dataset_functor.cpp
+++ b/oneflow/core/functional/impl/dataset_functor.cpp
@@ -106,6 +106,7 @@ class ReadOneRecFunctor {
     JUST(attrs.SetAttr("verify_example", verify_example));
 
     if (placement.has_value()) {
+      JUST(CheckDeviceIdsIsValid(JUST(placement)));
       CHECK_OR_RETURN(sbp.has_value()) << "placement is not None, but sbp is None. It's not allowed.";
       AttrMap attrmap(attrs);
diff --git a/oneflow/core/functional/impl/math_functor.cpp b/oneflow/core/functional/impl/math_functor.cpp
index d6873390857..8d702d059a8 100644
--- a/oneflow/core/functional/impl/math_functor.cpp
+++ b/oneflow/core/functional/impl/math_functor.cpp
@@ -622,6 +622,7 @@ class ConsistentEyeFunctor {
                            const Optional<Symbol<DType>>& dtype,
                            const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     MutableAttrMap attrs;
     JUST(attrs.SetAttr("rows", JUST(rows.As<int64_t>())));
     JUST(attrs.SetAttr("cols", JUST(cols.value_or(rows).As<int64_t>())));
@@ -732,6 +733,7 @@ class ConsistentArangeFunctor {
                            const Optional<Symbol<DType>>& dtype,
                            const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     MutableAttrMap attrs;
     if (dtype.has_value()) {
       const DataType range_dtype = JUST(dtype)->data_type();
@@ -781,6 +783,7 @@ class ConsistentArange2Functor {
   Maybe<Tensor> operator()(const Scalar& limit, const Symbol<DType>& dtype,
                            const Symbol<ParallelDesc>& placement,
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     return ConsistentArange(Scalar(0), limit, Scalar(1), dtype, placement, sbp_tuple);
   }
 };
diff --git a/oneflow/core/functional/impl/random_functor.cpp b/oneflow/core/functional/impl/random_functor.cpp
index 59445c9ebb1..d85ddc29704 100644
--- a/oneflow/core/functional/impl/random_functor.cpp
+++ b/oneflow/core/functional/impl/random_functor.cpp
@@ -108,6 +108,7 @@ class ConsistentRandFunctor {
                            const Optional<Symbol<DType>>& dtype,
                            const Optional<one::Generator>& generator,
                            const bool& requires_grad) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     DataType dtype_val = DataType::kFloat;
     if (dtype.has_value()) {
       dtype_val = JUST(dtype)->data_type();
@@ -182,6 +183,7 @@ class ConsistentRandNFunctor {
                            const Optional<Symbol<DType>>& dtype,
                            const Optional<one::Generator>& generator,
                            const bool& requires_grad) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     DataType dtype_val = DataType::kFloat;
     if (dtype) { dtype_val = JUST(dtype)->data_type(); }
     if (dtype_val != DataType::kFloat && dtype_val != DataType::kDouble) {
@@ -269,6 +271,7 @@ class ConsistentRandIntFunctor {
                            const Optional<Symbol<DType>>& dtype,
                            const Optional<one::Generator>& generator,
                            const bool& requires_grad) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     DataType dtype_val = DataType::kInt64;
     if (dtype) { dtype_val = JUST(dtype)->data_type(); }
 
@@ -305,6 +308,7 @@ class ConsistentRandInt2Functor {
                            const Optional<Symbol<DType>>& dtype,
                            const Optional<one::Generator>& generator,
                            const bool& requires_grad) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     return ConsistentRandInt(/*low*/ 0, high, shape, placement, sbp_tuple, dtype, generator,
                              requires_grad);
   }
@@ -344,6 +348,7 @@ class ConsistentRandPermFunctor {
                            const std::vector<Symbol<cfg::SbpParallel>>& sbp_tuple,
                            const Optional<one::Generator>& generator,
                            const Symbol<DType>& dtype, const bool& requires_grad) const {
+    JUST(CheckDeviceIdsIsValid(placement));
     const auto gen = generator.value_or(JUST(one::DefaultAutoGenerator()));
     MutableAttrMap attrs;
     JUST(attrs.SetAttr("n", n));
diff --git a/oneflow/core/job/parallel_desc.cpp b/oneflow/core/job/parallel_desc.cpp
index c375c98ad96..9f36e1d08de 100644
--- a/oneflow/core/job/parallel_desc.cpp
+++ b/oneflow/core/job/parallel_desc.cpp
@@ -13,10 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ +#include #include "oneflow/core/job/parallel_desc.h" #include "oneflow/core/job/placement.cfg.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/common/util.h" +#include "oneflow/core/common/multi_client.h" +#include "oneflow/core/common/cpp_attribute.h" #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/id_manager.h" #include "oneflow/core/control/global_process_ctx.h" @@ -24,11 +27,24 @@ limitations under the License. #include "oneflow/core/framework/instructions_builder.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/vm/vm_util.h" +#ifdef WITH_CUDA +#include +#endif // WITH_CUDA namespace oneflow { namespace { +int64_t GetGpuDeviceNum() { +#ifndef WITH_CUDA + return 0; +#else + int device_count = 0; + cudaGetDeviceCount(&device_count); + return device_count; +#endif +} + using MachineId2DeviceIdList = std::shared_ptr>>>; @@ -302,6 +318,34 @@ Maybe ParallelDesc::CheckWithResourceDesc(const ResourceDesc& resource_des return Maybe::Ok(); } +Maybe ParallelDesc::CheckDeviceIdsIsValid() const { + if (likely(JUST(IsMultiClient()))) { + const auto& sorted_dev_phy_ids_iter = + machine_id2sorted_dev_phy_ids_->find(GlobalProcessCtx::Rank()); + if (sorted_dev_phy_ids_iter != machine_id2sorted_dev_phy_ids_->end()) { + for (int64_t dev_phy_id : *sorted_dev_phy_ids_iter->second) { + if (device_type_ == DeviceType::kCUDA) { + const int64_t gpu_device_num = GetGpuDeviceNum(); + CHECK_NE_OR_RETURN(gpu_device_num, 0) + << "Placment with \"cuda\" type is invalid because there is no CUDA device!"; + int64_t device_num = std::min(GlobalProcessCtx::NumOfProcessPerNode(), gpu_device_num); + CHECK_LT_OR_RETURN(dev_phy_id, device_num) + << "Placment is invalid because device id must be less than " + << (gpu_device_num < GlobalProcessCtx::NumOfProcessPerNode() + ? 
"num of CUDA devices on node" + : "num of process per node"); + } else if (device_type_ == DeviceType::kCPU) { + CHECK_LT_OR_RETURN(dev_phy_id, GlobalProcessCtx::NumOfProcessPerNode()) + << "Placment is invalid because device id must be less than num of process per node"; + } else { + OF_UNIMPLEMENTED(); + } + } + } + } + return Maybe::Ok(); +} + ParallelConf ParallelDesc::GetParallelIdOnlyParallelConf(int64_t parallel_id) const { ParallelConf parallel_conf; std::string rank = std::to_string(CHECK_JUST(MachineId4ParallelId(parallel_id))); @@ -456,6 +500,11 @@ Maybe> RawTxtStringToPlacement(const std::string& parallel_ return SymbolOf(ParallelDesc(parallel_conf)); } +Maybe RawCheckDeviceIdsIsValid(Symbol placement) { + JUST(placement->CheckDeviceIdsIsValid()); + return Maybe::Ok(); +} + } // namespace decltype(GetParallelId4CurrentProcessCtx) GetParallelId4CurrentProcessCtx = @@ -467,5 +516,7 @@ decltype(PlacementToString) PlacementToString = DECORATE(&RawPlacementToString, decltype(GetTensorDevice) GetTensorDevice = DECORATE(&RawGetTensorDevice, ThreadLocal); decltype(TxtStringToPlacement) TxtStringToPlacement = DECORATE(&RawTxtStringToPlacement, ThreadLocalCopiable); +decltype(CheckDeviceIdsIsValid) CheckDeviceIdsIsValid = + DECORATE(&RawCheckDeviceIdsIsValid, ThreadLocal); } // namespace oneflow diff --git a/oneflow/core/job/parallel_desc.h b/oneflow/core/job/parallel_desc.h index 4b43fba0d85..9848334950a 100644 --- a/oneflow/core/job/parallel_desc.h +++ b/oneflow/core/job/parallel_desc.h @@ -107,6 +107,7 @@ class ParallelDesc final { std::shared_ptr cfg_parallel_conf() const { return cfg_parallel_conf_; } bool TryGetParallelId(int64_t machine_id, int64_t device_id, int64_t* parallel_id) const; + Maybe CheckDeviceIdsIsValid() const; private: friend Maybe ParseMachineAndDeviceIdList(const ParallelConf& parallel_conf); @@ -149,6 +150,7 @@ extern Maybe> (*ReplaceDeviceType)(Symbol, De extern Maybe (*PlacementToString)(Symbol placement); extern Maybe> (*GetTensorDevice)(Symbol parallel_desc); extern Maybe> (*TxtStringToPlacement)(const std::string& parallel_conf_str); +extern Maybe (*CheckDeviceIdsIsValid)(Symbol placement); inline bool operator==(const ParallelConf& lhs, const ParallelConf& rhs) { return ParallelDesc(lhs) == ParallelDesc(rhs);