From 9b8664cc441fa544b5a690b027dfb21e8b2f3f83 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Sun, 5 Jun 2022 20:13:45 +0800 Subject: [PATCH 01/45] Add a slight cost for B->S and B->P in 2d sbp --- oneflow/core/framework/sbp_infer_util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index a6ccd134267..fedf419ebc1 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -126,7 +126,7 @@ double ComputCopyCostBetweenTwoDiffSbpParallel(const SbpParallel& producer_sbp_p } if (on_same_devices) { // B->S, B->P - if (producer_sbp_parallel.has_broadcast_parallel()) { return 0; } + if (producer_sbp_parallel.has_broadcast_parallel()) { return 1; } // has S if (consumer_sbp_parallel.has_split_parallel() || producer_sbp_parallel.has_split_parallel()) { if (consumer_sbp_parallel.has_split_parallel() From c32adbbc7cc79bfc58ff42db29dcee737f64a891 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Mon, 6 Jun 2022 22:48:54 +0800 Subject: [PATCH 02/45] Add penalty for P in consumer --- oneflow/core/framework/sbp_infer_util.cpp | 55 ++++++++++++++++++++--- oneflow/core/framework/sbp_infer_util.h | 6 +++ 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index fedf419ebc1..dd39f85fe1e 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -43,6 +43,18 @@ bool CheckNdSbp(const NdSbp& nd_sbp) { return true; } +double Penalty4PartialInConsumer(double logical_blob_size, int32_t producer_parallel_num, + int32_t consumer_parallel_num) { + static const int64_t PartialInConsumerType = ParseIntegerFromEnv("PartialInConsumerTag", 2); + if (PartialInConsumerType == PartialInConsumerTag::kSlight) { + return 1.0; + } else if (PartialInConsumerType == PartialInConsumerTag::kMiddle) { + return 4 * logical_blob_size * (producer_parallel_num + consumer_parallel_num); + } else { + return kUnsupportedBoxing; + } +} + Maybe ComputCopyCostBetweenTwoSbpParallel(const SbpParallel& producer_sbp_parallel, const SbpParallel& consumer_sbp_parallel, const BlobDesc& logical_blob_desc, @@ -65,15 +77,19 @@ Maybe ComputCopyCostBetweenTwoSbpParallel(const SbpParallel& producer_sb if (producer_parallel_desc == consumer_parallel_desc) { // Same sbp, no cost: S->S, B->B, P->P if (producer_sbp_parallel == consumer_sbp_parallel) { return 0.0; } - // B->S, B->P - if (producer_sbp_parallel.has_broadcast_parallel()) { return 1.0; } + double logical_blob_size = + logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); // S->P for eager. It should be 0 as well. // NOTE: Similar to B->P, we just make the other part to be 0. You can consider P as S(i) for an // arbitrary i. - if (consumer_sbp_parallel.has_partial_sum_parallel()) { return 1.0; } + // ? 
-> P + if (consumer_sbp_parallel.has_partial_sum_parallel()) { + return Penalty4PartialInConsumer(logical_blob_size, producer_parallel_desc.parallel_num(), + consumer_parallel_desc.parallel_num()); + } + // B->S + if (producer_sbp_parallel.has_broadcast_parallel()) { return 1.0; } - double logical_blob_size = - logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); // has S if (consumer_sbp_parallel.has_split_parallel() || producer_sbp_parallel.has_split_parallel()) { if (consumer_sbp_parallel.has_split_parallel() @@ -108,7 +124,13 @@ Maybe ComputCopyCostBetweenTwoSbpParallel(const SbpParallel& producer_sb if (producer_sbp_parallel.has_partial_sum_parallel()) { overall_cost += (producer_parallel_desc.parallel_num() - 1) * logical_blob_size; } - // For B->P, B->S, S->S, overall_cost == logical_blob_size; + // ? -> P + if (consumer_sbp_parallel.has_partial_sum_parallel()) { + overall_cost += + Penalty4PartialInConsumer(logical_blob_size, producer_parallel_desc.parallel_num(), + consumer_parallel_desc.parallel_num()); + } + // For B->S, S->S, overall_cost == logical_blob_size; return overall_cost; } } @@ -125,7 +147,11 @@ double ComputCopyCostBetweenTwoDiffSbpParallel(const SbpParallel& producer_sbp_p return kUnsupportedBoxing; } if (on_same_devices) { - // B->S, B->P + // B->P + if (consumer_sbp_parallel.has_partial_sum_parallel()) { + return Penalty4PartialInConsumer(logical_blob_size, parallel_num, parallel_num); + } + // B->S if (producer_sbp_parallel.has_broadcast_parallel()) { return 1; } // has S if (consumer_sbp_parallel.has_split_parallel() || producer_sbp_parallel.has_split_parallel()) { @@ -151,6 +177,9 @@ double ComputCopyCostBetweenTwoDiffSbpParallel(const SbpParallel& producer_sbp_p if (producer_sbp_parallel.has_partial_sum_parallel()) { overall_cost += logical_blob_size * (parallel_num - 1); } + if (consumer_sbp_parallel.has_partial_sum_parallel()) { + overall_cost += Penalty4PartialInConsumer(logical_blob_size, parallel_num, parallel_num); + } // For B->P, B->S, S->S, overall_cost == logical_blob_size; return overall_cost; } @@ -264,6 +293,12 @@ Maybe ComputeEagerCopyCostBetweenNdSbp(const NdSbp& producer_sbp_paralle // TODO: Fix that after support all sbp combination for eager. 
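      // For reference, Penalty4PartialInConsumer (defined above) takes its magnitude from the
      // "PartialInConsumerTag" environment variable, which defaults to kMiddle (2):
      //   kSlight -> a constant 1.0, a mere tie-breaker between otherwise equal sbp choices;
      //   kMiddle -> 4 * logical_blob_size * (producer_parallel_num + consumer_parallel_num),
      //              e.g. a 1 MiB blob between two 4-device groups adds 4 * 1 MiB * (4 + 4) = 32 MiB
      //              of virtual cost, enough to steer the strategy search away from P in the consumer;
      //   kStrict -> kUnsupportedBoxing, which forbids any transfer into P.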
total_cost += JUST(ComputCopyCostBetweenTwoSbpParallel( in_sbp, out_sbp, logical_blob_desc, reduced_in_parallel_desc, reduced_out_parallel_desc)); + // Add the penalty for P in the consumer + if (out_sbp.has_partial_sum_parallel() && (in_sbp != out_sbp)) { + total_cost += Penalty4PartialInConsumer( + logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()), + producer_parallel_desc.parallel_num(), consumer_parallel_desc.parallel_num()); + } // detect the cases that splits the same dimension before this splitting if (normal_case && in_sbp.has_split_parallel() && in_sbp == out_sbp) { for (int32_t j = 0; j < i; j++) { @@ -302,6 +337,12 @@ Maybe ComputeEagerCopyCostBetweenNdSbp(const NdSbp& producer_sbp_paralle if (reduced_out_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { out_cost *= reduced_out_parallel_desc.hierarchy()->At(i); } + // Add the penalty for P in the consumer + if (reduced_out_nd_sbp.sbp_parallel(i).has_partial_sum_parallel()) { + total_cost += + Penalty4PartialInConsumer(logical_blob_size, producer_parallel_desc.parallel_num(), + consumer_parallel_desc.parallel_num()); + } } total_cost += logical_blob_size * out_cost; } diff --git a/oneflow/core/framework/sbp_infer_util.h b/oneflow/core/framework/sbp_infer_util.h index 63fd1333523..401810e6d11 100644 --- a/oneflow/core/framework/sbp_infer_util.h +++ b/oneflow/core/framework/sbp_infer_util.h @@ -27,6 +27,12 @@ enum SbpInferRuleTag : int { kMinCost = 3 // Lowest cost }; +enum PartialInConsumerTag : int { + kSlight = 1, // Slight penalty + kMiddle = 2, // Make sure we do not select P in the consumer + kStrict = 3 // Not allow a transfer to P +}; + double GetValidMaxCopyCost(); double GetTransferCost(); From 403d4297876222edcb6ffa8048eb5bdfd5cb5d9f Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Tue, 7 Jun 2022 18:18:47 +0800 Subject: [PATCH 03/45] Fix a slight bug --- oneflow/core/auto_parallel/boxing_collector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index d28d696cf44..57b568a9611 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -49,7 +49,7 @@ void DfsSetNdSbp(const std::vector<::oneflow::SbpParallel>& id2sbp_parallel, int } // Let a nd sbp be consistent with the given hierarchy number -Maybe SetNdSbpDim(NdSbp nd_sbp, int32_t hierarchy_num) { +Maybe SetNdSbpDim(const NdSbp& nd_sbp, int32_t hierarchy_num) { // Do not need to change if (nd_sbp.sbp_parallel_size() == hierarchy_num) { return nd_sbp; } // (S0, S0) -> S0 From 3ff0d595be8c9f6e28e53dda0a97722579df6e15 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Wed, 8 Jun 2022 00:09:43 +0800 Subject: [PATCH 04/45] Add at most 1 middle node for general basic communication --- .../core/auto_parallel/boxing_collector.cpp | 129 +++++++++++++++--- oneflow/core/auto_parallel/boxing_collector.h | 9 ++ 2 files changed, 119 insertions(+), 19 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 57b568a9611..9f18a8073e9 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -71,6 +71,16 @@ Maybe SetNdSbpDim(const NdSbp& nd_sbp, int32_t hierarchy_num) { return new_sbp; } +int32_t TotalNumSplit(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc) { + int32_t total_num_split = 1; + for (int32_t i = 0; i < nd_sbp.sbp_parallel_size(); 
i++) { + if (nd_sbp.sbp_parallel(i).has_split_parallel()) { + total_num_split *= parallel_desc.hierarchy()->At(i); + } + } + return total_num_split; +} + } // namespace // A constructor with init, designed for uncustomized boxing collector @@ -496,25 +506,6 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const if (ParseBooleanFromEnv("ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK", false)) { return Maybe::Ok(); } - // If compute_cost==false + 2D sbp + same placment + nccl logical + not (p->b), - // Use nccl logical send recv instead of middle node. - // Note that in op sbp inference, cost of middle nodes is still used for the moment. -#ifdef WITH_CUDA - if (compute_cost == false && producer_parallel_desc.hierarchy()->NumAxes() == 2 - && producer_parallel_desc == consumer_parallel_desc - && !(NdSbpHasPartialParallel(sbp_consumer)) && - // TODO(): When same dim 0 finished dealing with (*, P) -> (*, S) in nccl logical pass, open - // this condition. When dealing with (P, P) -> (B, S0), middle node will change it to (P, P) - // -> (P, S0) -> (B, S0), neither same dim 0 or send recv in nccl logical pass can deal with - // (P, P) -> (P, S0) at the moment. - // !(NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) && - Global::Get()->nccl_use_compute_stream()) { - VLOG(3) << "Middle node insertion is skipped when src sbp is " << NdSbpToString(sbp_producer) - << " dst sbp is " << NdSbpToString(sbp_consumer) - << ", because nccl logical send/recv can handle this."; - return Maybe::Ok(); - } -#endif // WITH_CUDA // Dealing with 1D sbp to 1D sbp // Specifically, S -> P. @@ -568,6 +559,22 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const // Transfer for the same machines, devices and hierarchy. 
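  // For example, with parallel hierarchy [2, 3]:
  //   (S(0), S(1)) gives TotalNumSplit = 2 * 3 = 6,
  //   (S(0), B)    gives TotalNumSplit = 2,
  //   (P, B)       gives TotalNumSplit = 1.
  // When producer and consumer have the same parallel_num, the general basic communication path
  // below keeps the intermediate all-split sbp close to whichever side has the larger
  // TotalNumSplit, i.e. the side already holding the smaller per-device slices.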
if (sbp_producer == sbp_consumer) { return Maybe::Ok(); } const auto& parallel_hierarchy = producer_parallel_desc.hierarchy(); + +#ifdef WITH_CUDA + // Use a general basic communication if no P in the consumer + if ((!NdSbpHasPartialParallel(sbp_consumer))) { + if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { + // (?, P, ?)->(Si, Sj)->(?, B, ?), two-step transfer + JUST(AskSbpCombination4GeneralBasicCommunication( + sbp_producer, sbp_consumer, logical_blob_desc, producer_parallel_desc, + consumer_parallel_desc, middle_sbps, diag_node_pos)); + } else { + // one-step transfer + return Maybe::Ok(); + } + } +#endif // WITH_CUDA + *diag_node_pos = 0; // Dealing with nD sbp, n>2 if (parallel_hierarchy->NumAxes() > 2) { @@ -1007,4 +1014,88 @@ Maybe BoxingCollector::FilterNdSbpList4LogicalShape(const BlobDesc& logica return Maybe::Ok(); } +// Ask for sbp combination for general basic communication +Maybe BoxingCollector::AskSbpCombination4GeneralBasicCommunication( + const NdSbp& sbp_producer, const NdSbp& sbp_consumer, const BlobDesc& logical_blob_desc, + const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, + std::vector& middle_sbps, int32_t* diag_node_pos) { + bool close2producer = true; + if (producer_parallel_desc.parallel_num() == consumer_parallel_desc.parallel_num()) { + // Get close to the one with more splits + close2producer = TotalNumSplit(sbp_producer, producer_parallel_desc) + > TotalNumSplit(sbp_consumer, consumer_parallel_desc); + } else { + // Get close to the one with more machines + close2producer = producer_parallel_desc.parallel_num() > consumer_parallel_desc.parallel_num(); + } + // Get the contiguous sbp + if (close2producer) { + JUST(AskCloseAllSplitSbp(sbp_producer, producer_parallel_desc, logical_blob_desc, middle_sbps)); + *diag_node_pos = 1; + } else { + JUST(AskCloseAllSplitSbp(sbp_consumer, consumer_parallel_desc, logical_blob_desc, middle_sbps)); + *diag_node_pos = 0; + } + return Maybe::Ok(); +} + +// Ask for a all-split sbp which is close to the original one +Maybe BoxingCollector::AskCloseAllSplitSbp(const NdSbp& nd_sbp, + const ParallelDesc& parallel_desc, + const BlobDesc& logical_blob_desc, + std::vector& middle_sbps) { + Shape remain_shape = logical_blob_desc.shape(); + Shape rest_split_shape = logical_blob_desc.shape(); + int32_t dim_shape = remain_shape.NumAxes(); + // Initialize the remains and splitting + // logical_blob_desc.shape() == remain_shape .* rest_split_shape; + for (int32_t i = 0; i < dim_shape; i++) { rest_split_shape.Set(i, 1); } + for (int32_t sbp_id = 0; sbp_id < nd_sbp.sbp_parallel_size(); sbp_id++) { + const auto& sbp = nd_sbp.sbp_parallel(sbp_id); + if (sbp.has_split_parallel()) { + int32_t axis = sbp.split_parallel().axis(); + int32_t split_num = parallel_desc.hierarchy()->At(sbp_id); + remain_shape.Set(axis, remain_shape.At(axis) / split_num); + rest_split_shape.Set(axis, rest_split_shape.At(axis) * split_num); + } + } + // Get the contiguous sbp + NdSbp new_sbp = nd_sbp; + for (int32_t sbp_id = 0; sbp_id < nd_sbp.sbp_parallel_size(); sbp_id++) { + const auto& sbp = nd_sbp.sbp_parallel(sbp_id); + int32_t split_num = parallel_desc.hierarchy()->At(sbp_id); + if (sbp.has_split_parallel()) { + int32_t axis = sbp.split_parallel().axis(); + // split shape is the total splitting number starting from sbp_id to the end + rest_split_shape.Set(axis, rest_split_shape.At(axis) / split_num); + } else { + // change P or B to S(axis) + int32_t axis = -1; + // 4096 is large 
enough, we might not have that much devices + int32_t min_split_num = 4096; + // We need to pick a suitable axis + for (int32_t i = 0; i < remain_shape.NumAxes(); i++) { + if (remain_shape.At(i) % split_num > 0) { + if (rest_split_shape.At(i) < min_split_num) { + // Pick the axis with smallest splitting number among the rest of the sbp + min_split_num = rest_split_shape.At(i); + axis = i; + } + } + } + // P, B -> S(axis) + if (axis >= 0) { + new_sbp.mutable_sbp_parallel(sbp_id)->mutable_split_parallel()->set_axis(axis); + remain_shape.Set(axis, remain_shape.At(axis) / split_num); + } else { + // Can not find a suitable contiguous sbp + return Maybe::Ok(); + } + } + } + // Add the new sbp into the middle node lists + middle_sbps.emplace_back(new_sbp); + return Maybe::Ok(); +} + } // namespace oneflow diff --git a/oneflow/core/auto_parallel/boxing_collector.h b/oneflow/core/auto_parallel/boxing_collector.h index 09ddfd48f13..c0fda578dd3 100644 --- a/oneflow/core/auto_parallel/boxing_collector.h +++ b/oneflow/core/auto_parallel/boxing_collector.h @@ -129,6 +129,15 @@ class BoxingCollector final { BoxingCollector* boxing_collector_producer, BoxingCollector* boxing_collector_consumer, const std::vector>& diag_nodes); + // Ask for sbp combination for general basic communication + Maybe AskSbpCombination4GeneralBasicCommunication( + const NdSbp& sbp_producer, const NdSbp& sbp_consumer, const BlobDesc& logical_blob_desc, + const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, + std::vector& middle_sbps, int32_t* diag_node_pos); + // Ask for a all-split sbp which is closed to the original one + Maybe AskCloseAllSplitSbp(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc, + const BlobDesc& logical_blob_desc, + std::vector& middle_sbps); // Stores all the possible SbpParallel. HashMap<::oneflow::SbpParallel, int32_t> sbp_parallel_universe_; // Relationship between id and Sbp Parallel From 998f883e5987d5c218d08817ab8c9c3eeea2cd89 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 9 Jun 2022 00:15:53 +0800 Subject: [PATCH 05/45] Add the cost for general basic communication --- oneflow/core/framework/sbp_infer_util.cpp | 138 ++++++++++++++++++++++ oneflow/core/framework/sbp_infer_util.h | 7 ++ 2 files changed, 145 insertions(+) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index dd39f85fe1e..bd7f60c5931 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -20,7 +20,9 @@ limitations under the License. 
#include "oneflow/core/boxing/eager_boxing_interpreter_mgr.h" #include "oneflow/core/common/util.h" #include "oneflow/core/job/lazy_mode.h" +#include "oneflow/core/job/nd_sbp_util.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/job/sbp_parallel.pb.h" namespace oneflow { @@ -55,6 +57,25 @@ double Penalty4PartialInConsumer(double logical_blob_size, int32_t producer_para } } +int32_t Ratio4Sbp(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc, + std::function classifier) { + int32_t ratio = 1; + for (int32_t sbp_id = 0; sbp_id < nd_sbp.sbp_parallel_size(); sbp_id++) { + if (classifier(nd_sbp.sbp_parallel(sbp_id))) { ratio *= parallel_desc.hierarchy()->At(sbp_id); } + } + return ratio; +} + +int32_t PartialRatio4Producer(const NdSbp& sbp_producer, + const ParallelDesc& producer_parallel_desc) { + return Ratio4Sbp(sbp_producer, producer_parallel_desc, &SbpParallel::has_partial_sum_parallel); +} + +int32_t BroadcastRatio4Consumer(const NdSbp& sbp_consumer, + const ParallelDesc& consumer_parallel_desc) { + return Ratio4Sbp(sbp_consumer, consumer_parallel_desc, &SbpParallel::has_broadcast_parallel); +} + Maybe ComputCopyCostBetweenTwoSbpParallel(const SbpParallel& producer_sbp_parallel, const SbpParallel& consumer_sbp_parallel, const BlobDesc& logical_blob_desc, @@ -534,6 +555,21 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, bool requires_same_sbp) { + // In 90% of the transfer, we would have the same parallel description for producer and consumer + // We need to speed it up and give an approximation of the cost + if (producer_parallel_desc == consumer_parallel_desc) { + if (producer_sbp_parallel == consumer_sbp_parallel) { return 0.0; } +#ifdef WITH_CUDA + // Use a general basic communication if no P in the consumer + if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { + return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, + producer_parallel_desc, consumer_parallel_desc) + * logical_blob_desc.shape().elem_cnt() + * GetSizeOfDataType(logical_blob_desc.data_type()); + } +#endif // WITH_CUDA + } + // Initialize boxing collector constexpr int32_t kRegularMaxSplitAxes = 6; static thread_local BoxingCollector boxing_collector(kRegularMaxSplitAxes); @@ -614,4 +650,106 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, } } +// The transfer ratio for general basic communication +// Cost = ratio * data amount +double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, + const NdSbp& consumer_sbp_parallel, + const ParallelDesc& producer_parallel_desc, + const ParallelDesc& consumer_parallel_desc) { + // The upper bound of the amount of the transferred data + int32_t producer_partial_ratio = + PartialRatio4Producer(producer_sbp_parallel, producer_parallel_desc); + int32_t consumer_broadcast_ratio = + BroadcastRatio4Consumer(consumer_sbp_parallel, consumer_parallel_desc); + // approximate intersection ratio + double intersection_ratio = 1.0; + // (?, P, ?)->(Si, Sj)->(?, B, ?), two-step transfer + if (producer_partial_ratio > 1 && consumer_broadcast_ratio > 1) { + // Pure P in the producer or B in the consumer + // (P, P, P) -> ? or ? 
-> (B, B) + if (producer_partial_ratio == producer_parallel_desc.parallel_num() + || consumer_broadcast_ratio == consumer_parallel_desc.parallel_num()) { + // There some cases which is not applicable to this ratio + // We just take the one with the largest possibility + // For example: (P, S0) -> (B, B) for 1-D blob with machine hierarchy [n, m] + // The path should be (P, S0) -> (S0, S0) -> (B, B) + // true intersection ratio = 1/m + 1 + intersection_ratio = 2.0; + } else { + // sbp_consumer = (B, Si) or (Si, B) + for (int32_t sbp_id = 0; sbp_id < consumer_sbp_parallel.sbp_parallel_size(); sbp_id++) { + if (consumer_sbp_parallel.sbp_parallel(sbp_id).has_split_parallel()) { + const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel(sbp_id); + // (B, P) or (Si, P) -> (Si, B) + // (P, B) or (P, Si) -> (B, Si) + if (producer_sbp4sbp_id.has_broadcast_parallel() + || producer_sbp4sbp_id == consumer_sbp_parallel.sbp_parallel(sbp_id)) { + intersection_ratio = 2.0; + break; + } + } + } + // Judge whether the intersection ratio is given a value (2.0) + if (intersection_ratio == 1.0) { + // The true intersection ratio range from 0 to 2, + // we just take a middle point of the range as the approximation + // For example: (P, S0) -> (S0, B), Path: (P, S0) -> (S1, S0) -> (S0, B) + // true intersection ratio = 1 + 1/m + // For example: (P, S0) -> (S1, B), Path: (P, S0) -> (S1, S0) -> (S1, B) + // true intersection ratio = 1 + 1 + // For example: (P, S0) -> (B, S0), with a 1D blob + // true intersection ratio = (n+p-1)/nm + (n+p-1)/nm + // For example: (S0, P) -> (B, S0), Path: (S0, P) -> (S0, S1) -> (B, S0) + // true intersection ratio = 1 + 1/n + + // We use the approximation 1 + (1/n + 1/m)/2 + intersection_ratio = 1.0 + 0.5 / producer_parallel_desc.hierarchy()->At(0) + + 0.5 / producer_parallel_desc.hierarchy()->At(1); + } + } + } else { + // No P in the producer or no B in the consumer, one-step transfer + // The intersection ratio is design for two steps. + // However, we only have one step here, we would increase the ratio by 1.0 + // to eliminate the unused step + const auto& parallel_hierarchy = producer_parallel_desc.hierarchy(); + for (int32_t sbp_id = 0; sbp_id < consumer_sbp_parallel.sbp_parallel_size(); sbp_id++) { + const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel(sbp_id); + const auto& consumer_sbp4sbp_id = consumer_sbp_parallel.sbp_parallel(sbp_id); + // ? 
-> Si + if (consumer_sbp4sbp_id.has_split_parallel()) { + // Sj -> Si + if (producer_sbp4sbp_id.has_split_parallel() + && producer_sbp4sbp_id != consumer_sbp4sbp_id) { + intersection_ratio /= parallel_hierarchy->At(sbp_id); + } + } else { + // B/P -> B + if (!producer_sbp4sbp_id.has_split_parallel()) { + intersection_ratio *= parallel_hierarchy->At(sbp_id); + } + } + // For B/P/Si -> Si and Si -> B + // intersection ratio remains the same + } + // With the approximation above, + // (S1, S0) -> (S0, S0) would have an approximate intersection ratio 1/n + // (B, S0) -> (S0, S0) would have an approximate intersection ratio 1 + // However, their actual intersection ratios are (n+p-1)/(n^2*m) and (n+p-1)/(nm), respectively + // We add a patch for this approximation, making them 1/nm and 1/m respectively + if (producer_sbp_parallel.sbp_parallel(0) != consumer_sbp_parallel.sbp_parallel(0) + && producer_sbp_parallel.sbp_parallel_size() >= 2) { + const auto& producer_sbp_parallel_1 = producer_sbp_parallel.sbp_parallel(1); + if (producer_sbp_parallel_1 == consumer_sbp_parallel.sbp_parallel(1) + && producer_sbp_parallel_1.has_split_parallel() + && (producer_sbp_parallel_1 == producer_sbp_parallel.sbp_parallel(0) + || producer_sbp_parallel_1 == consumer_sbp_parallel.sbp_parallel(0))) { + intersection_ratio /= parallel_hierarchy->At(1); + } + } + } + // Subtract the intersection part + return producer_partial_ratio + consumer_broadcast_ratio - intersection_ratio; +} + } // namespace oneflow diff --git a/oneflow/core/framework/sbp_infer_util.h b/oneflow/core/framework/sbp_infer_util.h index 401810e6d11..e9c30cc4c8c 100644 --- a/oneflow/core/framework/sbp_infer_util.h +++ b/oneflow/core/framework/sbp_infer_util.h @@ -88,6 +88,13 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, bool requires_same_sbp); +// The transfer ratio for general basic communication +// Cost = ratio * data amount +double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, + const NdSbp& consumer_sbp_parallel, + const ParallelDesc& producer_parallel_desc, + const ParallelDesc& consumer_parallel_desc); + } // namespace oneflow #endif // ONEFLOW_CORE_FRAMEWORK_SBP_INFER_UTIL_H_ From e53ffbcfc592de9a96250b8d3becdc85719e29cd Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 9 Jun 2022 00:34:18 +0800 Subject: [PATCH 06/45] Add the slight penalty for eager --- oneflow/core/framework/sbp_infer_util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index dd39f85fe1e..eb74ccbb812 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -275,7 +275,7 @@ Maybe ComputeEagerCopyCostBetweenNdSbp(const NdSbp& producer_sbp_paralle reduced_in_parallel_desc, reduced_out_parallel_desc); } - double total_cost = 0.0; + double total_cost = 1.0; if (reduced_in_parallel_desc == reduced_out_parallel_desc) { // NOTE: After analysis, transfer cost increase if spliting the same dimension. 
// Example 1: (S(1), S(0), S(1), S(0)) -> (S(0), S(0), S(0), S(0)) From 0373803312b503016605b8397789b7c1ed03817e Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 9 Jun 2022 22:11:17 +0800 Subject: [PATCH 07/45] Skip initialization of boxing collector if not needed --- oneflow/core/framework/sbp_infer_util.cpp | 11 ++++++ .../job_rewriter/boxing_with_middle_nodes.cpp | 34 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 410468c9683..095c2027d97 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -428,6 +428,17 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel double logical_blob_size = logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); + +#ifdef WITH_CUDA + // Use a general basic communication if no P in the consumer + if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { + return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, + producer_parallel_desc, consumer_parallel_desc) + * logical_blob_desc.shape().elem_cnt() + * GetSizeOfDataType(logical_blob_desc.data_type()); + } +#endif // WITH_CUDA + bool on_same_devices = reduced_in_parallel_desc.EqualsIgnoringHierarchy(reduced_out_parallel_desc); diff --git a/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp b/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp index 91ed0f77f87..d70068c9941 100644 --- a/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp +++ b/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp @@ -14,8 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/job_rewriter/boxing_with_middle_nodes.h" +#include "oneflow/core/common/just.h" #include "oneflow/core/common/util.h" #include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/framework/sbp_infer_util.h" #include "oneflow/core/job/job_desc.h" #include "oneflow/core/common/protobuf.h" #include "oneflow/core/auto_parallel/boxing_collector.h" @@ -23,11 +25,43 @@ limitations under the License. 
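// A worked example of the shortcut added to ComputeLazyCopyCostBetweenNdSbp above (illustrative
// numbers, assuming producer and consumer share a [2, 2] hierarchy): for (P, S(0)) -> (B, S(0)),
// PartialRatio4Producer = 2 and BroadcastRatio4Consumer = 2; the matching S(0) components give an
// intersection ratio of 2.0, so Ratio4GeneralBasicCommunication returns 2 + 2 - 2 = 2 and the
// estimated cost is 2 * elem_cnt * sizeof(dtype), i.e. about twice the logical blob size, instead
// of falling through to the general per-dimension estimate.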
namespace oneflow { +namespace { +bool NeedBoxingCollector(const OpGraph& op_graph) { + bool need_boxing_collector = false; + op_graph.ForEachNode([&](const OpNode* node) { + if (need_boxing_collector) { return; } + OperatorConf::OpTypeCase op_type_case = node->op().op_conf().op_type_case(); + if (IsClassRegistered(op_type_case)) { return; } + for (const std::string& ibn : node->op().input_bns()) { + const LogicalBlobId& lbi = node->op().BnInOp2Lbi(ibn); + const OpNode& producer = node->ProducerOpNode4Lbi(lbi); + const NdSbp& producer_nd_sbp = producer.NdSbp4Lbi(lbi); + const NdSbp& consumer_nd_sbp = node->NdSbp4BnInOp(ibn); + // If dealing with different placement + if (producer.parallel_desc().parallel_num() != 1 + || node->parallel_desc().parallel_num() != 1) { + const auto& logical_blob_desc = producer.LogicalBlobDesc4Lbi(lbi); + if (CHECK_JUST(ComputeLazyCopyCostBetweenNdSbp(producer_nd_sbp, consumer_nd_sbp, + logical_blob_desc, producer.parallel_desc(), + node->parallel_desc(), + /*requires_same_sbp=*/false)) + > GetValidMaxCopyCost()) { + need_boxing_collector = true; + return; + } + } + } + }); + return need_boxing_collector; +} +} // namespace + Maybe BoxingWithMiddleNodes(const OpGraph& op_graph, JobBuilder* job_builder) { // Not allowed two-step boxing and disable checking for debugging if (ParseBooleanFromEnv("ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK", false)) { return Maybe::Ok(); } + if (!NeedBoxingCollector(op_graph)) { return Maybe::Ok(); } // Initialize boxing collector BoxingCollector boxing_collector; // We assemble the boxing table from S(0) to S(5). From e7164a7e57d3022baac6281ac597017ddc2a4baa Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 9 Jun 2022 22:13:46 +0800 Subject: [PATCH 08/45] Fix a bug --- oneflow/core/auto_parallel/boxing_collector.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 9f18a8073e9..ec7456e060e 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -568,10 +568,9 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const JUST(AskSbpCombination4GeneralBasicCommunication( sbp_producer, sbp_consumer, logical_blob_desc, producer_parallel_desc, consumer_parallel_desc, middle_sbps, diag_node_pos)); - } else { - // one-step transfer - return Maybe::Ok(); } + // Otherwise, one-step transfer + return Maybe::Ok(); } #endif // WITH_CUDA From 2b16f1b5324b327eebfa2f0db6c768630e030100 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 23 Jun 2022 11:24:16 +0800 Subject: [PATCH 09/45] Dev nd nccl send recv boxing (#8467) * nd nccl_send_recv_boxing * rm print * support num_axes > 2 * Add distributed optional run (#8372) * Add * change deps * add install * add skip * autoprof supports bandwidth (#8367) * autoprof supports bandwidth Signed-off-by: daquexian * print bandwidth Signed-off-by: daquexian * auto format by CI Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * remove tmp buffer of cumprod cpu backward kernel (#8369) * remove tmp buffer of cumprod cpu backward kernel * refine * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Move tensor api to cpython part3 (#8342) * add tensor_functions * concat py methods * add hash, restore tensor.py * check replacement * refine code, remove commented tensor.py * refine code * move some api * add cpu and 
cuda api * add triu tril norm and etc. * remove tensor_functions.h * move more api * move more api, refine size * fix typo * format code, remove useless include * refine code * refine code, fix typo * align .cuda to python * refine code * split some api to part3 for review * remove positional only arguments of argmax and argmin * remove arguments parse * modify arguments name in matmul and floor_divide * rename BINARY_FUNC to DIRECT_PASS_FUNC, modify some functions * refine code, format code * add inplace /=, add comments * remove name in macros * remove python api * remove redundant include * remove cout * format code * refactor tensor.size by directly call shape.at, refactor tensor.sub_ by calling nb_sub_ * remove redundant code * auto format by CI * fix typo, fix wrong call * modify idx datatype from int32 to int64 in tensor.size * add some DIRECT_PASS_FUNC * add cpu cuda var pow and etc. * add masked_fill any all * make REDUCE_FUNC macro, add reduce_* functions * add 0dim check in ReduceSumWhole, refine yaml * fix bug * restore add add_ sub sub_ * add unittest for tensor.half tensor.add tensor.add_ * refine code * refine code * fix typo * fix bug of tensor.std() * refactor var std and cuda, using c++ functional api * add beta and threshold in softplus * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add nn_functor Check (#7910) * add bias_add_check * add bias_add error test * fix conv2d nhwc bias_add error * add nhwc conv test * add bias_add_error test * Add bias add error check * Rename * add batch matmul error check * add matmul check error msg * remove annotation * add fused mlp error msg check * Add pixel shuffle check test * add more test until normalization add relu functor * refine error message * finish all nnfunctor check msg * handle type error * remove useless symbol * modify back to TypeError * fix all comment * Remove redundant code * Remove pad ndim check * fix bias add space * fix check logic cause ci gpu not always gpu:0 Co-authored-by: hjchen2 Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add FusedMatmulBiasAddReluDropout [OneEmbedding] (#8222) * previous version for fused_matmul_bias_add_relu_dropout * add op infer * fix detail * finish forward * support dropout rate list * add forward test * fix bug for output buffer * Configurable alpha params * try to add bit mask logic * Add bitmask first version! 
* Add row col bitmask logic * support not align4 reludropout * simplify relu dropout ld logic * Add naive relu dropout grad kernel * add simple relu dropout grad kernel * Rename * support relu_dropout bitmask backward * add vectorized optimization * fix tmp buffer * add to amp list * add lazy backward logic * Refine kernel * add indextype dispatch * simplify functor logic * fix cublas fused mlp aux_ld shape bug * Add more relu dropout kernel * add full unittest * fix bug in skip final activation * refine * Remove dump func * fix format * Remove cmake * remove redundant divide * add padded version * fix dropout * oneflow curand * refine * remove redundant kernel * add unroll logic * add unroll and ballot sync * refine format * Remove fast curand * Refine python interface * Add if branch for memset * fix python logic * just for debug * not use matmul bias add grad * add launch 1 block limit * fix unittest * Refine * fix graph backward bug * limit to 11060 * change to use int32_t dtype for cublas aux * Fix jc comment * fix comment * fix convert * fix static_analysis * fix at * fix userops td * fix userops td * fix const ref * fix compile error for bfloat16 * limit to 11060 * fix bug Co-authored-by: Juncheng Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix gather 0-dim tensor bug (#8376) * fix 0-dim tensor bug * refine * support input 0-dim tensor for gather * refine * refine * refine dim_scatter_kernel check * refine * refine check * fix clang_tidy error Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add api to apply external job pass (#8370) * Add condition to find-test-cache-distributed (#8387) * add condition to find-test-cache-distributed * fix * warp dim util (#8382) * warp dim util * format * use more maybe_wrap_dim * refine array functor * add more * refine math_functor * fix_bug_in_broadcast_min_max_grad_and_broadcast_like (#8379) * fix_bug_in_broadcast_min_max_grad_and_broadcast_like * refine * fix static check error * fix bug about index (#8388) * fix bug about index * add test case Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * LogicalSliceAssign support full slice sbp (#8344) * feat(SliceOp): slice ops support 2d sbp * fix(SliceOp): fix [B, P] 2d sbp bug * refine error message * fix bug in parallel_num == 1 * add comment * add warning and format * add NOLINT for boxing check * feat(LogicalSliceOps): support all nd_sbp * feat(LogicalSlice): support nd_sbp * add error message * fix(AutoTest): fix auto_test bug in module.parameter pass * auto format by CI * fix(LogicalSliceAssign): skip test when 1n1d * fix SliceParams memset error * remove memset * add CHECK_JUST * fix(*): make sure split_axis >= 0 or equal to SPLIT_AXIS_FOR_NON_SPLIT * remove memset * fix spilit_info.axis bug * feat(LogicalSliceOps): support grad * add logical_slice gradient_funcs * feat(LogicalSliceAssign): LogicalSliceAssign support full slice sbp * auto format by CI * test(LogicalSlice): fix logical_slice dims Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Houjiang Chen Co-authored-by: oneflow-ci-bot * fix_tensor_from_numpy_mem_leak_bug (#8391) * fix_tensor_from_numpy_mem_leak_bug * add note * refine note * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Make of_pyext_obj static only to make sure only a python ext so has python symbols (#8393) * make of_pyext_obj static only * refine note Co-authored-by: mergify[bot] 
<37929162+mergify[bot]@users.noreply.github.com> * Adjust tolerance setting in embedding_renorm unit test (#8394) * support front end compile for job to iree (#8249) * support frontend dev version * polish name * add tosa-to-elf.mlir * tosa to elf by llvm * conv2d partial * an enhanced frontend runner * support numpy as input * enable multiple using nn graph with different input(jobname make it it cd /home/yuhao/frontend/oneflow ; /usr/bin/env /usr/bin/python3 /home/yuhao/.vscode-server/extensions/ms-python.python-2022.6.2/pythonFiles/lib/python/debugpy/launcher 40873 -- /home/yuhao/frontend/oneflow/oneflow/ir/test/Frontend/runner.py ) * enable multiple input * enable cpu and cuda * change full_name to _full_name * support exchange cuda with cpu seamlessly * remove pip * lit config * polish * trim * auto format by CI * modify * auto format by CI * last line polish * use unittest * auto format by CI * use allclose * auto format by CI * pulish * optimize convert oneflow to tosa * conv2d * conv2d enhanced && conv2d examples add * add road map * add add_n2Op and boardcast_addOp conversion * add matmulOp conversion * support converting normailzation op to tosa(partically) * update roadmap * support i64 tensor to dense elem attr * support 100% resnet op conversion * add test mlir * add test iree resnet python script * auto format by CI * done * enhance iree resnet test script * auto format by CI * rebuild code * auto format by CI * rebuild test script * update * auto format by CI * pub * trim test scripts * move * move * input and output add block arg judgement * emit error in variable conversion * error handle for ci * modify err info * auto format by CI * merge * auto format by CI * output not block * flow ones * rm const * trim maybe * trim maybe with header file * const auto * solve clangd error Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Feat/zero mix with mp (#8036) * add zero limit * add debug * add mix zero test * refactor zero api * zero test with mp * add 2d test * add zero nd * add nd zero * add sbp cast * test passed soft limit consumer * refine size api * zero use stage 2 * add limit consumer api * add new api * refine zero s select * fix index out of range * rm zero limit on device type * zero test with activation checkpointing * add indentity when dp sequence len is 1 * move to base with master * fix * fix * fix * add test * debug bad case * refine test for eager and graph boxing * test case ready * simplify * refine test * fix buff size * fix conflict * refine zero nd * refine * add full test * revert change * refine split check * fix typo * rm log * spit long func * restore test * Update optimizer_placement_optimization_pass.cpp * auto format by CI * auto format by CI * fix static check * add tips for zero api change * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Revert embedding normal path and fix amp list (#8374) * revert embedding normal path, fix amp list * fix amp * fix memset bug in gather cpu kernel Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * replace fixed_vector with small_vector and make Shape inherit from it (#8365) * Replace fixed_vector with llvm::SmallVector Signed-off-by: daquexian * Shape inherited from llvm::SmallVector Signed-off-by: daquexian * refine cmake Signed-off-by: daquexian * rename fixed_vector to small_vector Signed-off-by: daquexian * fix reviews Signed-off-by: 
daquexian * auto format by CI * update Shape constructor Signed-off-by: daquexian * add 'PUBLIC' keyword to all target_link_libraries Signed-off-by: daquexian * auto format by CI * update cmake Signed-off-by: daquexian * auto format by CI * update cmake Signed-off-by: daquexian * update cmake Signed-off-by: daquexian * auto format by CI * set is_initialized_ default to true Signed-off-by: daquexian * override some methods to set is_initialized_ Signed-off-by: daquexian * auto format by CI Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * Light plan for debug (#8396) * Light plan for debug * fix note * disable terminfo to fix missing terminfo symbols (#8400) * disable terminfo to fix missing terminfo symbols Signed-off-by: daquexian * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix bug of ZeRO MP in complex case (#8404) * Remove redundant output_lbns in ir (#8409) * mv case * remove redundant info * Dev FusedCrossInteraction[OneEmbedding] (#8335) * add simple fused cross interaction forward * add packed fused * Add cross interaction grad * simplify code * fix bug * support crossnet v2 * support cross interaction v2 * add lazy backward * Rename and add test * fix jc comment * fix comment * fix bug * fix userops td elem_cnt for FUSED Group * fix header file * fix clang static analysis * fix unittest Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add exe graph physical shape check msg (#8002) * fix index select op in graph * add exe graph physical shape check msg * improve the debug information for the python stack trace 1. add a parameter 'max_stack_depth' to specify the max depth for the stack trace 2. refactor other debug related classes. * remove parens * update * resolve PR comments * update * update graph debug test file. * restore self._debug in class Graph and class ModuleBlock * Do not shorten the stack frame string if it is in debug mode * delete TODOs * disable conv3d test (#7969) Signed-off-by: daquexian * skip layernorm random_data_warp test (#7941) * skip layernorm random_data_warp test * warp/block/uncached case only test gpu Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Lock click version (#7967) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add global avgpool unittest (#7585) * fix (#7978) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Support negative dim in scatter op (#7934) * support negative dim in scatter op * refine scatter test * refine scatter test again Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * run barrier callback in BarrierPhyInstrOperand::~BarrierPhyInstrOperand (#7702) * run barrier callback in BarrierPhyInstrOperand::~BarrierPhyInstrOperand * lock gil in vm Callback thread * more comments for VirtualMachineEngine::Callback() * the Env is never destroyed. * export Env into python * more unittests * wait shared_ptr.use_count() == 0 * export unittest.TestCase in framework/unittest.py * SwitchToShuttingDownPhase * optional is_normal_exit * VirtualMachine::CloseVMThreads * Delete env_api.h env_api.h is deleted by master * reshape_only_one_dim_infered * address pr comments * fix a ref-cnt bug in TryRunBarrierInstruction. 
* rollback flow.env.all_device_placement * no distributed running test_shutting_down.py * auto format by CI * expand lifetime of module oneflow in test_shutting_down.py * refine del depend on of * capture oneflow._oneflow_internal.eager when calling sync in __del__ * add try in flaky test Co-authored-by: Luyang Co-authored-by: chengtbf <472491134@qq.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot Co-authored-by: Xiaoyu Xu * Fix one hot scalar tensor bug (#7975) * fix reduce_sum scalar check bug * fix one_hot scalar tensor bug * fix clang tidy error Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * support ctor np array from of tensor (#7970) * support ctor np array from of tensor * add test case constructing np array from tensor * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add_manual_seed_all_api (#7957) * add_manual_seed_all_api * Update conf.py * refine * add test case * auto format by CI * Update random_generator.cpp * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * one_embedding add doc string (#7902) * add doc string * add example * add * fix doc * refine * address review * mb to MB * add make_table_option * option to options * refine * add forward Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Support numpy scalar parameters (#7935) * feat(functional): support numpy scalar parameters * rename inferface * feat(*): TensorIndex support numpy scalar * feat(TensorIndex): support advance indexing * add unittest and int32 support for branch feat-param_support_np_scalar (#7939) * add unittest * refactor unittest * add todo for int16 advanced indexing * add int32 supporting for advance indexing * auto format by CI Co-authored-by: Wang Yi <53533850+marigoold@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * fix tensor_scatter_nd_update (#7953) * fix tensor_scatter_nd_update * auto backward Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix one_embedding adam (#7974) * fix one_embedding adam * fix tidy * fix normal Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * speed test with score (#7990) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Feat/graph del by ref (#7857) * remove IsMultiClient() and single client logic Signed-off-by: daquexian * rename eager.multi_client to eager Signed-off-by: daquexian * auto format by CI * add py ref * refine new session * clean code * make scope api inner use * use session with ref cnt * run barrier callback in BarrierPhyInstrOperand::~BarrierPhyInstrOperand * test pass * lock gil in vm Callback thread * more comments for VirtualMachineEngine::Callback() * merge * merge rm single client * rm initenv * merge and fix master * refactor env c api * add debug code * fix and serving test pass * test passed * rm useless * rm useless code * format * rm useless include * rm sync in py * the Env is never destroyed. 
* export Env into python * more unittests * fix and pass tests * revert virtual_machine.cpp * revert core/vm * remove outdated python class oneflow.unittest.TestCase * graph test passed * wait shared_ptr.use_count() == 0 * export unittest.TestCase in framework/unittest.py * SwitchToShuttingDownPhase * optional is_normal_exit * VirtualMachine::CloseVMThreads * Delete env_api.h env_api.h is deleted by master * address pr comments * rm is env init * Clear empty thread when graph destroy (#7633) * Revert "Clear empty thread when graph destroy (#7633)" (#7860) This reverts commit 3e8585e5fa20b97229d6b0be46a7ff814dc8cd83. * fix a ref-cnt bug in TryRunBarrierInstruction. * rm env_api * fix clang-tidy error * fix clang-tidy in env_imp * refine env api * format * refine graph del and sync at shuttingdown * fix typo * add comment * rm useless * rm useless Co-authored-by: daquexian Co-authored-by: oneflow-ci-bot Co-authored-by: lixinqi Co-authored-by: Li Xinqi Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Luyang Co-authored-by: cheng cheng <472491134@qq.com> * [PersistentTable] Fix num blocks (#7986) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add auto benchmark for flowvision (#7806) * update yml * update workflow * add resnet50 * [PersistentTable] Async write (#7946) * [PersistentTable] Async write * fix Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * save log in separate dir by default (#7825) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix index select op in graph * add exe graph physical shape check msg * improve the debug information for the python stack trace 1. add a parameter 'max_stack_depth' to specify the max depth for the stack trace 2. refactor other debug related classes. * remove parens * update * resolve PR comments * update * update graph debug test file. * restore self._debug in class Graph and class ModuleBlock * Do not shorten the stack frame string if it is in debug mode * delete TODOs * Revert "Merge branch 'master' into fea/graph_check_msg" This reverts commit 28833b73a8041463e5e3d130784be386ee248bd8, reversing changes made to baadf6045f2fce69c090e442a755229c1c949773. * Revert "Revert "Merge branch 'master' into fea/graph_check_msg"" This reverts commit 1d5e196d8530ffd2b9bf781abcf168b94ff9ca41. 
* update * resolve conflicts * resolve conflicts Co-authored-by: Cijie Xia Co-authored-by: daquexian Co-authored-by: guo ran <360112263@qq.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Shenghang Tsai Co-authored-by: Houjiang Chen Co-authored-by: Peihong Liu Co-authored-by: Li Xinqi Co-authored-by: Luyang Co-authored-by: chengtbf <472491134@qq.com> Co-authored-by: oneflow-ci-bot Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Co-authored-by: liufengwei0103 <2472937968@qq.com> Co-authored-by: binbinHan Co-authored-by: Yinggang Wang Co-authored-by: Wang Yi <53533850+marigoold@users.noreply.github.com> Co-authored-by: Shijie <821898965@qq.com> Co-authored-by: lixinqi Co-authored-by: Juncheng * add batch_matmul sbp (#8385) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * suppress gcc11 false positive warning (#8401) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix variable op conversion to tosa error in ninja c1 (#8412) * pub * move test iree resnet python script to oneflow_iree repo * add bracket * rename const_val to const_val_ and restore resnet.py test script Co-authored-by: Shenghang Tsai * nccl send/recv support different placement * refine * auto format by CI * rm out ctrl * auto format by CI Co-authored-by: guo-ran <360112263@qq.com> Co-authored-by: Shenghang Tsai Co-authored-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot Co-authored-by: liufengwei0103 <2472937968@qq.com> Co-authored-by: Wang Yi <53533850+marigoold@users.noreply.github.com> Co-authored-by: ZZK <359521840@qq.com> Co-authored-by: hjchen2 Co-authored-by: Juncheng Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Co-authored-by: Luyang Co-authored-by: binbinHan Co-authored-by: Yinggang Wang Co-authored-by: Yao Zihang <1162526220@qq.com> Co-authored-by: yuhao <72971170+howin98@users.noreply.github.com> Co-authored-by: Xiaoyu Xu Co-authored-by: cheng cheng <472491134@qq.com> Co-authored-by: Cijie Xia Co-authored-by: Peihong Liu Co-authored-by: Li Xinqi Co-authored-by: Shijie <821898965@qq.com> Co-authored-by: lixinqi --- .github/workflows/test.yml | 28 +- cmake/oneflow.cmake | 24 +- cmake/op_schema.cmake | 2 +- cmake/util.cmake | 6 + docs/source/graph.rst | 3 +- oneflow/api/common/variable_tensor_mgr.h | 4 + oneflow/api/cpp/framework/graph.cpp | 36 +- oneflow/api/cpp/framework/graph.h | 3 + oneflow/api/python/framework/nn_graph.cpp | 10 +- oneflow/api/python/framework/tensor.cpp | 1 + .../api/python/framework/tensor_functions.cpp | 228 +++++++- .../python/framework/variable_tensor_mgr.cpp | 1 + oneflow/api/python/functional/tensor_api.cpp | 10 +- .../gradient_funcs/broadcast_binary_ops.cpp | 25 +- .../gradient_funcs/cublas_fused_mlp.cpp | 60 +-- .../fused_cross_interaction.cpp | 117 ++++ .../fused_matmul_bias_add_relu_dropout.cpp | 205 +++++++ .../autograd/gradient_funcs/normalization.cpp | 7 +- oneflow/core/common/cached_caller.h | 11 + oneflow/core/common/error_util.cpp | 5 +- oneflow/core/common/fixed_vector.h | 277 ---------- oneflow/core/common/fixed_vector_test.cpp | 419 -------------- oneflow/core/common/shape.cpp | 48 +- oneflow/core/common/shape.h | 46 +- oneflow/core/common/shape_vec.h | 15 +- oneflow/core/common/shape_view.cpp | 2 +- oneflow/core/common/small_vector.h | 53 ++ oneflow/core/common/wrap_dim_utils.h | 40 ++ .../broadcast_elementwise_binary.cpp | 
20 +- oneflow/core/framework/nn_graph.cpp | 3 + oneflow/core/framework/tensor_methods.cpp | 10 +- .../core/framework/variable_tensor_mgr.cpp | 4 + oneflow/core/framework/variable_tensor_mgr.h | 1 + oneflow/core/functional/function_library.h | 1 + oneflow/core/functional/functional_api.yaml | 30 +- .../functional/impl/activation_functor.cpp | 9 +- .../core/functional/impl/array_functor.cpp | 147 +++-- oneflow/core/functional/impl/common.cpp | 10 +- oneflow/core/functional/impl/math_functor.cpp | 93 +--- oneflow/core/functional/impl/nn_functor.cpp | 259 +++++++-- .../core/functional/impl/nn_grad_functor.cpp | 88 ++- ...erarchical_sub_task_graph_builder_impl.cpp | 124 ++++- oneflow/core/graph/exec_graph.cpp | 18 +- .../graph/nccl_send_recv_boxing_task_node.cpp | 92 ++++ .../graph/nccl_send_recv_boxing_task_node.h | 57 ++ oneflow/core/graph/op_graph.cpp | 6 + oneflow/core/graph/task_graph.cpp | 6 + oneflow/core/job/eager_nccl_comm_manager.cpp | 3 + oneflow/core/job/job_build_and_infer_ctx.cpp | 6 + oneflow/core/job/job_builder.cpp | 1 + oneflow/core/job/job_conf.proto | 1 + oneflow/core/job/job_ir.cpp | 4 + oneflow/core/job/job_ir.h | 1 + oneflow/core/job/plan_util.cpp | 126 ++++- oneflow/core/job/plan_util.h | 1 + oneflow/core/job/task.proto | 1 + .../auto_mixed_precision_lists.cpp | 2 + .../optimizer_placement_optimization_pass.cpp | 292 ++++++++-- .../kernel/nccl_send_recv_boxing_kernel.cpp | 258 +++++++++ oneflow/core/lazy/actor/naive_actor.cpp | 1 + .../operator/nccl_send_recv_boxing_op.cpp | 133 +++++ .../nccl_send_recv_boxing_op_util.cpp | 170 ++++++ .../operator/nccl_send_recv_boxing_op_util.h | 31 ++ oneflow/core/operator/op_conf.proto | 15 + oneflow/extension/python/numpy_internal.h | 2 +- oneflow/ir/include/OneFlow/OneFlowOps.td | 4 + oneflow/ir/include/OneFlow/OneFlowUserOps.td | 111 +++- oneflow/ir/install-llvm.cmake | 3 + .../lib/OneFlow/Conversion/OneFlowToTosa.cpp | 510 +++++++++++++++++- oneflow/ir/lib/OneFlow/OneFlowOps.cpp | 1 + oneflow/ir/lib/OneFlow/OneFlowSupport.cpp | 18 +- oneflow/ir/llvm-in-tree.cmake | 2 + oneflow/ir/oneflow-extension/ir_pass.cpp | 5 + .../include/OneFlow/MLIROneFlowTranslation.h | 1 + .../lib/OneFlow/MLIROneFlowTranslation.cpp | 29 + oneflow/ir/test/Frontend/OneFlowToIree.mlir | 266 +++++++++ oneflow/ir/test/Frontend/lit.local.cfg | 2 + oneflow/ir/test/Frontend/test_iree_resnet.py | 107 ++++ oneflow/ir/test/Frontend/test_iree_runner.py | 71 +++ .../ir/test/Frontend/test_tosa_to_elf.mlir | 16 + .../OneFlow/conversion/OneFlowToTosa.mlir | 342 ++++++++++++ .../test_fuser_cast_scale.py | 0 oneflow/ir/test/lit.cfg.py | 7 + .../user/kernels/arg_where_kernel_util.cpp | 2 +- oneflow/user/kernels/arg_where_kernel_util.cu | 2 +- oneflow/user/kernels/avg_pool_kernel_util.h | 2 +- ...cublas_bias_add_relu_matmul_grad_kernel.cu | 7 +- .../cublas_fused_matmul_bias_add_grad.cu | 9 +- .../user/kernels/cublas_fused_mlp_kernel.cu | 4 + oneflow/user/kernels/cum_backward_kernel.cpp | 116 ++-- oneflow/user/kernels/dim_gather_kernels.cpp | 13 +- .../user/kernels/dim_scatter_kernel_util.h | 2 +- oneflow/user/kernels/dim_scatter_kernels.cpp | 2 +- .../kernels/dim_scatter_scalar_kernels.cpp | 2 +- oneflow/user/kernels/dropout_kernel.cu | 5 +- .../fused_cross_feature_interaction.cu | 257 +++++++++ .../fused_cross_feature_interaction_grad.cu | 454 ++++++++++++++++ .../fused_matmul_bias_add_relu_dropout.cu | 478 ++++++++++++++++ .../kernels/fused_relu_dropout_grad_kernel.cu | 148 +++++ oneflow/user/kernels/gather_kernel_util.cpp | 2 +- 
.../user/kernels/image_preprocess_kernels.cu | 2 +- oneflow/user/kernels/max_pool_kernel_util.h | 2 +- oneflow/user/kernels/slice_kernel.cpp | 84 +-- oneflow/user/ops/arange_op.cpp | 5 +- oneflow/user/ops/constant_op.cpp | 5 +- oneflow/user/ops/cublas_fused_mlp_op.cpp | 100 ++-- oneflow/user/ops/dim_gather_op.cpp | 8 +- oneflow/user/ops/dim_scatter_ops.cpp | 15 +- oneflow/user/ops/distributions/normal_op.cpp | 5 +- .../user/ops/distributions/uniform_int_op.cpp | 5 +- oneflow/user/ops/distributions/uniform_op.cpp | 5 +- oneflow/user/ops/eager_nccl_ops.cpp | 5 +- oneflow/user/ops/empty_op.cpp | 5 +- .../fused_cross_feature_interaction_op.cpp | 181 +++++++ .../fused_matmul_bias_add_relu_dropout_op.cpp | 263 +++++++++ .../user/ops/fused_relu_dropout_grad_op.cpp | 61 +++ oneflow/user/ops/gather_op.cpp | 3 +- oneflow/user/ops/image_preprocess_ops.cpp | 5 +- oneflow/user/ops/matmul_op.cpp | 46 +- oneflow/user/ops/randperm_op.cpp | 5 +- oneflow/user/ops/slice_op.cpp | 13 +- oneflow/user/ops/stack_op.cpp | 2 - oneflow/user/utils/pool_util.h | 4 +- python/oneflow/autoprof/__main__.py | 24 +- .../oneflow/framework/multi_client_session.py | 3 + python/oneflow/framework/sysconfig.py | 1 - python/oneflow/framework/tensor.py | 46 +- python/oneflow/nn/graph/graph.py | 21 +- python/oneflow/nn/graph/graph_config.py | 133 ++--- python/oneflow/nn/modules/fused_mlp.py | 36 +- python/oneflow/nn/modules/sparse.py | 14 +- .../test/exceptions/test_array_functor.py | 2 +- .../test/exceptions/test_nn_functor.py | 386 +++++++++++++ python/oneflow/test/graph/test_graph_zero.py | 182 ++++++- .../test/graph/test_optimization_conf.py | 2 +- .../test/modules/test_consistent_diagonal.py | 1 + .../test/modules/test_consistent_slice.py | 12 +- .../modules/test_consistent_slice_assign.py | 75 ++- python/oneflow/test/modules/test_conv2d.py | 102 ++++ .../test/modules/test_cublas_fused_mlp.py | 6 +- python/oneflow/test/modules/test_cum_ops.py | 25 + .../modules/test_fused_cross_interaction.py | 154 ++++++ ...test_fused_matmul_bias_add_relu_dropout.py | 192 +++++++ python/oneflow/test/modules/test_gather.py | 33 ++ python/oneflow/test/modules/test_max.py | 8 + .../modules/test_nccl_send_recv_boxing.py | 103 ++++ python/oneflow/test/modules/test_sparse.py | 2 +- .../oneflow/test/modules/test_tensor_ops.py | 14 + .../oneflow/test/tensor/test_tensor_part_1.py | 24 + 149 files changed, 7580 insertions(+), 1562 deletions(-) create mode 100644 oneflow/core/autograd/gradient_funcs/fused_cross_interaction.cpp create mode 100644 oneflow/core/autograd/gradient_funcs/fused_matmul_bias_add_relu_dropout.cpp delete mode 100644 oneflow/core/common/fixed_vector.h delete mode 100644 oneflow/core/common/fixed_vector_test.cpp create mode 100644 oneflow/core/common/small_vector.h create mode 100644 oneflow/core/common/wrap_dim_utils.h create mode 100644 oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp create mode 100644 oneflow/core/graph/nccl_send_recv_boxing_task_node.h create mode 100644 oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp create mode 100644 oneflow/core/operator/nccl_send_recv_boxing_op.cpp create mode 100644 oneflow/core/operator/nccl_send_recv_boxing_op_util.cpp create mode 100644 oneflow/core/operator/nccl_send_recv_boxing_op_util.h create mode 100644 oneflow/ir/test/Frontend/OneFlowToIree.mlir create mode 100644 oneflow/ir/test/Frontend/lit.local.cfg create mode 100644 oneflow/ir/test/Frontend/test_iree_resnet.py create mode 100644 oneflow/ir/test/Frontend/test_iree_runner.py create mode 100644 
oneflow/ir/test/Frontend/test_tosa_to_elf.mlir create mode 100644 oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir rename oneflow/ir/test/OneFlow/{ => cuda_code_gen}/test_fuser_cast_scale.py (100%) create mode 100644 oneflow/user/kernels/fused_cross_feature_interaction.cu create mode 100644 oneflow/user/kernels/fused_cross_feature_interaction_grad.cu create mode 100644 oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu create mode 100644 oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu create mode 100644 oneflow/user/ops/fused_cross_feature_interaction_op.cpp create mode 100644 oneflow/user/ops/fused_matmul_bias_add_relu_dropout_op.cpp create mode 100644 oneflow/user/ops/fused_relu_dropout_grad_op.cpp create mode 100644 python/oneflow/test/exceptions/test_nn_functor.py create mode 100644 python/oneflow/test/modules/test_fused_cross_interaction.py create mode 100644 python/oneflow/test/modules/test_fused_matmul_bias_add_relu_dropout.py create mode 100644 python/oneflow/test/modules/test_nccl_send_recv_boxing.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e8732d99567..213d0246ebf 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -347,7 +347,7 @@ jobs: find-test-cache-distributed: name: "Find test cache (distributed)" - if: github.event.pull_request.draft == false && github.base_ref == 'master' && false + if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.labels.*.name, 'need-test-distributed') runs-on: ubuntu-latest needs: [build-oneflow] env: @@ -411,10 +411,10 @@ jobs: test-distributed: name: Distributed test suite - needs: [wait_for_gpu_slot, find-test-cache-distributed] + needs: [wait_for_gpu_slot, find-test-cache-distributed, test] runs-on: ${{ matrix.runs-on }} timeout-minutes: 120 - if: github.event.pull_request.draft == false && github.base_ref == 'master' && false + if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.labels.*.name, 'need-test-distributed') concurrency: group: distributed-test-${{ matrix.entry }}-rank-${{ matrix.rank }} cancel-in-progress: false @@ -439,6 +439,22 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} + - name: Checkout Oneflow-Inc/vision + if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} + uses: actions/checkout@v2 + with: + repository: Oneflow-Inc/vision + # please use a commit here + ref: ${{ env.FLOW_VISION_COMMIT}} + path: ${{ env.FLOW_VISION_SRC}} + - name: Checkout Oneflow-Inc/libai + if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} + uses: actions/checkout@v2 + with: + repository: Oneflow-Inc/libai + # please use a commit here + ref: ${{ env.LIBAI_COMMIT}} + path: ${{ env.LIBAI_SRC}} - name: Remove container timeout-minutes: 45 if: ${{ contains(matrix.runs-on, 'self-hosted') }} @@ -537,6 +553,12 @@ jobs: ls ${ONEFLOW_WHEEL_PATH} docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple docker exec ${TEST_CONTAINER_NAME} python3 -m pip install --find-links=${ONEFLOW_WHEEL_PATH} oneflow + - name: Install downstream libs + if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} + run: | + docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}} + docker exec ${TEST_CONTAINER_NAME} python3 -m pip 
install pybind11 --user + docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.LIBAI_SRC}} - name: Module API test (distributed) timeout-minutes: 90 if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'module' && matrix.device == 'cuda' && fromJson(matrix.is-distributed) }} diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 4a2cf3b06f7..205224541a2 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -197,7 +197,7 @@ generate_functional_api_and_pybind11_cpp(FUNCTIONAL_GENERATED_SRCS FUNCTIONAL_GE FUNCTIONAL_PYBIND11_SRCS ${PROJECT_SOURCE_DIR}) oneflow_add_library(of_functional_obj STATIC ${FUNCTIONAL_GENERATED_SRCS} ${FUNCTIONAL_GENERATED_HRCS}) -target_link_libraries(of_functional_obj glog::glog) +target_link_libraries(of_functional_obj LLVMSupportWithHeader glog::glog) add_dependencies(of_functional_obj prepare_oneflow_third_party) if(BUILD_PYTHON) @@ -214,7 +214,7 @@ if(BUILD_PYTHON) of_functional_tensor_obj STATIC ${FUNCTIONAL_TENSOR_GENERATED_SRCS} ${FUNCTIONAL_TENSOR_GENERATED_HRCS} ${FUNCTIONAL_OPS_GENERATED_SRCS} ${FUNCTIONAL_OPS_GENERATED_HRCS}) - target_link_libraries(of_functional_tensor_obj glog::glog) + target_link_libraries(of_functional_tensor_obj LLVMSupportWithHeader glog::glog) add_dependencies(of_functional_tensor_obj prepare_oneflow_third_party) target_include_directories(of_functional_tensor_obj PRIVATE ${Python_INCLUDE_DIRS} ${Python_NumPy_INCLUDE_DIRS}) @@ -274,6 +274,22 @@ if(WITH_MLIR) set(ONEFLOW_MLIR_LIBS -Wl,--no-as-needed MLIROneFlowExtension -Wl,--as-needed) endif() +if("${LLVM_PROVIDER}" STREQUAL "install") + get_property(LLVM_INSTALL_DIR GLOBAL PROPERTY LLVM_INSTALL_DIR) + check_variable_defined(LLVM_INSTALL_DIR) + find_library(LLVMSupportLib LLVMSupport PATHS ${LLVM_INSTALL_DIR}/lib REQUIRED) + add_library(LLVMSupportWithHeader UNKNOWN IMPORTED) + set_property(TARGET LLVMSupportWithHeader PROPERTY IMPORTED_LOCATION ${LLVMSupportLib}) +else() + add_library(LLVMSupportWithHeader INTERFACE IMPORTED) + target_link_libraries(LLVMSupportWithHeader INTERFACE LLVMSupport) +endif() +check_variable_defined(LLVM_INCLUDE_DIRS) +set_property(TARGET LLVMSupportWithHeader PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${LLVM_INCLUDE_DIRS}) + +list(APPEND oneflow_third_party_libs LLVMSupportWithHeader) + include(op_schema) get_property(EXTERNAL_INCLUDE_DIRS GLOBAL PROPERTY EXTERNAL_INCLUDE_DIRS) @@ -317,7 +333,9 @@ endif() if(BUILD_PYTHON) # py ext lib - oneflow_add_library(of_pyext_obj SHARED ${of_pyext_obj_cc}) + # This library should be static to make sure all python symbols are included in the final ext shared lib, + # so that it is safe to do wheel audits of multiple pythons version in parallel. 
+ oneflow_add_library(of_pyext_obj STATIC ${of_pyext_obj_cc}) target_include_directories(of_pyext_obj PRIVATE ${Python_INCLUDE_DIRS} ${Python_NumPy_INCLUDE_DIRS}) target_link_libraries(of_pyext_obj oneflow pybind11::headers) diff --git a/cmake/op_schema.cmake b/cmake/op_schema.cmake index ce790c1918b..5017fab574e 100644 --- a/cmake/op_schema.cmake +++ b/cmake/op_schema.cmake @@ -81,5 +81,5 @@ set_source_files_properties(${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP} TRUE) oneflow_add_library(of_op_schema OBJECT ${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP}) -target_link_libraries(of_op_schema glog::glog) +target_link_libraries(of_op_schema LLVMSupportWithHeader glog::glog) add_dependencies(of_op_schema prepare_oneflow_third_party) diff --git a/cmake/util.cmake b/cmake/util.cmake index 3aaae830e12..a69128f416e 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -269,6 +269,12 @@ function(set_compile_options_to_oneflow_target target) endif() endfunction() +function(check_variable_defined variable) + if(NOT DEFINED ${variable}) + message(FATAL_ERROR "Variable ${variable} is not defined") + endif() +endfunction() + function(checkDirAndAppendSlash) set(singleValues DIR;OUTPUT) set(prefix ARG) diff --git a/docs/source/graph.rst b/docs/source/graph.rst index 5ec08061a8a..270e5a01cf0 100644 --- a/docs/source/graph.rst +++ b/docs/source/graph.rst @@ -20,12 +20,11 @@ Base class for running neural networks in Static Graph Mode. .. autoclass:: oneflow.nn.graph.graph_config.GraphConfig :members: enable_amp, + enable_zero, allow_fuse_model_update_ops, allow_fuse_add_to_output, allow_fuse_cast_scale, set_gradient_accumulation_steps, - set_zero_redundancy_optimizer_mode, - set_zero_redundancy_optimizer_min_size_after_split, enable_cudnn_conv_heuristic_search_algo, :member-order: bysource diff --git a/oneflow/api/common/variable_tensor_mgr.h b/oneflow/api/common/variable_tensor_mgr.h index 883ebcaf381..3f0f5618492 100644 --- a/oneflow/api/common/variable_tensor_mgr.h +++ b/oneflow/api/common/variable_tensor_mgr.h @@ -28,6 +28,10 @@ inline Maybe FillVariableTensorMgr( auto mgr = Global::Get(); return mgr->Fill(variable_op_names, variable_tensors); } +inline void ClearVariableTensorMgr() { + auto mgr = Global::Get(); + mgr->Clear(); +} inline std::tuple, std::vector>> DumpVariableTensorMgr() { diff --git a/oneflow/api/cpp/framework/graph.cpp b/oneflow/api/cpp/framework/graph.cpp index 37ce8323a60..a49d022c145 100644 --- a/oneflow/api/cpp/framework/graph.cpp +++ b/oneflow/api/cpp/framework/graph.cpp @@ -127,6 +127,9 @@ class Graph::GraphImpl final { std::vector Forward(const std::vector& inputs); void set_batch_size(int batch_size) { batch_size_ = batch_size; } + of::Maybe RegisterJobPass( + const std::function& pass_fn); + private: of::Maybe CollectInputOutputInfos(); of::Maybe Compile(const std::vector& inputs); @@ -135,6 +138,7 @@ class Graph::GraphImpl final { of::Maybe BuildGraph(); of::Maybe LoadCheckpoint(); of::Maybe RegisterTensors(const std::vector& inputs); + of::Maybe ApplyJobPasses(const of::Job& job); std::shared_ptr graph_ = nullptr; std::string model_path_; @@ -149,6 +153,7 @@ class Graph::GraphImpl final { of::HashMap> variable_op_name_to_tensor_; std::shared_ptr output_tensor_tuple_; std::shared_ptr parameter_tensor_tuple_; + std::vector> registered_job_passes_; }; Graph::Graph(const std::string& model_path, const Device& device) @@ -168,6 +173,10 @@ InputOutputInfos Graph::GetInputInfos() { return graph_->GetInputInfos(); } InputOutputInfos Graph::GetOutputInfos() { return 
graph_->GetOutputInfos(); } +void Graph::RegisterJobPass(const std::function& pass_fn) { + CHECK_JUST(graph_->RegisterJobPass(pass_fn)); +} + IValue Graph::Forward(const IValue& inputs) { std::vector input_tensors; if (inputs.IsNone()) { @@ -234,6 +243,28 @@ of::Maybe Graph::GraphImpl::CollectInputOutputInfos() { return of::Maybe::Ok(); } +of::Maybe Graph::GraphImpl::RegisterJobPass( + const std::function& pass_fn) { + if (is_compiled_) { + return of::Error::RuntimeError() << "job pass should be registered before compile and forward"; + } + registered_job_passes_.emplace_back(pass_fn); + return of::Maybe::Ok(); +} + +of::Maybe Graph::GraphImpl::ApplyJobPasses(const of::Job& job) { + auto current_job = std::make_shared(job); + for (const auto& pass_fn : registered_job_passes_) { + std::string new_serialized_job = pass_fn(current_job->SerializeAsString()); + of::Job new_job; + if (!new_job.ParseFromString(new_serialized_job)) { + return of::Error::RuntimeError() << "invalid serialized job after pass applied"; + } + current_job->Swap(&new_job); + } + return current_job; +} + std::vector Graph::GraphImpl::Forward(const std::vector& inputs) { if (!is_compiled_) { static std::mutex mtx; @@ -299,9 +330,12 @@ of::Maybe Graph::GraphImpl::BuildGraph() { } JUST(LoadCheckpoint()); JUST(of::CurJobBuildAndInferCtx_Complete()); - const std::shared_ptr complete_job = JUST(of::GetCurrentJob()); + std::shared_ptr complete_job = JUST(of::GetCurrentJob()); int64_t job_id = JUST(of::JobBuildAndInferCtx_GetCurrentJobId()); CHECK(of::Global::Get() != nullptr); + + // apply custom job passes + complete_job = JUST(ApplyJobPasses(*complete_job)); graph_ = std::make_shared(job_.job_conf().job_name(), *complete_job, job_id, of::Global::Get()->GetSessionCtx()); { diff --git a/oneflow/api/cpp/framework/graph.h b/oneflow/api/cpp/framework/graph.h index 0cd166ea3f8..ea58b3a5097 100644 --- a/oneflow/api/cpp/framework/graph.h +++ b/oneflow/api/cpp/framework/graph.h @@ -24,6 +24,7 @@ limitations under the License. 
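Note on the job-pass hooks above: GraphImpl::ApplyJobPasses threads the serialized Job through every pass registered via RegisterJobPass, in registration order, and returns a RuntimeError if a pass hands back a string that no longer parses as a Job; passes must be registered before the first compile/forward, since BuildGraph applies them exactly once after CurJobBuildAndInferCtx_Complete. A minimal sketch of that pipeline in plain Python, with toy byte-to-byte functions standing in for the registered C++ string-to-string passes (none of the names below are OneFlow APIs):

    def apply_job_passes(serialized_job, passes):
        # Mirror of ApplyJobPasses: thread the serialized job through each pass
        # and re-validate the result before handing it to the next one.
        current = serialized_job
        for pass_fn in passes:
            result = pass_fn(current)
            if not isinstance(result, (bytes, bytearray)):
                raise RuntimeError("invalid serialized job after pass applied")
            current = bytes(result)
        return current

    identity_pass = lambda job: job             # leaves the job unchanged
    tag_pass = lambda job: job + b"#rewritten"  # toy stand-in for a real graph rewrite

    print(apply_job_passes(b"job-proto-bytes", [identity_pass, tag_pass]))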
#include "tensor.h" #include #include +#include #include namespace oneflow { @@ -64,6 +65,8 @@ class Graph { IValue Forward(const IValue& inputs); void set_batch_size(int batch_size); + void RegisterJobPass(const std::function& pass_fn); + static Graph Load(const std::string& model_path, const Device& device = Device("cpu")); private: diff --git a/oneflow/api/python/framework/nn_graph.cpp b/oneflow/api/python/framework/nn_graph.cpp index e02d86e9eb1..9e0c939b3e2 100644 --- a/oneflow/api/python/framework/nn_graph.cpp +++ b/oneflow/api/python/framework/nn_graph.cpp @@ -80,12 +80,18 @@ ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) { m.def("RunLazyNNGraph", &RunLazyNNGraph); m.def("SoftSyncNNGraphBuffers", &SoftSyncNNGraphBuffers); m.def("AddTensorAsGraphLoss", &AddTensorAsGraphLoss); + m.def("ConvertJobToTosaIR", [](const std::string& serialized_job) -> Maybe { + Job job; + CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)) + << "serialized job conversion failed."; + return ConvertJobToTosaIR(&job); + }); m.def("SaveJobToIR", [](const std::string& serialized_job, const std::string& path) -> Maybe { Job job; - CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)); + CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)) + << "serialized job conversion failed."; return SaveJobToIR(&job, path); - ; }); m.def("LoadSerializedJobFromIR", [](const std::string& path) -> Maybe { Job job; diff --git a/oneflow/api/python/framework/tensor.cpp b/oneflow/api/python/framework/tensor.cpp index 2094bb85b87..71120182894 100644 --- a/oneflow/api/python/framework/tensor.cpp +++ b/oneflow/api/python/framework/tensor.cpp @@ -337,6 +337,7 @@ static PyObject* PyTensorObject_type(PyObject* self, PyObject* args, PyObject* k ASSERT(CopyBetweenMirroredTensorAndNumpy(PyTensor_Unpack(self), copied, \ BlobNumpyCopyUtil::From, "mut", \ /*block_host_until_done=*/false)); \ + Py_DECREF(copied); \ Py_RETURN_NONE; \ END_HANDLE_ERRORS \ } diff --git a/oneflow/api/python/framework/tensor_functions.cpp b/oneflow/api/python/framework/tensor_functions.cpp index 1c6198c2ab0..2dbfd4a3a02 100644 --- a/oneflow/api/python/framework/tensor_functions.cpp +++ b/oneflow/api/python/framework/tensor_functions.cpp @@ -22,6 +22,7 @@ limitations under the License. 
#include "oneflow/core/common/shape_vec.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/common/shape.h" +#include "oneflow/core/common/wrap_dim_utils.h" namespace oneflow { namespace one { @@ -241,7 +242,6 @@ DIRECT_PASS_FUNC(PyTensorObject_div, functional::div) DIRECT_PASS_FUNC(PyTensorObject_div_, functional::div_) DIRECT_PASS_FUNC(PyTensorObject_mul, functional::mul) DIRECT_PASS_FUNC(PyTensorObject_mul_, functional::mul_) -DIRECT_PASS_FUNC(PyTensorObject_sub, functional::sub) DIRECT_PASS_FUNC(PyTensorObject_fmod, functional::fmod) DIRECT_PASS_FUNC(PyTensorObject_logical_and, functional::logical_and) DIRECT_PASS_FUNC(PyTensorObject_logical_or, functional::logical_or) @@ -253,8 +253,36 @@ DIRECT_PASS_FUNC(PyTensorObject_bmm, functional::batch_matmul) DIRECT_PASS_FUNC(PyTensorObject_argmax, functional::argmax) DIRECT_PASS_FUNC(PyTensorObject_argmin, functional::argmin) DIRECT_PASS_FUNC(PyTensorObject_amin, functional::amin) +DIRECT_PASS_FUNC(PyTensorObject_amax, functional::amax) DIRECT_PASS_FUNC(PyTensorObject_addcmul, functional::addcmul) DIRECT_PASS_FUNC(PyTensorObject_addcmul_, functional::addcmul_) +DIRECT_PASS_FUNC(PyTensorObject_clip, functional::clip) +DIRECT_PASS_FUNC(PyTensorObject_clip_, functional::clip_) +DIRECT_PASS_FUNC(PyTensorObject_clamp, functional::clamp) +DIRECT_PASS_FUNC(PyTensorObject_clamp_, functional::clamp_) +DIRECT_PASS_FUNC(PyTensorObject_flatten, functional::flatten) +DIRECT_PASS_FUNC(PyTensorObject_in_top_k, functional::in_top_k) +DIRECT_PASS_FUNC(PyTensorObject_index_select, functional::index_select) +DIRECT_PASS_FUNC(PyTensorObject_maximum, functional::maximum) +DIRECT_PASS_FUNC(PyTensorObject_minimum, functional::minimum) +DIRECT_PASS_FUNC(PyTensorObject_tril, functional::tril) +DIRECT_PASS_FUNC(PyTensorObject_triu, functional::triu) +DIRECT_PASS_FUNC(PyTensorObject_softmax, functional::softmax) +DIRECT_PASS_FUNC(PyTensorObject_log_softmax, functional::log_softmax) +DIRECT_PASS_FUNC(PyTensorObject_roll, functional::roll) +DIRECT_PASS_FUNC(PyTensorObject_unbind, functional::unbind) +DIRECT_PASS_FUNC(PyTensorObject_squeeze, functional::squeeze) +DIRECT_PASS_FUNC(PyTensorObject_swapaxes, functional::swapaxes) +DIRECT_PASS_FUNC(PyTensorObject_swapdims, functional::swapdims) +DIRECT_PASS_FUNC(PyTensorObject_unfold, functional::unfold_tensor) +DIRECT_PASS_FUNC(PyTensorObject_unsqueeze, functional::unsqueeze) +DIRECT_PASS_FUNC(PyTensorObject_max, functional::max) +DIRECT_PASS_FUNC(PyTensorObject_min, functional::min) +DIRECT_PASS_FUNC(PyTensorObject_median, functional::median) +DIRECT_PASS_FUNC(PyTensorObject_pow, functional::pow) +DIRECT_PASS_FUNC(PyTensorObject_chunk, functional::chunk) +DIRECT_PASS_FUNC(PyTensorObject_narrow, functional::narrow) +DIRECT_PASS_FUNC(PyTensorObject_masked_fill, functional::masked_fill) // functions that parsing at Python C api layer static PyObject* PyTensorObject_byte(PyObject* self, PyObject* unused) { @@ -302,10 +330,7 @@ static PyObject* PyTensorObject_size(PyObject* self, PyObject* args, PyObject* k if (idx_obj == NULL || idx_obj == Py_None) return TensorSize_NewFromShape(*shape); int64_t idx = PyLong_AsLongLong(idx_obj); int64_t ndim = shape->NumAxes(); - - CHECK_OR_THROW(idx >= -ndim && idx < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << idx << ")"; + idx = CHECK_JUST(maybe_wrap_dim(idx, ndim)); idx = idx < 0 ? 
idx + ndim : idx; return PyLong_FromLongLong(shape->At(idx)); END_HANDLE_ERRORS @@ -370,19 +395,6 @@ static PyObject* PyTensorObject_matmul(PyObject* self, PyObject* args, PyObject* END_HANDLE_ERRORS } -static PyObject* PyTensorObject_sub_(PyObject* self, PyObject* args, PyObject* kwargs) { - HANDLE_ERRORS - PyObject* other = NULL; - static const char* keywords[2] = {"other", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O:sub_", const_cast(keywords), &other)) { - return NULL; - } - PyObject* result = PyTensorObject_nb_inplace_sub(self, other); - if (PyErr_Occurred()) { throw py::error_already_set(); } - return result; - END_HANDLE_ERRORS -} - static PyObject* PyTensorObject_reshape(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_ERRORS PyObject* shape = args; @@ -411,6 +423,143 @@ static PyObject* PyTensorObject_reshape_as(PyObject* self, PyObject* args, PyObj END_HANDLE_ERRORS } +static PyObject* PyTensorObject_cpu(PyObject* self, PyObject* unused) { + HANDLE_ERRORS + Optional device = "cpu"; + return PyTensor_New(ASSERT_PTR(functional::To(PyTensor_Unpack(self), device, NullOpt, false))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_cuda(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + PyObject* device_obj = Py_None; + static const char* keywords[2] = {"device", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O:cuda", const_cast(keywords), + &device_obj)) { + return NULL; + } + auto tensor = PyTensor_Unpack(self); + if (functional::PyDeviceCheck(device_obj)) { + Optional> device = functional::PyUnpackDevice(device_obj); + return PyTensor_New(ASSERT_PTR(functional::To(tensor, device, NullOpt, false))); + } + Optional device_str; + if (device_obj == Py_None) { + device_str = "cuda"; + } else if (PyLong_Check(device_obj)) { + device_str = "cuda:" + std::to_string(PyLong_AsLongLong(device_obj)); + } + return PyTensor_New(ASSERT_PTR(functional::To(tensor, device_str, tensor->dtype(), false))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_var(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + PyObject* dim_obj = Py_None; + PyObject* unbiased_obj = Py_True; + PyObject* keepdim_obj = Py_False; + static const char* keywords[4] = {"dim", "unbiased", "keepdim", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OO!O!:var", const_cast(keywords), + &dim_obj, &PyBool_Type, &unbiased_obj, &PyBool_Type, + &keepdim_obj)) { + return NULL; + } + bool unbiased = unbiased_obj == Py_True; + bool keepdim = keepdim_obj == Py_True; + CHECK_OR_THROW(dim_obj == Py_None || PyLong_Check(dim_obj) + || functional::PyLongSequenceCheck(dim_obj)) + << Error::TypeError() << "var(): argument 'dim' must be int32 list, not " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(dim_obj))); + auto tensor = PyTensor_Unpack(self); + if (dim_obj == Py_None) { + return PyTensor_New(ASSERT_PTR(functional::Variance(tensor, NullOpt, unbiased, keepdim))); + } + std::vector dim; + if (PyLong_Check(dim_obj)) { + dim.emplace_back(static_cast(PyLong_AsLong(dim_obj))); + return PyTensor_New(ASSERT_PTR(functional::Variance(tensor, dim, unbiased, keepdim))); + } + dim = functional::PyUnpackLongSequence(dim_obj); + return PyTensor_New(ASSERT_PTR(functional::Variance(tensor, dim, unbiased, keepdim))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_std(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + PyObject* dim_obj = Py_None; + PyObject* unbiased_obj = Py_True; + PyObject* keepdim_obj = 
Py_False; + static const char* keywords[4] = {"dim", "unbiased", "keepdim", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OO!O!:std", const_cast(keywords), + &dim_obj, &PyBool_Type, &unbiased_obj, &PyBool_Type, + &keepdim_obj)) { + return NULL; + } + bool unbiased = unbiased_obj == Py_True; + bool keepdim = keepdim_obj == Py_True; + CHECK_OR_THROW(dim_obj == Py_None || PyLong_Check(dim_obj) + || functional::PyLongSequenceCheck(dim_obj)) + << Error::TypeError() << "std(): argument 'dim' must be int32 list, not " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(dim_obj))); + auto tensor = PyTensor_Unpack(self); + if (dim_obj == Py_None) { + return PyTensor_New( + ASSERT_PTR(functional::StandardDeviation(tensor, NullOpt, unbiased, keepdim))); + } + std::vector dim; + if (PyLong_Check(dim_obj)) { + dim.emplace_back(static_cast(PyLong_AsLong(dim_obj))); + return PyTensor_New(ASSERT_PTR(functional::StandardDeviation(tensor, dim, unbiased, keepdim))); + } + dim = functional::PyUnpackLongSequence(dim_obj); + return PyTensor_New(ASSERT_PTR(functional::StandardDeviation(tensor, dim, unbiased, keepdim))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_softplus(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + double beta = 1.0; + double threshold = 20.0; + static const char* keywords[3] = {"beta", "threshold", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "dd:softplus", const_cast(keywords), &beta, + &threshold)) { + return NULL; + } + return PyTensor_New(ASSERT_PTR(functional::Softplus(PyTensor_Unpack(self), beta, threshold))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_relu(PyObject* self, PyObject* unused) { + HANDLE_ERRORS + return PyTensor_New(ASSERT_PTR(functional::Relu(PyTensor_Unpack(self), false))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_relu_(PyObject* self, PyObject* unused) { + HANDLE_ERRORS + return PyTensor_New(ASSERT_PTR(functional::Relu(PyTensor_Unpack(self), true))); + END_HANDLE_ERRORS +} + +#define REDUCE_FUNC(func_name, bind_func, whole_func) \ + static PyObject* func_name(PyObject* self, PyObject* args, PyObject* kwargs) { \ + HANDLE_ERRORS \ + if ((args == NULL || PyTuple_Size(args) == 0) \ + && (kwargs == NULL || PyDict_Size(kwargs) == 0)) { \ + return PyTensor_New(ASSERT_PTR(whole_func(PyTensor_Unpack(self)))); \ + } \ + PyObjectPtr concat_args(concat_self(self, args)); \ + PyObject* result = bind_func(NULL, concat_args.get(), kwargs); \ + if (PyErr_Occurred()) { throw py::error_already_set(); } \ + return result; \ + END_HANDLE_ERRORS \ + } + +REDUCE_FUNC(PyTensorObject_any, functional::reduce_any, functional::ReduceAnyWhole) +REDUCE_FUNC(PyTensorObject_all, functional::reduce_all, functional::ReduceAllWhole) +REDUCE_FUNC(PyTensorObject_sum, functional::reduce_sum, functional::ReduceSumWhole) +REDUCE_FUNC(PyTensorObject_mean, functional::reduce_mean, functional::ReduceMeanWhole) + #define DATATYPE_FUNC(func_name, dtype) \ static PyObject* func_name(PyObject* self, PyObject* unused) { \ HANDLE_ERRORS \ @@ -421,6 +570,7 @@ static PyObject* PyTensorObject_reshape_as(PyObject* self, PyObject* args, PyObj DATATYPE_FUNC(PyTensorObject_int, DType::Int32()); DATATYPE_FUNC(PyTensorObject_long, DType::Int64()); +DATATYPE_FUNC(PyTensorObject_half, DType::Float16()); DATATYPE_FUNC(PyTensorObject_float, DType::Float()); DATATYPE_FUNC(PyTensorObject_double, DType::Double()); @@ -499,12 +649,23 @@ PyMethodDef PyTensorObject_extra_methods[] = { {"diagonal", 
(PyCFunction)PyTensorObject_diagonal, METH_VARARGS | METH_KEYWORDS, NULL}, {"addcmul", (PyCFunction)PyTensorObject_addcmul, METH_VARARGS | METH_KEYWORDS, NULL}, {"addcmul_", (PyCFunction)PyTensorObject_addcmul_, METH_VARARGS | METH_KEYWORDS, NULL}, - {"sub_", (PyCFunction)PyTensorObject_sub_, METH_VARARGS | METH_KEYWORDS, NULL}, {"matmul", (PyCFunction)PyTensorObject_matmul, METH_VARARGS | METH_KEYWORDS, NULL}, {"int", PyTensorObject_int, METH_NOARGS, NULL}, {"long", PyTensorObject_long, METH_NOARGS, NULL}, + {"half", PyTensorObject_half, METH_NOARGS, NULL}, {"float", PyTensorObject_float, METH_NOARGS, NULL}, {"double", PyTensorObject_double, METH_NOARGS, NULL}, + {"cpu", PyTensorObject_cpu, METH_NOARGS, NULL}, + {"cuda", (PyCFunction)PyTensorObject_cuda, METH_VARARGS | METH_KEYWORDS, NULL}, + {"var", (PyCFunction)PyTensorObject_var, METH_VARARGS | METH_KEYWORDS, NULL}, + {"std", (PyCFunction)PyTensorObject_std, METH_VARARGS | METH_KEYWORDS, NULL}, + {"softplus", (PyCFunction)PyTensorObject_softplus, METH_VARARGS | METH_KEYWORDS, NULL}, + {"relu", PyTensorObject_relu, METH_NOARGS, NULL}, + {"relu_", PyTensorObject_relu_, METH_NOARGS, NULL}, + {"all", (PyCFunction)PyTensorObject_all, METH_VARARGS | METH_KEYWORDS, NULL}, + {"any", (PyCFunction)PyTensorObject_any, METH_VARARGS | METH_KEYWORDS, NULL}, + {"sum", (PyCFunction)PyTensorObject_sum, METH_VARARGS | METH_KEYWORDS, NULL}, + {"mean", (PyCFunction)PyTensorObject_mean, METH_VARARGS | METH_KEYWORDS, NULL}, // macro DIRECT_PASS_FUNC {"floor_divide", (PyCFunction)PyTensorObject_floor_divide, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -515,7 +676,6 @@ PyMethodDef PyTensorObject_extra_methods[] = { {"div_", (PyCFunction)PyTensorObject_div_, METH_VARARGS | METH_KEYWORDS, NULL}, {"mul", (PyCFunction)PyTensorObject_mul, METH_VARARGS | METH_KEYWORDS, NULL}, {"mul_", (PyCFunction)PyTensorObject_mul_, METH_VARARGS | METH_KEYWORDS, NULL}, - {"sub", (PyCFunction)PyTensorObject_sub, METH_VARARGS | METH_KEYWORDS, NULL}, {"fmod", (PyCFunction)PyTensorObject_fmod, METH_VARARGS | METH_KEYWORDS, NULL}, {"logical_and", (PyCFunction)PyTensorObject_logical_and, METH_VARARGS | METH_KEYWORDS, NULL}, {"logical_or", (PyCFunction)PyTensorObject_logical_or, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -524,6 +684,34 @@ PyMethodDef PyTensorObject_extra_methods[] = { {"ne", (PyCFunction)PyTensorObject_ne, METH_VARARGS | METH_KEYWORDS, NULL}, {"lt", (PyCFunction)PyTensorObject_lt, METH_VARARGS | METH_KEYWORDS, NULL}, {"le", (PyCFunction)PyTensorObject_le, METH_VARARGS | METH_KEYWORDS, NULL}, + {"clip", (PyCFunction)PyTensorObject_clip, METH_VARARGS | METH_KEYWORDS, NULL}, + {"clip_", (PyCFunction)PyTensorObject_clip_, METH_VARARGS | METH_KEYWORDS, NULL}, + {"clamp", (PyCFunction)PyTensorObject_clamp, METH_VARARGS | METH_KEYWORDS, NULL}, + {"clamp_", (PyCFunction)PyTensorObject_clamp_, METH_VARARGS | METH_KEYWORDS, NULL}, + {"flatten", (PyCFunction)PyTensorObject_flatten, METH_VARARGS | METH_KEYWORDS, NULL}, + {"in_top_k", (PyCFunction)PyTensorObject_in_top_k, METH_VARARGS | METH_KEYWORDS, NULL}, + {"index_select", (PyCFunction)PyTensorObject_index_select, METH_VARARGS | METH_KEYWORDS, NULL}, + {"maximum", (PyCFunction)PyTensorObject_maximum, METH_VARARGS | METH_KEYWORDS, NULL}, + {"minimum", (PyCFunction)PyTensorObject_minimum, METH_VARARGS | METH_KEYWORDS, NULL}, + {"tril", (PyCFunction)PyTensorObject_tril, METH_VARARGS | METH_KEYWORDS, NULL}, + {"triu", (PyCFunction)PyTensorObject_triu, METH_VARARGS | METH_KEYWORDS, NULL}, + {"softmax", (PyCFunction)PyTensorObject_softmax, 
METH_VARARGS | METH_KEYWORDS, NULL}, + {"log_softmax", (PyCFunction)PyTensorObject_log_softmax, METH_VARARGS | METH_KEYWORDS, NULL}, + {"roll", (PyCFunction)PyTensorObject_roll, METH_VARARGS | METH_KEYWORDS, NULL}, + {"unbind", (PyCFunction)PyTensorObject_unbind, METH_VARARGS | METH_KEYWORDS, NULL}, + {"squeeze", (PyCFunction)PyTensorObject_squeeze, METH_VARARGS | METH_KEYWORDS, NULL}, + {"swapaxes", (PyCFunction)PyTensorObject_swapaxes, METH_VARARGS | METH_KEYWORDS, NULL}, + {"amax", (PyCFunction)PyTensorObject_amax, METH_VARARGS | METH_KEYWORDS, NULL}, + {"swapdims", (PyCFunction)PyTensorObject_swapdims, METH_VARARGS | METH_KEYWORDS, NULL}, + {"unfold", (PyCFunction)PyTensorObject_unfold, METH_VARARGS | METH_KEYWORDS, NULL}, + {"unsqueeze", (PyCFunction)PyTensorObject_unsqueeze, METH_VARARGS | METH_KEYWORDS, NULL}, + {"max", (PyCFunction)PyTensorObject_max, METH_VARARGS | METH_KEYWORDS, NULL}, + {"min", (PyCFunction)PyTensorObject_min, METH_VARARGS | METH_KEYWORDS, NULL}, + {"median", (PyCFunction)PyTensorObject_median, METH_VARARGS | METH_KEYWORDS, NULL}, + {"pow", (PyCFunction)PyTensorObject_pow, METH_VARARGS | METH_KEYWORDS, NULL}, + {"chunk", (PyCFunction)PyTensorObject_chunk, METH_VARARGS | METH_KEYWORDS, NULL}, + {"narrow", (PyCFunction)PyTensorObject_narrow, METH_VARARGS | METH_KEYWORDS, NULL}, + {"masked_fill", (PyCFunction)PyTensorObject_masked_fill, METH_VARARGS | METH_KEYWORDS, NULL}, // macro UNARY_METHOD {"abs", PyTensorObject_abs, METH_NOARGS, NULL}, diff --git a/oneflow/api/python/framework/variable_tensor_mgr.cpp b/oneflow/api/python/framework/variable_tensor_mgr.cpp index 1560a16574d..1d9b0f64b47 100644 --- a/oneflow/api/python/framework/variable_tensor_mgr.cpp +++ b/oneflow/api/python/framework/variable_tensor_mgr.cpp @@ -26,6 +26,7 @@ namespace oneflow { ONEFLOW_API_PYBIND11_MODULE("", m) { m.def("FillVariableTensorMgr", &FillVariableTensorMgr); m.def("DumpVariableTensorMgr", &DumpVariableTensorMgr); + m.def("ClearVariableTensorMgr", &ClearVariableTensorMgr); } } // namespace oneflow diff --git a/oneflow/api/python/functional/tensor_api.cpp b/oneflow/api/python/functional/tensor_api.cpp index 99caa223100..974edc7edbc 100644 --- a/oneflow/api/python/functional/tensor_api.cpp +++ b/oneflow/api/python/functional/tensor_api.cpp @@ -239,7 +239,12 @@ class LocalTensorSharedNumpyDataFunctor { if (!PyArray_IS_C_CONTIGUOUS(array)) { OF_LOG_ONCE(LOG(WARNING) << "OneFlow don't support non-contiguous array now, " "and we will copy the array to a contiguous one."); + // PyArray_GETCONTIGUOUS will return a reference if array is already contiguous, + // otherwise return a (contiguous) copy of the array. 
+ // Note: Increment the reference count for array occurs whether the array is continuous or not array = PyArray_GETCONTIGUOUS(array); + } else { + Py_INCREF(obj); } // Build TensorMeta @@ -264,13 +269,12 @@ class LocalTensorSharedNumpyDataFunctor { auto tensor_meta = std::make_shared(shape, strides, data_type, device, 0); // Build TensorBuffer - const auto& Free = [obj](char* dptr) { + const auto& Free = [array](char* dptr) { CHECK_JUST(Global::Get()->WithScopedAcquire([&]() -> Maybe { - Py_DECREF(obj); + Py_DECREF(array); return Maybe::Ok(); })); }; - Py_INCREF(obj); // make TensorBuffer hold ndarray void* data_ptr = PyArray_DATA(array); auto array_size_in_bytes = PyArray_NBYTES(array); auto tensor_data = std::make_shared(); diff --git a/oneflow/core/autograd/gradient_funcs/broadcast_binary_ops.cpp b/oneflow/core/autograd/gradient_funcs/broadcast_binary_ops.cpp index c0eaa737e72..00580d213d9 100644 --- a/oneflow/core/autograd/gradient_funcs/broadcast_binary_ops.cpp +++ b/oneflow/core/autograd/gradient_funcs/broadcast_binary_ops.cpp @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/common/container_util.h" #include "oneflow/core/framework/op_expr_grad_function.h" #include "oneflow/core/framework/op_builder.h" #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" @@ -269,19 +270,27 @@ class BroadcastMinMax : public BroadcastBinaryGrad { const auto& x_shape = *(x->shape()); const Shape& left_extended_x_shape = CreateLeftExtendedShape(ShapeView(x_shape), out_shape.NumAxes()); - const AxisVector& broadcast_axis_vec = left_extended_x_shape.Axes4BroadcastTo(out_shape); - const std::vector x_axis = - std::vector{broadcast_axis_vec.begin(), broadcast_axis_vec.end()}; - broad_x_ = JUST(functional::BroadcastLike(x, out_grads.at(0), x_axis)); + if (left_extended_x_shape == out_shape) { + broad_x_ = JUST(functional::ReshapeLike(x, JUST(VectorAt(out_grads, 0)))); + } else { + const AxisVector& broadcast_axis_vec = left_extended_x_shape.Axes4BroadcastTo(out_shape); + const std::vector x_axis = + std::vector{broadcast_axis_vec.begin(), broadcast_axis_vec.end()}; + broad_x_ = JUST(functional::BroadcastLike(x, JUST(VectorAt(out_grads, 0)), x_axis)); + } } if (ctx->broadcast_y) { const auto& y_shape = *(y->shape()); const Shape& left_extended_y_shape = CreateLeftExtendedShape(ShapeView(y_shape), out_shape.NumAxes()); - const AxisVector& broadcast_axis_vec = left_extended_y_shape.Axes4BroadcastTo(out_shape); - const std::vector y_axis = - std::vector{broadcast_axis_vec.begin(), broadcast_axis_vec.end()}; - broad_y_ = JUST(functional::BroadcastLike(y, out_grads.at(0), y_axis)); + if (left_extended_y_shape == out_shape) { + broad_y_ = JUST(functional::ReshapeLike(y, JUST(VectorAt(out_grads, 0)))); + } else { + const AxisVector& broadcast_axis_vec = left_extended_y_shape.Axes4BroadcastTo(out_shape); + const std::vector y_axis = + std::vector{broadcast_axis_vec.begin(), broadcast_axis_vec.end()}; + broad_y_ = JUST(functional::BroadcastLike(y, JUST(VectorAt(out_grads, 0)), y_axis)); + } } const auto& broad_grads = JUST(elementwise_grad_functor_(out_grads.at(0), broad_x_, broad_y_)); diff --git a/oneflow/core/autograd/gradient_funcs/cublas_fused_mlp.cpp b/oneflow/core/autograd/gradient_funcs/cublas_fused_mlp.cpp index 0a6fd1dbfa5..e0ae114b140 100644 --- a/oneflow/core/autograd/gradient_funcs/cublas_fused_mlp.cpp +++ 
b/oneflow/core/autograd/gradient_funcs/cublas_fused_mlp.cpp @@ -23,7 +23,7 @@ limitations under the License. #include "oneflow/core/common/container_util.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/functional/functional_api.yaml.h" -#if CUDA_VERSION >= 11040 +#if CUDA_VERSION >= 11060 namespace oneflow { @@ -94,6 +94,20 @@ Maybe CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx, const TensorTuple& out_grads, TensorTuple* in_grads) const { int32_t weight_num = ctx->weight_num; in_grads->resize(1 + 2 * weight_num); + std::shared_ptr last_bias_dy = JUST(VectorAt(out_grads, 0)); + + if (!ctx->skip_final_activation) { + // step1: use dy and final output to get last layer's relu grad. + last_bias_dy = JUST(functional::ReluGrad(JUST(VectorAt(out_grads, 0)), + JUST(VectorAt(ctx->SavedTensors(), 1 + weight_num)))); + } + + // step2: use reduce_sum to get last layer's bias grad. + std::vector reduce_axes_vec{0}; + if (JUST(VectorAt(ctx->biases_requires_grad, weight_num - 1))) { + JUST(VectorAt(*in_grads, 2 * weight_num)) = + JUST(functional::ReduceSum(last_bias_dy, reduce_axes_vec, false)); + } TensorTuple hiddens(weight_num - 1); TensorTuple weights(weight_num); @@ -101,7 +115,6 @@ Maybe CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx, TensorTuple dgrad(weight_num); std::shared_ptr x = JUST(VectorAt(ctx->SavedTensors(), 0)); - std::shared_ptr out = JUST(VectorAt(ctx->SavedTensors(), 1 + weight_num)); for (int32_t i = 0; i < weight_num; ++i) { weights[i] = JUST(VectorAt(ctx->SavedTensors(), 1 + i)); @@ -115,33 +128,6 @@ Maybe CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx, hiddens[i] = JUST(VectorAt(ctx->SavedTensors(), i + 2 + 2 * weight_num)); } - std::shared_ptr last_bias_dy = JUST(VectorAt(out_grads, 0)); - - if (!ctx->skip_final_activation) { - // step1: use dy and final output to get last layer's relu grad. - last_bias_dy = JUST(functional::ReluGrad(JUST(VectorAt(out_grads, 0)), out)); - } - - const bool last_layer_weight_requires_grad = - JUST(VectorAt(ctx->weights_requires_grad, weight_num - 1)); - const bool last_layer_bias_requires_grad = - JUST(VectorAt(ctx->biases_requires_grad, weight_num - 1)); - - // For last layer, we use CublasMatmulBiasAddGrad to get wgrad and b grad. - if ((last_layer_weight_requires_grad || last_layer_bias_requires_grad)) { - // If there is only 1 layer, we use CublasMatmulBiasAddGrad to calculate first layer's dw. - std::shared_ptr last_layer_x = x; - if (weight_num != 1) { last_layer_x = JUST(VectorAt(hiddens, weight_num - 2)); } - const auto& last_layer_wgrad_bgrad = - JUST(functional::CublasMatmulBiasAddGrad(last_bias_dy, last_layer_x)); - if (last_layer_weight_requires_grad) { - JUST(VectorAt(*in_grads, weight_num)) = JUST(VectorAt(*last_layer_wgrad_bgrad, 0)); - } - if (last_layer_bias_requires_grad) { - JUST(VectorAt(*in_grads, 2 * weight_num)) = JUST(VectorAt(*last_layer_wgrad_bgrad, 1)); - } - } - std::shared_ptr cublas_dy = last_bias_dy; for (int32_t hidden_layer_idx = weight_num - 1; hidden_layer_idx > 0; hidden_layer_idx--) { // If it is final layer, we use out_grads[0] as dy. 
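For reference, the restructured CublasFusedMLP backward above amounts to: take the last layer's dy (passed through ReluGrad when the final activation is not skipped), get its bias grad with a reduce_sum over the batch axis, then for each layer compute dW_i = dy_i^T @ h_{i-1} and propagate dy_{i-1} = dy_i @ W_i through the saved ReLU masks, ending with dx = dy_1 @ W_1. A small NumPy sanity sketch of a two-layer case with the final activation skipped, using the (out_features, in_features) weight layout; this is illustrative math only, not the cuBLAS kernels:

    import numpy as np

    # Reference backward for y = relu(x @ W1.T + b1) @ W2.T + b2 (final relu skipped),
    # in the same order as the fused version: last bias grad first, then dW/db/dx.
    def fused_mlp_backward(dy, x, W1, b1, W2):
        h_pre = x @ W1.T + b1            # hidden pre-activation
        h = np.maximum(h_pre, 0.0)       # saved hidden activation
        db2 = dy.sum(axis=0)             # reduce_sum over axis 0, as in step2 above
        dW2 = dy.T @ h                   # MatMul(dy, hidden, transpose_a=True)
        dh = (dy @ W2) * (h_pre > 0)     # dgrad fused with the hidden relu grad
        db1 = dh.sum(axis=0)
        dW1 = dh.T @ x
        dx = dh @ W1                     # MatMul(last_dy, W1)
        return dx, dW1, db1, dW2, db2

    rng = np.random.default_rng(0)
    x, dy = rng.standard_normal((4, 3)), rng.standard_normal((4, 5))
    W1, b1, W2 = rng.standard_normal((8, 3)), rng.standard_normal(8), rng.standard_normal((5, 8))
    print([g.shape for g in fused_mlp_backward(dy, x, W1, b1, W2)])

After this change the last layer's wgrad and bgrad come from a plain MatMul and ReduceSum rather than CublasMatmulBiasAddGrad, which is what the dW2/db2 lines above mirror.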
@@ -154,7 +140,7 @@ Maybe CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx, */ const auto& matmul_relu_bias_bgrad = JUST(functional::CublasBiasAddReluMatmulGrad( cublas_dy, JUST(VectorAt(weights, hidden_layer_idx)), - JUST(VectorAt(cublas_auxs, hidden_layer_idx - 1)))); + JUST(VectorAt(cublas_auxs, hidden_layer_idx - 1)), /*alpha=*/1.0)); // dgrad dgrad.at(hidden_layer_idx) = matmul_relu_bias_bgrad->at(0); // NOLINT @@ -164,10 +150,8 @@ Maybe CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx, JUST(VectorAt(*in_grads, weight_num + hidden_layer_idx)) = matmul_relu_bias_bgrad->at(1); // NOLINT } - // dw, need to skip final layer, cause final layer's wgrad has used CublasMatmulBiasAddGrad to - // calculate. - if (JUST(VectorAt(ctx->weights_requires_grad, hidden_layer_idx)) - && hidden_layer_idx != weight_num - 1) { + // dw + if (JUST(VectorAt(ctx->weights_requires_grad, hidden_layer_idx))) { JUST(VectorAt(*in_grads, (1 + hidden_layer_idx))) = JUST(functional::MatMul( cublas_dy, JUST(VectorAt(hiddens, hidden_layer_idx - 1)), true, false, 1.0)); } @@ -186,12 +170,10 @@ Maybe CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx, JUST(VectorAt(*in_grads, 0)) = JUST(functional::MatMul(last_dy, JUST(VectorAt(weights, 0)), false, false, 1.0)); } - if (JUST(VectorAt(ctx->weights_requires_grad, 0)) && weight_num >= 2) { - // If weight_num == 1, dw has been calculated by CublasMatmulBiasAddGrad, so we need to skip. + if (JUST(VectorAt(ctx->weights_requires_grad, 0))) { // dw: JUST(VectorAt(*in_grads, 1)) = - JUST(functional::MatMul(last_dy, JUST(VectorAt(ctx->SavedTensors(), 0)), true, false, - 1.0)); // use x instead just vectorat + JUST(functional::MatMul(last_dy, JUST(VectorAt(ctx->SavedTensors(), 0)), true, false, 1.0)); } return Maybe::Ok(); @@ -202,4 +184,4 @@ REGISTER_OP_EXPR_GRAD_FUNCTION("cublas_fused_mlp", CublasFusedMLP); } // namespace one } // namespace oneflow -#endif // CUDA_VERSION >= 11040 +#endif // CUDA_VERSION >= 11060 diff --git a/oneflow/core/autograd/gradient_funcs/fused_cross_interaction.cpp b/oneflow/core/autograd/gradient_funcs/fused_cross_interaction.cpp new file mode 100644 index 00000000000..378f93123b4 --- /dev/null +++ b/oneflow/core/autograd/gradient_funcs/fused_cross_interaction.cpp @@ -0,0 +1,117 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/maybe.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/op_builder.h" +#include "oneflow/core/framework/op_expr.h" +#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/common/container_util.h" + +namespace oneflow { +namespace one { + +struct FusedCrossFeatureInteractionInterpState : public AutoGradCaptureState { + bool x_requires_grad = true; + bool weight_requires_grad = true; + bool x0_requires_grad = true; + bool bias_requires_grad = true; + size_t x_idx = 0; + size_t bias_idx = 0; + size_t weight_idx = 0; + size_t x0_idx = 0; + size_t matmul_result_idx = 0; + std::string interaction_mode; +}; + +class FusedCrossFeatureInteraction + : public OpExprGradFunction { + public: + Maybe Init(const OpExpr& op) override { + const auto* fw_op_expr = dynamic_cast(&op); + CHECK_NOTNULL_OR_RETURN(fw_op_expr) << "fw_op_expr should not be None. "; + base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); + return Maybe::Ok(); + } + + Maybe Capture(FusedCrossFeatureInteractionInterpState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const override { + CHECK_EQ_OR_RETURN(inputs.size(), 4) << "Input size should be equal to 4. "; + ComposedAttrMap composed_attrs(attrs, base_attrs_); + ctx->interaction_mode = JUST(composed_attrs.GetAttr("interaction_mode")); + ctx->x_requires_grad = JUST(oneflow::VectorAt(inputs, 0))->requires_grad(); + ctx->weight_requires_grad = JUST(oneflow::VectorAt(inputs, 1))->requires_grad(); + ctx->x_requires_grad = JUST(oneflow::VectorAt(inputs, 2))->requires_grad(); + ctx->weight_requires_grad = JUST(oneflow::VectorAt(inputs, 3))->requires_grad(); + ctx->x_idx = ctx->SaveTensorForBackward(JUST(oneflow::VectorAt(inputs, 0))); + ctx->weight_idx = ctx->SaveTensorForBackward(JUST(oneflow::VectorAt(inputs, 1))); + ctx->x0_idx = ctx->SaveTensorForBackward(JUST(oneflow::VectorAt(inputs, 2))); + if (ctx->interaction_mode == "matrix") { + ctx->bias_idx = ctx->SaveTensorForBackward(JUST(oneflow::VectorAt(inputs, 3))); + } + ctx->matmul_result_idx = ctx->SaveTensorForBackward(JUST(oneflow::VectorAt(outputs, 1))); + + return Maybe::Ok(); + } + + Maybe Apply(const FusedCrossFeatureInteractionInterpState* ctx, + const TensorTuple& out_grads, TensorTuple* in_grads) const override { + CHECK_EQ_OR_RETURN(out_grads.size(), 2) << "Out grads size should be equal to 2. "; + std::shared_ptr grads; + in_grads->resize(4); + if (ctx->interaction_mode == "vector") { + grads = JUST(functional::FusedCrossFeatureInteractionV1Grad( + JUST(oneflow::VectorAt(out_grads, 0)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->weight_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->x_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->x0_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->matmul_result_idx)))); + } else if (ctx->interaction_mode == "matrix") { + grads = JUST(functional::FusedCrossFeatureInteractionV2Grad( + JUST(oneflow::VectorAt(out_grads, 0)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->weight_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->bias_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->x_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->x0_idx)), + JUST(oneflow::VectorAt(ctx->SavedTensors(), ctx->matmul_result_idx)))); + } else { + UNIMPLEMENTED_THEN_RETURN() << "Interaction mode only support `vector` and `matrix`. 
"; + } + + if (ctx->x_requires_grad) { + JUST(oneflow::VectorAt(*in_grads, 0)) = JUST(oneflow::VectorAt(*grads, 0)); + } + if (ctx->weight_requires_grad) { + JUST(oneflow::VectorAt(*in_grads, 1)) = JUST(oneflow::VectorAt(*grads, 1)); + } + if (ctx->x0_requires_grad) { + JUST(oneflow::VectorAt(*in_grads, 2)) = JUST(oneflow::VectorAt(*grads, 2)); + } + if (ctx->bias_requires_grad) { + JUST(oneflow::VectorAt(*in_grads, 3)) = JUST(oneflow::VectorAt(*grads, 3)); + } + + return Maybe::Ok(); + } + + private: + AttrMap base_attrs_; +}; + +REGISTER_OP_EXPR_GRAD_FUNCTION("fused_cross_feature_interaction", FusedCrossFeatureInteraction); + +} // namespace one +} // namespace oneflow diff --git a/oneflow/core/autograd/gradient_funcs/fused_matmul_bias_add_relu_dropout.cpp b/oneflow/core/autograd/gradient_funcs/fused_matmul_bias_add_relu_dropout.cpp new file mode 100644 index 00000000000..32e38b0da18 --- /dev/null +++ b/oneflow/core/autograd/gradient_funcs/fused_matmul_bias_add_relu_dropout.cpp @@ -0,0 +1,205 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/error.pb.h" +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/maybe.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/op_builder.h" +#include "oneflow/core/framework/op_expr.h" +#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/common/container_util.h" +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/functional/functional_api.yaml.h" +#if CUDA_VERSION >= 11060 + +namespace oneflow { + +namespace one { + +struct FusedMatmulBiasAddReluDropoutCaptureState : public AutoGradCaptureState { + int32_t weight_num = 0; + bool skip_final_activation = false; + bool x_requires_grad = false; + std::vector weights_requires_grad; + std::vector biases_requires_grad; + std::vector dropout_rate_list; +}; + +class FusedMatmulBiasAddReluDropout + : public OpExprGradFunction { + public: + Maybe Init(const OpExpr& op) override; + Maybe Capture(FusedMatmulBiasAddReluDropoutCaptureState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const override; + Maybe Apply(const FusedMatmulBiasAddReluDropoutCaptureState* ctx, + const TensorTuple& out_grads, TensorTuple* in_grads) const override; + + protected: + AttrMap base_attrs_; +}; + +Maybe FusedMatmulBiasAddReluDropout::Init(const OpExpr& op) { + const UserOpExpr* fw_op_expr = dynamic_cast(&op); + CHECK_NOTNULL_OR_RETURN(fw_op_expr); + base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); + return Maybe::Ok(); +} + +Maybe FusedMatmulBiasAddReluDropout::Capture(FusedMatmulBiasAddReluDropoutCaptureState* ctx, + const TensorTuple& inputs, + const TensorTuple& outputs, + const AttrMap& attrs) const { + CHECK_OR_RETURN(inputs.size() % 2 == 1) << "Both weight and bias should be passed together. 
"; + int32_t weight_num = (inputs.size() - 1) / 2; + ctx->weight_num = weight_num; + ctx->x_requires_grad = JUST(VectorAt(inputs, 0))->requires_grad(); + ctx->weights_requires_grad.resize(weight_num); + ctx->biases_requires_grad.resize(weight_num); + + for (int32_t i = 0; i < weight_num; i++) { + ctx->weights_requires_grad.at(i) = inputs.at(i + 1)->requires_grad(); // NOLINT + ctx->biases_requires_grad.at(i) = inputs.at(i + 1 + weight_num)->requires_grad(); // NOLINT + } + + ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 0))); // x. idx_sum:1 + for (int32_t i = 0; i < weight_num; i++) { + ctx->SaveTensorForBackward(JUST(VectorAt(inputs, i + 1))); // weights. idx_sum:1+w + } + + ctx->SaveTensorForBackward(JUST(VectorAt(outputs, 0))); // final layers output. idx_sum:2+w + for (int32_t i = 0; i < weight_num; i++) { + ctx->SaveTensorForBackward( + JUST(VectorAt(outputs, i + 1))); // cublas aux. need minus 1. idx_sum:2+2w + } + for (int32_t i = 0; i < weight_num - 1; i++) { + ctx->SaveTensorForBackward(JUST(VectorAt(outputs, i + 1 + weight_num))); // hidden. + } + + ComposedAttrMap composed_attrs(attrs, base_attrs_); + ctx->skip_final_activation = JUST(composed_attrs.GetAttr("skip_final_activation")); + ctx->dropout_rate_list = JUST(composed_attrs.GetAttr>("dropout_rate_list")); + + return Maybe::Ok(); +} + +Maybe FusedMatmulBiasAddReluDropout::Apply( + const FusedMatmulBiasAddReluDropoutCaptureState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const { + int32_t weight_num = ctx->weight_num; + in_grads->resize(1 + 2 * weight_num); + + TensorTuple hiddens(weight_num - 1); + TensorTuple weights(weight_num); + TensorTuple cublas_auxs(weight_num); + TensorTuple dgrad(weight_num); + + std::shared_ptr x = JUST(VectorAt(ctx->SavedTensors(), 0)); + std::shared_ptr out = JUST(VectorAt(ctx->SavedTensors(), 1 + weight_num)); + + for (int32_t i = 0; i < weight_num; ++i) { + weights[i] = JUST(VectorAt(ctx->SavedTensors(), 1 + i)); + } + + for (int32_t i = 0; i < weight_num; ++i) { + cublas_auxs[i] = JUST(VectorAt(ctx->SavedTensors(), i + 2 + weight_num)); + } + + for (int32_t i = 0; i < weight_num - 1; ++i) { + hiddens[i] = JUST(VectorAt(ctx->SavedTensors(), i + 2 + 2 * weight_num)); + } + float rate = ctx->dropout_rate_list.at(weight_num - 1); + float scale = 0.0f; + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + + /* + step1: use dy and mask to get last layer's dropout + relu grad. + Because curand_uniform distribution is (0.0, 1.0], so the value after relu will be write into mask + too. And DropoutGrad use this mask to generate grad, it will generate dropout and relu grad + simultaneously. + */ + std::shared_ptr last_bias_dy = JUST(VectorAt(out_grads, 0)); + if (!ctx->skip_final_activation || rate != 0.0f) { + last_bias_dy = JUST(functional::FusedReluDropoutGrad(JUST(VectorAt(out_grads, 0)), + cublas_auxs[weight_num - 1], scale)); + } + + // step2: use reduce_sum to get last layer's bias grad. + std::vector reduce_axes_vec{0}; + if (JUST(VectorAt(ctx->biases_requires_grad, weight_num - 1))) { + JUST(VectorAt(*in_grads, 2 * weight_num)) = + JUST(functional::ReduceSum(last_bias_dy, reduce_axes_vec, false)); + } + + std::shared_ptr cublas_dy = last_bias_dy; + for (int32_t hidden_layer_idx = weight_num - 1; hidden_layer_idx > 0; hidden_layer_idx--) { + // If it is final layer, we use out_grads[0] as dy. 
+ if (hidden_layer_idx != weight_num - 1) { + cublas_dy = JUST(VectorAt(dgrad, hidden_layer_idx + 1)); + } + rate = ctx->dropout_rate_list.at(hidden_layer_idx - 1); + scale = 1.0; + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + /* + Here we use cublas to compute bias + relu + matmul grad. + Then use Matmul to compute weight grad. + */ + const auto& matmul_relu_bias_bgrad = JUST(functional::CublasBiasAddReluMatmulGrad( + cublas_dy, JUST(VectorAt(weights, hidden_layer_idx)), + JUST(VectorAt(cublas_auxs, hidden_layer_idx - 1)), /*alpha=*/scale)); + + // dgrad + dgrad.at(hidden_layer_idx) = matmul_relu_bias_bgrad->at(0); // NOLINT + + if (JUST(VectorAt(ctx->biases_requires_grad, (hidden_layer_idx - 1)))) { + // dbias + JUST(VectorAt(*in_grads, weight_num + hidden_layer_idx)) = + matmul_relu_bias_bgrad->at(1); // NOLINT + } + // dw + if (JUST(VectorAt(ctx->weights_requires_grad, hidden_layer_idx))) { + JUST(VectorAt(*in_grads, (1 + hidden_layer_idx))) = JUST(functional::MatMul( + cublas_dy, JUST(VectorAt(hiddens, hidden_layer_idx - 1)), true, false, 1.0)); + } + } + + // For the first layer, we need to use 2 matmul to get grads. + std::shared_ptr last_dy; + if (weight_num != 1) { + last_dy = JUST(VectorAt(dgrad, 1)); + } else { + last_dy = last_bias_dy; + } + + if (ctx->x_requires_grad) { + // dx: + JUST(VectorAt(*in_grads, 0)) = + JUST(functional::MatMul(last_dy, JUST(VectorAt(weights, 0)), false, false, 1.0)); + } + if (JUST(VectorAt(ctx->weights_requires_grad, 0))) { + // dw: + JUST(VectorAt(*in_grads, 1)) = + JUST(functional::MatMul(last_dy, JUST(VectorAt(ctx->SavedTensors(), 0)), true, false, 1.0)); + } + + return Maybe::Ok(); +} + +REGISTER_OP_EXPR_GRAD_FUNCTION("fused_matmul_bias_add_relu_dropout", FusedMatmulBiasAddReluDropout); + +} // namespace one + +} // namespace oneflow +#endif // CUDA_VERSION >= 11060 diff --git a/oneflow/core/autograd/gradient_funcs/normalization.cpp b/oneflow/core/autograd/gradient_funcs/normalization.cpp index e336edf519d..c12fcb60442 100644 --- a/oneflow/core/autograd/gradient_funcs/normalization.cpp +++ b/oneflow/core/autograd/gradient_funcs/normalization.cpp @@ -136,15 +136,14 @@ class NormalizationGrad : public OpExprGradFunction::Ok(); } - DimVector dim_vec; + Shape shape; for (int i = 0; i < x->shape()->NumAxes(); ++i) { if (i != ctx->axis) { - dim_vec.emplace_back(1); + shape.emplace_back(1); } else { - dim_vec.emplace_back(x->shape()->At(ctx->axis)); + shape.emplace_back(x->shape()->At(ctx->axis)); } } - Shape shape(dim_vec); const auto& reshaped_gamma = JUST(functional::Reshape(gamma, shape)); const auto& reshaped_inv_variance = JUST(functional::Reshape(inv_variance, shape)); diff --git a/oneflow/core/common/cached_caller.h b/oneflow/core/common/cached_caller.h index 9d8817d6e51..17ad41ac8fd 100644 --- a/oneflow/core/common/cached_caller.h +++ b/oneflow/core/common/cached_caller.h @@ -24,6 +24,17 @@ limitations under the License. #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/tuple_hash.h" +// gcc 11 falsely reports error: +// ‘void operator delete(void*, std::size_t)’ called on unallocated object ‘cache’ +// However, `DeleteAndClear` is only called after `cache` is allocated in +// if (cache == nullptr) block. +// The reason not to use #pragma GCC diagnostic push/pop is that gcc reports +// the error on the caller of `ThreadLocalCachedCall`. +// TODO: replace ThreadLocalCachedCall with ThreadLocalCached decorator? 
+#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 11 +#pragma GCC diagnostic ignored "-Wfree-nonheap-object" +#endif + namespace oneflow { template diff --git a/oneflow/core/common/error_util.cpp b/oneflow/core/common/error_util.cpp index 86e4b0bee92..89c8fe0e9d6 100644 --- a/oneflow/core/common/error_util.cpp +++ b/oneflow/core/common/error_util.cpp @@ -16,6 +16,7 @@ limitations under the License. #include #include "oneflow/core/common/error_util.h" #include "oneflow/core/common/util.h" +#include "oneflow/core/job/graph_scope_vars.h" namespace oneflow { @@ -97,7 +98,9 @@ std::string FormatFunctionOfStackFrame(const std::string& function) { // msg in stack frame Maybe FormatMsgOfStackFrame(std::string error_msg, bool is_last_stack_frame) { - if (!is_last_stack_frame) { error_msg = *JUST(ShortenMsg(error_msg)); } + const bool debug_mode = GetGraphDebugMode(); + // only shorten the message if it is not the last stack frame AND not in debug mode + if (!is_last_stack_frame && !debug_mode) { error_msg = *JUST(ShortenMsg(error_msg)); } // error_msg of last stack frame come from "<<" if (is_last_stack_frame) { error_msg = StripSpace(error_msg); } std::stringstream ss; diff --git a/oneflow/core/common/fixed_vector.h b/oneflow/core/common/fixed_vector.h deleted file mode 100644 index b3d1c98c827..00000000000 --- a/oneflow/core/common/fixed_vector.h +++ /dev/null @@ -1,277 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ -#define ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ - -#include -#include -#include -#include - -namespace oneflow { - -template -using RequireInputIter = typename std::enable_if< - std::is_convertible::iterator_category, - std::input_iterator_tag>::value>::type; - -template -class fixed_vector final { - public: - using value_type = T; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = T*; - using const_pointer = const T*; - using iterator = T*; - using const_iterator = const T*; - using reverse_iterator = std::reverse_iterator; - using const_reverse_iterator = std::reverse_iterator; - - fixed_vector() : size_(0) {} - explicit fixed_vector(size_t size) { assign(size, T()); } - explicit fixed_vector(size_t size, const T& val) { assign(size, val); } - template> - fixed_vector(InputIt first, InputIt last) { - assign(first, last); - } - fixed_vector(const fixed_vector& rhs) { *this = rhs; } - fixed_vector(fixed_vector&& rhs) { *this = std::move(rhs); } - fixed_vector(std::initializer_list rhs) { assign(rhs); } - ~fixed_vector() = default; - - fixed_vector& operator=(const fixed_vector& rhs) { - size_ = rhs.size(); - CheckSize(); - std::copy(rhs.begin(), rhs.end(), begin()); - return *this; - } - fixed_vector& operator=(fixed_vector&& rhs) noexcept { - size_ = rhs.size(); - CheckSize(); - std::copy(rhs.begin(), rhs.end(), begin()); - return *this; - } - fixed_vector& operator=(std::initializer_list ilist) { - size_ = ilist.size(); - assign(ilist); - return *this; - } - void assign(size_type count, const value_type& value) { - size_ = count; - CheckSize(); - std::fill(begin(), begin() + size_, value); - } - template> - void assign(InputIt first, InputIt last) { - size_ = last - first; - CheckSize(); - std::copy(first, last, begin()); - } - void assign(std::initializer_list ilist) { - size_ = ilist.size(); - CheckSize(); - std::copy(ilist.begin(), ilist.end(), begin()); - } - - reference at(size_type pos) { - CheckPos(pos); - return data_.at(pos); - } - const_reference at(size_type pos) const { - CheckPos(pos); - return data_.at(pos); - } - - reference operator[](size_type pos) { - CheckPos(pos); - return data_[pos]; - } - const_reference operator[](size_type pos) const { - CheckPos(pos); - return data_[pos]; - } - - reference front() { - CheckPos(0); - return data_.at(0); - } - const_reference front() const { - CheckPos(0); - return data_.at(0); - } - - reference back() { - CheckPos(0); - return data_.at(size_ - 1); - } - const_reference back() const { - CheckPos(0); - return data_.at(size_ - 1); - } - - T* data() noexcept { return data_.data(); } - const T* data() const noexcept { return data_.data(); } - - iterator begin() noexcept { return data_.data(); } - const_iterator begin() const noexcept { return data_.data(); } - const_iterator cbegin() const noexcept { return data_.data(); } - - iterator end() noexcept { return data_.data() + size_; } - const_iterator end() const noexcept { return data_.data() + size_; } - const_iterator cend() const noexcept { return data_.data() + size_; } - - reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); } - - reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - 
const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } - const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); } - - bool empty() const noexcept { return size_ == 0; } - - size_type size() const noexcept { return size_; } - - size_type max_size() const noexcept { return kMaxSize; } - - size_type capacity() const noexcept { return kMaxSize; } - - void clear() noexcept { size_ = 0; } - - iterator insert(iterator pos, const T& value) { - MoveNToEnd(pos, 1); - *pos = value; - return pos; - } - iterator insert(iterator pos, T&& value) { - MoveNToEnd(pos, 1); - *pos = std::move(value); - return pos; - } - iterator insert(iterator pos, size_type count, const T& value) { - MoveNToEnd(pos, count); - std::fill(pos, pos + count, value); - return pos; - } - template> - void insert(iterator pos, InputIt first, InputIt last) { - MoveNToEnd(pos, last - first); - std::copy(first, last, pos); - } - iterator insert(iterator pos, std::initializer_list ilist) { - MoveNToEnd(pos, ilist.size()); - std::copy(ilist.begin(), ilist.end(), pos); - return pos; - } - - template - iterator emplace(iterator pos, Args&&... args) { - MoveNToEnd(pos, 1); - new (&*pos) T(std::forward(args)...); - return pos; - } - - iterator erase(iterator pos) { - MoveNToBegin(pos + 1, 1); - return pos; - } - iterator erase(iterator first, iterator last) { - if (first >= last) { return last; } - MoveNToBegin(last, last - first); - return first; - } - - void push_back(const T& value) { insert(end(), value); } - void push_back(T&& value) { insert(end(), std::move(value)); } - void emplace_back(const T& value) { insert(end(), value); } - template - void emplace_back(Args&&... args) { - insert(end(), std::forward(args)...); - } - - void pop_back() { --size_; } - - void resize(size_type count) { resize(count, T()); } - void resize(size_type count, const value_type& value) { - if (count == size_) { return; } - if (count < size_) { - erase(begin() + count, end()); - return; - } - insert(end(), count - size_, value); - } - - void swap(fixed_vector& rhs) noexcept { - fixed_vector tmp; - tmp = rhs; - rhs = *this; - *this = tmp; - } - - bool operator==(const fixed_vector& rhs) const { - if (size() != rhs.size()) { return false; } - return std::equal(begin(), end(), rhs.begin()); - } - - bool operator!=(const fixed_vector& rhs) const { return !(*this == rhs); } - - bool operator>=(const fixed_vector& rhs) const { return !(*this < rhs); } - - bool operator<=(const fixed_vector& rhs) const { return !(*this > rhs); } - - bool operator>(const fixed_vector& rhs) const { - return std::lexicographical_compare(rhs.begin(), rhs.end(), begin(), end()); - } - - bool operator<(const fixed_vector& rhs) const { - return std::lexicographical_compare(begin(), end(), rhs.begin(), rhs.end()); - } - - private: - void CheckSize() const { CheckSize(size_); } - void CheckSize(size_t size) const { CHECK_LE(size, kMaxSize); } - void CheckPos(size_t pos) const { CHECK_LE(pos, size_); } - void MoveNToEnd(iterator first, size_t N) { - CheckSize(size_ + N); - iterator old_end = end(); - size_ += N; - iterator new_end = end(); - std::copy_backward(first, old_end, new_end); - } - void MoveNToBegin(iterator last, size_t N) { - CheckPos(last - N - begin()); - iterator old_end = end(); - size_ -= N; - std::copy(last, old_end, last - N); - } - - size_t size_; - std::array data_; -}; - -template -void swap(fixed_vector& lhs, fixed_vector& rhs) { - return lhs.swap(rhs); -} - -#define SHAPE_MAX_AXIS_SIZE 20 - -} // 
namespace oneflow - -#endif // ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ diff --git a/oneflow/core/common/fixed_vector_test.cpp b/oneflow/core/common/fixed_vector_test.cpp deleted file mode 100644 index cb79b7510e0..00000000000 --- a/oneflow/core/common/fixed_vector_test.cpp +++ /dev/null @@ -1,419 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/fixed_vector.h" -#include "gtest/gtest.h" -#include -#include - -namespace oneflow { - -namespace test { - -using FixedVec = fixed_vector; - -TEST(fixed_vector, constructor_0) { - FixedVec a(8); - ASSERT_EQ(a.size(), 8); -} - -TEST(fixed_vector, constructor_1) { - int value = 30; - FixedVec a(8, value); - ASSERT_TRUE(std::all_of(a.begin(), a.end(), [value](const int x) { return x == value; })); -} - -TEST(fixed_vector, constructor_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a(vec.begin(), vec.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_3) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a(b); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_4) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a(std::move(b)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_5) { - std::vector vec{1, 2, 3, 4}; - FixedVec a{1, 2, 3, 4}; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_0) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a; - a = b; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_1) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a; - a = std::move(b); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a = {1, 2, 3, 4}; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, assign_0) { - int value = 30; - FixedVec a; - a.assign(8, value); - ASSERT_TRUE(std::all_of(a.begin(), a.end(), [value](const int x) { return x == value; })); -} - -TEST(fixed_vector, assign_1) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a.assign(vec.begin(), vec.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, assign_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a.assign({1, 2, 3, 4}); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, const_at) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(a.at(0), value); -} - -TEST(fixed_vector, at) { - int value = 33; - FixedVec a{0}; - a.at(0) = value; - ASSERT_EQ(a.at(0), value); -} - -TEST(fixed_vector, const_front) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(a.front(), value); -} - -TEST(fixed_vector, front) { - int value = 33; - 
FixedVec a{0}; - a.front() = value; - ASSERT_EQ(a.front(), value); -} - -TEST(fixed_vector, const_back) { - int value = 33; - const FixedVec a{1, value}; - ASSERT_EQ(a.back(), value); -} - -TEST(fixed_vector, back) { - int value = 33; - FixedVec a{1, 0}; - a.back() = value; - ASSERT_EQ(a.back(), value); -} - -TEST(fixed_vector, const_data) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(*a.data(), value); -} - -TEST(fixed_vector, data) { - int value = 33; - FixedVec a{0}; - *a.data() = value; - ASSERT_EQ(*a.data(), value); -} - -TEST(fixed_vector, const_begin) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(*a.begin(), value); -} - -TEST(fixed_vector, begin) { - int value = 33; - FixedVec a{0}; - *a.begin() = value; - ASSERT_EQ(*a.begin(), value); -} - -TEST(fixed_vector, cbegin) { - int value = 33; - FixedVec a{value}; - ASSERT_EQ(*a.cbegin(), value); -} - -TEST(fixed_vector, const_end) { - const FixedVec a{0, 1, 2}; - ASSERT_EQ(a.begin() + a.size(), a.end()); -} - -TEST(fixed_vector, end) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.begin() + a.size(), a.end()); -} - -TEST(fixed_vector, cend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.cbegin() + a.size(), a.cend()); -} - -TEST(fixed_vector, const_rbegin) { - int value = 33; - const FixedVec a{0, value}; - ASSERT_EQ(*a.rbegin(), value); -} - -TEST(fixed_vector, rbegin) { - int value = 33; - FixedVec a{0, 0}; - *a.rbegin() = value; - ASSERT_EQ(*a.rbegin(), value); -} - -TEST(fixed_vector, crbegin) { - int value = 33; - FixedVec a{0, value}; - ASSERT_EQ(*a.crbegin(), value); -} - -TEST(fixed_vector, const_rend) { - const FixedVec a{0, 1, 2}; - ASSERT_EQ(a.rbegin() + a.size(), a.rend()); -} - -TEST(fixed_vector, rend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.rbegin() + a.size(), a.rend()); -} - -TEST(fixed_vector, crend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.crbegin() + a.size(), a.crend()); -} - -TEST(fixed_vector, insert_0) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), 0); - a.insert(a.end(), 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_1) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - int zero = 0; - int three = 3; - a.insert(a.begin(), std::move(zero)); - a.insert(a.end(), std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_2) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), 2, 0); - a.insert(a.end(), 2, 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_3) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - int zero = 0; - int three = 3; - a.insert(a.begin(), 2, std::move(zero)); - a.insert(a.end(), 2, std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_4) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - std::vector zeros{0, 0}; - std::vector threes{3, 3}; - a.insert(a.begin(), zeros.begin(), zeros.end()); - a.insert(a.end(), threes.begin(), threes.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_5) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), {0, 0}); - a.insert(a.end(), {3, 3}); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, emplace) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - a.emplace(a.begin(), 0); - a.emplace(a.end(), 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - 
-TEST(fixed_vector, erase_0) { - std::vector vec{1, 2}; - FixedVec a{0, 1, 2, 3}; - a.erase(a.begin()); - a.erase(a.end() - 1); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, erase_1) { - std::vector vec{1, 2}; - FixedVec a{0, 0, 1, 2, 3, 3}; - a.erase(a.begin(), a.begin() + 2); - a.erase(a.end() - 2, a.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, push_back_0) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - a.emplace_back(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, push_back_1) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - int three = 3; - a.emplace_back(std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, emplace_back) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - a.emplace_back(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, pop_back) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2, 3}; - a.pop_back(); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_0) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2}; - a.resize(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_1) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2}; - a.resize(3, 9527); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_2) { - std::vector vec{0}; - FixedVec a{0, 1, 2}; - a.resize(1); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_3) { - std::vector vec{0}; - FixedVec a{0, 1, 2}; - a.resize(1, 9527); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_4) { - std::vector vec{0, 1, 2, 0, 0}; - FixedVec a{0, 1, 2}; - a.resize(5); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_5) { - std::vector vec{0, 1, 2, 3, 3}; - FixedVec a{0, 1, 2}; - a.resize(5, 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, swap) { - std::vector vec_0{0, 1, 2, 0, 0}; - std::vector vec_1{0, 1, 2, 3, 3}; - FixedVec a_0(vec_1.begin(), vec_1.end()); - FixedVec a_1(vec_0.begin(), vec_0.end()); - a_0.swap(a_1); - ASSERT_TRUE(std::equal(a_0.begin(), a_0.end(), vec_0.begin())); - ASSERT_TRUE(std::equal(a_1.begin(), a_1.end(), vec_1.begin())); -} - -void WithTwoVector(std::function&, const std::vector&)> Handler) { - std::vector a{0, 1, 2, 3, 4}; - std::vector b{0, 1, 2, 3}; - std::vector c{4, 3, 2}; - Handler(a, a); - Handler(a, b); - Handler(a, c); - Handler(b, a); - Handler(b, b); - Handler(b, c); - Handler(c, a); - Handler(c, b); - Handler(c, c); -} - -#define TEST_LOGICAL_OPERATOR(test_name, logical_op) \ - TEST(fixed_vector, test_name) { \ - WithTwoVector([](const std::vector& lhs, const std::vector& rhs) { \ - ASSERT_EQ((lhs logical_op rhs), \ - (FixedVec(lhs.begin(), lhs.end()) logical_op FixedVec(rhs.begin(), rhs.end()))); \ - }); \ - } - -TEST_LOGICAL_OPERATOR(eq, ==); -TEST_LOGICAL_OPERATOR(ne, !=); -TEST_LOGICAL_OPERATOR(gt, >); -TEST_LOGICAL_OPERATOR(ge, >=); -TEST_LOGICAL_OPERATOR(lt, <); -TEST_LOGICAL_OPERATOR(le, <=); - -} // namespace test - -} // namespace oneflow diff --git a/oneflow/core/common/shape.cpp b/oneflow/core/common/shape.cpp index 2286cac3290..5d9b5a96a35 100644 --- a/oneflow/core/common/shape.cpp +++ b/oneflow/core/common/shape.cpp @@ -59,50 +59,32 @@ int64_t 
ShiftNegativeAxis(int64_t axis, const int64_t num_axes) { return axis; } -Shape::Shape(const std::initializer_list& dim_vec) - : dim_vec_(dim_vec), is_initialized_(true) {} -Shape::Shape(const DimVector& dim_vec) : dim_vec_(dim_vec), is_initialized_(true) {} -Shape::Shape(DimVector&& dim_vec) : dim_vec_(std::move(dim_vec)), is_initialized_(true) {} -Shape::Shape(const ShapeProto& shape_proto) : is_initialized_(true) { - dim_vec_.assign(shape_proto.dim().begin(), shape_proto.dim().end()); -} - -Shape& Shape::operator=(const Shape& shape) { - dim_vec_ = shape.dim_vec_; - is_initialized_ = shape.is_initialized_; - return *this; -} - -Shape& Shape::assign(const DimVector& dim_vec) { - dim_vec_ = dim_vec; - is_initialized_ = true; - return *this; -} +Shape::Shape(const DimVector& dim_vec) : DimVector(dim_vec), is_initialized_(true) {} +Shape::Shape(DimVector&& dim_vec) : DimVector(std::move(dim_vec)), is_initialized_(true) {} +Shape::Shape(const ShapeProto& shape_proto) + : DimVector(shape_proto.dim().begin(), shape_proto.dim().end()), is_initialized_(true) {} Shape& Shape::CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view) { CHECK_EQ(NumAxes(), shape_view.NumAxes()); - std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), dim_vec_.data()); + std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data()); return *this; } Shape& Shape::LeftOnesExtendedAssign(const ShapeView& shape_view) { CHECK_GE(NumAxes(), shape_view.NumAxes()); size_t left_ones_size = NumAxes() - shape_view.NumAxes(); - FOR_RANGE(int, i, 0, left_ones_size) { dim_vec_.at(i) = 1LL; } - std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), - dim_vec_.data() + left_ones_size); + FOR_RANGE(int, i, 0, left_ones_size) { (*this)[i] = 1LL; } + std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data() + left_ones_size); return *this; } -bool Shape::operator==(const Shape& rhs) const { return dim_vec_ == rhs.dim_vec_; } - std::string Shape::ToString() const { std::stringstream ss; int32_t idx = 0; ss << "("; - for (int64_t dim : dim_vec_) { + for (int64_t dim : *this) { ss << dim; - if (++idx != dim_vec_.size() || dim_vec_.size() == 1) { ss << ","; } + if (++idx != size() || size() == 1) { ss << ","; } } ss << ")"; return ss.str(); @@ -111,21 +93,21 @@ std::string Shape::ToString() const { std::string Shape::DebugStr() const { return ToString(); } void Shape::ToProto(ShapeProto* ret) const { - *(ret->mutable_dim()) = PbRf(dim_vec_.begin(), dim_vec_.end()); + *(ret->mutable_dim()) = PbRf(begin(), end()); } int64_t Shape::At(int64_t index) const { CHECK_GE(index, 0); CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index << " > num_axes: " << this->NumAxes(); - return dim_vec_.at(index); + return (*this)[index]; } void Shape::Set(int64_t index, int64_t val) { CHECK_GE(index, 0); CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index << " > num_axes: " << this->NumAxes(); - dim_vec_.at(index) = val; + (*this)[index] = val; } int64_t Shape::Count(int64_t begin_axis, int64_t end_axis) const { @@ -206,9 +188,9 @@ Maybe Shape::Slice(int64_t start_dim, int64_t end_dim) const { int64_t ndims = this->NumAxes(); if (start_dim > ndims) { start_dim = ndims; } if (end_dim > ndims) { end_dim = ndims; } - DimVector dim_vec; - for (int64_t i = start_dim; i < end_dim && i < ndims; ++i) { dim_vec.emplace_back(this->At(i)); } - return std::make_shared(dim_vec); + std::shared_ptr shape = std::make_shared(); + for 
(int64_t i = start_dim; i < end_dim && i < ndims; ++i) { shape->emplace_back(this->At(i)); } + return shape; } } // namespace oneflow diff --git a/oneflow/core/common/shape.h b/oneflow/core/common/shape.h index 408a46c57ca..7a94ad85a6d 100644 --- a/oneflow/core/common/shape.h +++ b/oneflow/core/common/shape.h @@ -32,22 +32,37 @@ namespace cfg { class ShapeProto; } // namespace cfg -class Shape final { +class Shape final : public DimVector { public: // OF_DISALLOW_COPY_AND_MOVE(Shape); + using DimVector::DimVector; Shape() : is_initialized_(false) {} explicit Shape(const DimVector& dim_vec); explicit Shape(DimVector&& dim_vec); explicit Shape(const ShapeProto& shape_proto); - Shape(const std::initializer_list& dim_vec); + // explicit constructor from ShapeView + explicit Shape(ShapeView shape_view); ~Shape() = default; - Shape& operator=(const Shape& shape); - Shape& assign(const DimVector& dim_vec); + +#define OVERRIDE_ADD_DATA_FUNC(func) \ + template \ + void func(Args... args) { \ + DimVector::func(std::forward(args)...); \ + is_initialized_ = true; \ + } + + OVERRIDE_ADD_DATA_FUNC(assign) + OVERRIDE_ADD_DATA_FUNC(push_back) + OVERRIDE_ADD_DATA_FUNC(emplace_back) + OVERRIDE_ADD_DATA_FUNC(append) + OVERRIDE_ADD_DATA_FUNC(insert) + OVERRIDE_ADD_DATA_FUNC(resize) + +#undef OVERRIDE_ADD_DATA_FUNC + Shape& CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view); Shape& LeftOnesExtendedAssign(const ShapeView& shape_view); - bool operator==(const Shape& rhs) const; - bool operator!=(const Shape& rhs) const { return !(*this == rhs); } std::string DebugStr() const; std::string ToString() const; @@ -58,16 +73,16 @@ class Shape final { // Getters and Setters bool is_initialized() const { return is_initialized_; } - const DimVector& dim_vec() const { return dim_vec_; } - DimVector& dim_vec() { return dim_vec_; } + const DimVector& dim_vec() const { return *this; } + DimVector& dim_vec() { return *this; } int64_t elem_cnt() const { - return std::accumulate(dim_vec_.begin(), dim_vec_.end(), int64_t(1), std::multiplies<>()); + return std::accumulate(begin(), end(), int64_t(1), std::multiplies<>()); } int64_t At(int64_t index) const; void Set(int64_t index, int64_t val); int64_t NumAxes() const { CHECK(is_initialized()); - return dim_vec_.size(); + return size(); } int64_t Count(int64_t begin_axis, int64_t end_axis) const; int64_t Count(int64_t begin_axis) const; @@ -82,13 +97,14 @@ class Shape final { Maybe Slice(int64_t start_dim, int64_t end_dim) const; - ShapeView ToShapeView() const { return ShapeView(dim_vec_.data(), dim_vec_.size()); } + ShapeView ToShapeView() const { return ShapeView(data(), size()); } - MutShapeView ToMutShapeView() { return MutShapeView(dim_vec_.data(), dim_vec_.size()); } + MutShapeView ToMutShapeView() { return MutShapeView(data(), size()); } private: - DimVector dim_vec_; - bool is_initialized_; + // Set default value here because some constructors are inherited from DimVector + // TODO(daquexian): remove this field and make it initializied by construction + bool is_initialized_ = true; }; int64_t ShiftNegativeAxis(int64_t axis, const int64_t num_axes); @@ -99,7 +115,7 @@ Shape ZeroDimCompatiableShape(const Shape& shape); Shape CreateReducedShapeOrOnesShape(const ShapeView& shape, const AxisVector& axis_vec); template void Shape::SerializeWithTextFormat(StreamT& out_stream) const { - for (int64_t dim : dim_vec_) { out_stream << std::to_string(dim) << ' '; } + for (int64_t dim : *this) { out_stream << std::to_string(dim) << ' '; } } std::ostream& 
operator<<(std::ostream& out, const Shape& shape); diff --git a/oneflow/core/common/shape_vec.h b/oneflow/core/common/shape_vec.h index c97870a4832..18a34ed7741 100644 --- a/oneflow/core/common/shape_vec.h +++ b/oneflow/core/common/shape_vec.h @@ -16,24 +16,15 @@ limitations under the License. #ifndef ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ #define ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" namespace oneflow { -//#define DISABLE_FIXED_SHAPE_VEC #define SHAPE_MAX_AXIS_SIZE 20 -#if defined(DISABLE_FIXED_SHAPE_VEC) +typedef small_vector DimVector; +typedef small_vector AxisVector; -typedef std::vector DimVector; -typedef std::vector AxisVector; - -#else - -typedef fixed_vector DimVector; -typedef fixed_vector AxisVector; - -#endif } // namespace oneflow #endif // ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ diff --git a/oneflow/core/common/shape_view.cpp b/oneflow/core/common/shape_view.cpp index cd0bb3d1370..648034665fe 100644 --- a/oneflow/core/common/shape_view.cpp +++ b/oneflow/core/common/shape_view.cpp @@ -77,7 +77,7 @@ template void ShapeViewBase::ToShape(Shape* shape) const { DimVector dim_vec; this->ToDimVector(&dim_vec); - shape->assign(dim_vec); + *shape = Shape(dim_vec); } template class ShapeViewBase; diff --git a/oneflow/core/common/small_vector.h b/oneflow/core/common/small_vector.h new file mode 100644 index 00000000000..6aee5359f2b --- /dev/null +++ b/oneflow/core/common/small_vector.h @@ -0,0 +1,53 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ +#define ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ + +#include "llvm/ADT/SmallVector.h" + +namespace oneflow { + +template +class small_vector : public llvm::SmallVector { + using Base = llvm::SmallVector; + + public: + // https://stackoverflow.com/questions/27954940/a-using-statement-compiles-with-g-fails-compilation-with-clang + using Base::Base; + + typename Base::reference at(typename Base::size_type idx) { + CHECK_LT(idx, Base::size()); + return (*this)[idx]; + } + typename Base::const_reference at(typename Base::size_type idx) const { + CHECK_LT(idx, Base::size()); + return (*this)[idx]; + } + typename Base::const_iterator cbegin() const { + return (typename Base::const_iterator)this->BeginX; + } + typename Base::const_iterator cend() const { + return (typename Base::const_iterator)(this->BeginX) + Base::size(); + } + typename Base::const_iterator cbegin() { return (typename Base::const_iterator)this->BeginX; } + typename Base::const_iterator cend() { + return (typename Base::const_iterator)(this->BeginX) + Base::size(); + } +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ diff --git a/oneflow/core/common/wrap_dim_utils.h b/oneflow/core/common/wrap_dim_utils.h new file mode 100644 index 00000000000..929b203cf45 --- /dev/null +++ b/oneflow/core/common/wrap_dim_utils.h @@ -0,0 +1,40 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/maybe.h" + +namespace oneflow { + +// align with pytorch: `c10/core/WrapDimMinimal.h` +static inline Maybe maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, + bool wrap_scalar = true) { + if (dim_post_expr <= 0) { + if (!wrap_scalar) { + return Error::RuntimeError() + << "dimension specified as " << dim << " but tensor has no dimensions"; + } + dim_post_expr = 1; // this will make range [-1, 0] + } + + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + if (dim < min || dim > max) { + return Error::IndexError() << "Dimension out of range (expected to be in range of [" << min + << ", " << max << "], but got " << dim << ")"; + } + if (dim < 0) dim += dim_post_expr; + return dim; +} +} // namespace oneflow diff --git a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp index d119c3eeaf3..a8be6b054ed 100644 --- a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp +++ b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp @@ -72,9 +72,9 @@ class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, size_t num_src1_dims, const int64_t* src1_dims, const void* src1, void* dst) override { - DimVector src0_dim_vec; - DimVector src1_dim_vec; - DimVector dst_dim_vec; + Shape src0_shape; + Shape src1_shape; + Shape dst_shape; size_t num_dims = 0; int64_t simplified_src0_dims[kMaxNumDims]; int64_t simplified_src1_dims[kMaxNumDims]; @@ -85,15 +85,13 @@ class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { CheckInplace(num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, simplified_dst_dims, dst); for (int64_t i = 0; i < num_dims; ++i) { - src0_dim_vec.push_back(simplified_src0_dims[i]); - src1_dim_vec.push_back(simplified_src1_dims[i]); - dst_dim_vec.push_back(simplified_dst_dims[i]); + src0_shape.push_back(simplified_src0_dims[i]); + src1_shape.push_back(simplified_src1_dims[i]); + dst_shape.push_back(simplified_dst_dims[i]); } - binary_func( - stream, XpuVarNdarray(Shape(dst_dim_vec), reinterpret_cast(dst), num_dims), - XpuVarNdarray(Shape(src0_dim_vec), reinterpret_cast(src0), num_dims), - XpuVarNdarray(Shape(src1_dim_vec), reinterpret_cast(src1), - num_dims)); + binary_func(stream, XpuVarNdarray(dst_shape, reinterpret_cast(dst), num_dims), + XpuVarNdarray(src0_shape, reinterpret_cast(src0), num_dims), + XpuVarNdarray(src1_shape, reinterpret_cast(src1), num_dims)); } }; diff --git a/oneflow/core/framework/nn_graph.cpp b/oneflow/core/framework/nn_graph.cpp index 3d7481e231c..4e444fc0824 100644 --- a/oneflow/core/framework/nn_graph.cpp +++ b/oneflow/core/framework/nn_graph.cpp @@ -291,6 +291,9 @@ Maybe NNGraph::CompileAndInitRuntime() { // PlanUtil::SetForceInplaceMemBlock(&plan_); NOTE(chengcheng): only for ssp. 
PlanUtil::DumpCtrlRegstInfoToPlan(&plan_); PlanUtil::PlanMemoryLog(&plan_, name_); + if (Global::Get()->enable_debug_mode()) { + PlanUtil::GenLightPlan(&plan_, name_); + } } if (GlobalProcessCtx::WorldSize() > 1) { std::string plan_name = "plan:" + job_name(); diff --git a/oneflow/core/framework/tensor_methods.cpp b/oneflow/core/framework/tensor_methods.cpp index b4479136f93..6f6cf271660 100644 --- a/oneflow/core/framework/tensor_methods.cpp +++ b/oneflow/core/framework/tensor_methods.cpp @@ -24,6 +24,7 @@ limitations under the License. #include "oneflow/core/register/ofblob.h" #include "oneflow/core/framework/instructions_builder.h" #include "oneflow/core/ep/include/device_manager_registry.h" +#include "oneflow/core/common/wrap_dim_utils.h" namespace oneflow { namespace one { @@ -187,7 +188,7 @@ Maybe Unsqueeze(const std::shared_ptr& input, const int32_t& exp cnt++; } target_dim_vec[expand_dim] = 1; - target_stride_vec[expand_dim] = strides->At(expand_dim); + target_stride_vec[expand_dim] = expand_dim < ndim ? strides->At(expand_dim) : 1; } int64_t storage_offset = JUST(JUST(input->AsMirroredTensor())->storage_offset()); @@ -400,12 +401,7 @@ Maybe Transpose(const std::shared_ptr& input, const std::vector< CHECK_EQ_OR_RETURN(permute.size(), ndim) << "permute size should be equal to input tensor's ndim, but got " << permute.size(); auto positive_perm = permute; - for (auto i = 0; i < positive_perm.size(); i++) { - if (positive_perm[i] < 0) { positive_perm[i] += ndim; } - CHECK_OR_RETURN(positive_perm[i] >= 0 && positive_perm[i] < ndim) - << "IndexError: Dimension out of range (expected to be in range of [" << -ndim << "," - << ndim << " ) but got " << positive_perm[i]; - } + for (auto i = 0; i < positive_perm.size(); i++) { JUST(maybe_wrap_dim(positive_perm[i], ndim)); } DimVector target_dims(ndim); DimVector stride_vec(ndim); diff --git a/oneflow/core/framework/variable_tensor_mgr.cpp b/oneflow/core/framework/variable_tensor_mgr.cpp index 752c7e1103f..1b7e528c9c5 100644 --- a/oneflow/core/framework/variable_tensor_mgr.cpp +++ b/oneflow/core/framework/variable_tensor_mgr.cpp @@ -63,6 +63,10 @@ VariableTensorMgr::Dump() { return std::make_tuple(variable_op_names, variable_tensors); } +void VariableTensorMgr::Clear() { + std::map>().swap(variables_); +} + std::vector VariableTensorMgr::DumpNames() { std::vector variable_op_names; for (const auto& x : variables_) { variable_op_names.push_back(x.first); } diff --git a/oneflow/core/framework/variable_tensor_mgr.h b/oneflow/core/framework/variable_tensor_mgr.h index 43b0dc50459..35f117f5cfe 100644 --- a/oneflow/core/framework/variable_tensor_mgr.h +++ b/oneflow/core/framework/variable_tensor_mgr.h @@ -45,6 +45,7 @@ class VariableTensorMgr final { const std::vector>& variable_tensors); std::tuple, std::vector>> Dump(); std::vector DumpNames(); + void Clear(); private: friend class Global; diff --git a/oneflow/core/functional/function_library.h b/oneflow/core/functional/function_library.h index 13d59a0434b..570edffb3bc 100644 --- a/oneflow/core/functional/function_library.h +++ b/oneflow/core/functional/function_library.h @@ -17,6 +17,7 @@ limitations under the License. 
#define ONEFLOW_CORE_FUNCTIONAL_FUNCTION_LIBRARY_H_ #include "oneflow/core/common/util.h" +#include "oneflow/core/common/wrap_dim_utils.h" #include "oneflow/core/functional/packed_functor.h" #include "oneflow/core/common/stride.h" #include "oneflow/core/framework/tensor_methods.h" diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index cea1677d855..fe62eb5f858 100755 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -292,8 +292,10 @@ bind_python: True - name: "reduce_mean" - signature: ["Tensor (Tensor x, Int32List[1] dim, Bool keepdim=False) => ReduceMean", - "Tensor (Tensor x) => ReduceMeanWhole"] + signature: [ + "Tensor (Tensor x, Int32List[1] dim, Bool keepdim=False) => ReduceMean", + "Tensor (Tensor x) => ReduceMeanWhole" + ] bind_python: True - name: "reduce_all" @@ -973,7 +975,7 @@ - name: "cublas_bias_add_relu_matmul_grad" signature: - "TensorTuple (Tensor dy, Tensor weight, Tensor aux) => CublasBiasAddReluMatmulGrad" + "TensorTuple (Tensor dy, Tensor weight, Tensor aux, Double alpha=1.0) => CublasBiasAddReluMatmulGrad" bind_python: False - name: "cublas_matmul_bias_add_grad" @@ -981,6 +983,16 @@ "TensorTuple (Tensor dy, Tensor x) => CublasMatmulBiasAddGrad" bind_python: False +- name: "fused_matmul_bias_add_relu_dropout" + signature: + "Tensor (Tensor x, TensorTuple weights, TensorTuple biases, Bool skip_final_activation, FloatList dropout_rate_list, Generator generator=None) => FusedMatmulBiasAddReluDropout" + bind_python: True + +- name: "fused_relu_dropout_grad" + signature: + "Tensor (Tensor dy, Tensor mask, Float scale) => FusedReluDropoutGrad" + bind_python: False + - name: "broadcast_matmul_grad_b" signature: "Tensor (Tensor a, Tensor b, Double alpha=1.0) => BroadcastMatmulGradB" bind_python: False @@ -2126,6 +2138,18 @@ signature: "TensorTuple (Tensor dy, TensorTuple features, Bool has_output_concat_grad=False, Bool self_interaction=False, Int32 output_concat_grad_dim=0, String pooling=\"none\") => FusedDotFeatureInteractionGrad" bind_python: False +- name: "fused_cross_feature_interaction" + signature: "Tensor (Tensor x, Tensor weight, Tensor x_0, Tensor bias, String interaction_mode) => FusedCrossFeatureInteraction" + bind_python: True + +- name: "fused_cross_feature_interaction_v1_grad" + signature: "TensorTuple (Tensor dy, Tensor weight, Tensor x, Tensor x_0, Tensor matmul_result) => FusedCrossFeatureInteractionV1Grad" + bind_python: False + +- name: "fused_cross_feature_interaction_v2_grad" + signature: "TensorTuple (Tensor dy, Tensor weight, Tensor bias, Tensor x, Tensor x_0, Tensor matmul_result) => FusedCrossFeatureInteractionV2Grad" + bind_python: False + - name: "tensor_buffer_to_tensor" signature: "Tensor (Tensor input, Shape instance_shape, DataType dtype) => TensorBufferToTensor" bind_python: True diff --git a/oneflow/core/functional/impl/activation_functor.cpp b/oneflow/core/functional/impl/activation_functor.cpp index 5f000756846..3a604b3ebf1 100644 --- a/oneflow/core/functional/impl/activation_functor.cpp +++ b/oneflow/core/functional/impl/activation_functor.cpp @@ -226,9 +226,7 @@ class GluFunctor { const auto ndim = input->ndim(); CHECK_GT_OR_RETURN(ndim, 0) << Error::RuntimeError() << "glu does not support scalars because halving size must be even"; - CHECK_OR_RETURN(dim >= -ndim && dim < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dim << ")"; + dim = 
JUST(maybe_wrap_dim(dim, ndim)); if (dim < 0) { dim += ndim; } int64_t nc = input->dim(dim); CHECK_EQ_OR_RETURN(nc % 2, 0) << Error::RuntimeError() @@ -332,10 +330,7 @@ class SoftmaxFunctorBase { int64_t dim_ = dim ? JUST(dim) : get_dim(); if (dim_ < 0) { dim_ += num_axes; } - CHECK_OR_RETURN(dim_ >= -num_axes && dim_ < num_axes) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -num_axes << ", " << num_axes - 1 << "], but got " << dim_ << ")"; - + dim_ = JUST(maybe_wrap_dim(dim_, num_axes)); if (dim_ != num_axes - 1) { std::vector input_perm(input_shape->dim_vec().size(), 0); for (size_t i = 1; i < input_perm.size(); ++i) { input_perm[i] = i; } diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index 8c2f90ade5d..baf634f1679 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -68,9 +68,7 @@ class ArgMaxFunctor { int new_dim = JUST(dim); const int32_t ndims = input->shape()->NumAxes(); - CHECK_OR_RETURN(new_dim >= -ndims && new_dim < ndims) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndims - << "," << ndims << " ] but got " << new_dim << ")"; + new_dim = JUST(maybe_wrap_dim(new_dim, ndims)); if (new_dim < 0) { new_dim += ndims; } const auto do_cast = [&](const std::shared_ptr& x) -> Maybe { return Cast(x, JUST(dtype), /*pin_memory=*/false); @@ -429,25 +427,36 @@ class BroadcastLikeFunctor { Maybe operator()(const std::shared_ptr& x, const std::shared_ptr& like, const std::vector& broadcast_axes) const { + const Shape& x_shape = *x->shape(); + const Shape& like_shape = *like->shape(); + if (x_shape == like_shape) { return x; } MutableAttrMap attrs; if (broadcast_axes.empty()) { - int64_t like_ndim = like->shape()->NumAxes(); - int64_t x_ndim = x->shape()->NumAxes(); + int64_t like_ndim = like_shape.NumAxes(); + int64_t x_ndim = x_shape.NumAxes(); int64_t num_prepend = like_ndim - x_ndim; std::vector prepend_shape(num_prepend, 1); - std::vector broadcast_axes; - for (int i = 0; i < x_ndim; ++i) { prepend_shape.emplace_back(x->shape()->At(i)); } + std::vector broadcast_axes; + for (int i = 0; i < x_ndim; ++i) { prepend_shape.emplace_back(x_shape.At(i)); } for (int i = 0; i < num_prepend; ++i) { broadcast_axes.emplace_back(i); } for (int i = num_prepend; i < prepend_shape.size(); ++i) { - if (prepend_shape[i] != like->shape()->At(i)) { - if (prepend_shape[i] == 1) { broadcast_axes.emplace_back(i); } - CHECK_GE_OR_RETURN(prepend_shape[i], 1) - << Error::RuntimeError() << "output with shape " << x->shape()->ToString() - << " doesn't match the broadcast shape " << like->shape()->ToString(); + if (prepend_shape[i] != like_shape.At(i)) { + if (prepend_shape[i] == 1) { + broadcast_axes.emplace_back(i); + } else { + return Error::RuntimeError() << "The expanded size of the tensor " + << "(" << like_shape.At(i) << ")" + << " must match the existing size (" << prepend_shape[i] + << ") at non-singleton dimension " << i + << ". Target sizes: " << like_shape.ToString() + << ". 
Tensor sizes: " << x_shape.ToString(); + } } } + JUST(attrs.SetAttr>("broadcast_axes", broadcast_axes)); + } else { + JUST(attrs.SetAttr>("broadcast_axes", broadcast_axes)); } - JUST(attrs.SetAttr>("broadcast_axes", broadcast_axes)); return OpInterpUtil::Dispatch(*op_, {x, JUST(like->detach())}, attrs); } @@ -469,10 +478,7 @@ class ConcatFunctor { int64_t ndim = inputs[0]->ndim(); int64_t max_dim_size = 0; CHECK_GE_OR_RETURN(ninput, 1) << Error::RuntimeError() << "inputs size must greater than 0"; - CHECK_OR_RETURN((-(ndim) <= dim) && (dim <= (ndim - 1))) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dim << ")"; - if (dim < 0) { axis += ndim; } + axis = JUST(maybe_wrap_dim(axis, ndim)); const std::shared_ptr& shape = inputs[0]->shape(); for (const auto& input : inputs) { @@ -526,10 +532,7 @@ class StackFunctor { const int64_t ninput = inputs.size(); int64_t ndims = inputs[0]->ndim(); int64_t stack_dim = dim; - if (dim < 0) { stack_dim = stack_dim + ndims + 1; } - CHECK_OR_RETURN(stack_dim >= 0 && stack_dim <= ndims) - << Error::IndexError() << "Dimension out of range (expected in range of [" << -ndims - 1 - << ", " << ndims << "], but got " << stack_dim << ")"; + stack_dim = JUST(maybe_wrap_dim(stack_dim, ndims + 1)); if (ninput == 1) { return ExpandDims(inputs[0], dim); } const std::shared_ptr& first_in_shape = inputs[0]->shape(); for (const auto& input : inputs) { @@ -666,9 +669,7 @@ class ExpandDimsFunctor { Maybe operator()(const std::shared_ptr& input, const int32_t& dim) const { int32_t expand_dim = dim; const int32_t ndim = input->shape()->NumAxes(); - CHECK_OR_RETURN(-(ndim + 1) <= dim && dim <= ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -(ndim + 1) << ", " << ndim << "], but got " << dim << ")"; + JUST(maybe_wrap_dim(dim, ndim + 1)); if (dim < 0) { expand_dim = dim + ndim + 1; } MutableAttrMap attrs; JUST(attrs.SetAttr("axis", expand_dim)); @@ -695,10 +696,7 @@ class SqueezeFunctor { if (dim.has_value()) { std::vector dims = *JUST(dim); for (int32_t dim_i : dims) { - CHECK_OR_RETURN((dim_i >= -ndim) && (dim_i <= ndim - 1)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -ndim << "," << ndim - 1 << "], but got " << dim_i << ")"; - if (dim_i < 0) { dim_i += ndim; } + dim_i = JUST(maybe_wrap_dim(dim_i, ndim)); if (x->shape()->At(dim_i) == 1) { squeeze_dims.emplace_back(dim_i); } } } else { @@ -776,19 +774,29 @@ class DimGatherFunctor { << Error::RuntimeError() << "gather(): Expected dtype int32 or int64 for index"; CHECK_EQ_OR_RETURN(sparse_grad, false) << Error::RuntimeError() << "Only support bool = False for now!"; - CHECK_LT_OR_RETURN(dim, index->ndim()) - << Error::RuntimeError() << "Dimension out of range (expected to be in range of [" - << -index->ndim() << ", " << index->ndim() - 1 << "], but got " << dim << ")"; - CHECK_EQ_OR_RETURN(input->ndim(), index->ndim()) - << Error::RuntimeError() - << "Index tensor must have the same number of dimensions as input tensor"; - - FOR_RANGE(int32_t, i, 0, input->ndim()) { - if (i != dim) { - CHECK_LE_OR_RETURN(index->shape()->At(i), input->shape()->At(i)) - << Error::RuntimeError() << "Size does not match at dimension " << i - << " expected index " << *(index->shape()) << " to be smaller than self " - << *(input->shape()) << " apart from dimension " << dim; + + JUST(maybe_wrap_dim(dim, index->ndim())); + if (input->ndim() > 0 && index->ndim() > 0) { + 
CHECK_EQ_OR_RETURN(input->ndim(), index->ndim()) + << Error::RuntimeError() + << "Index tensor must have the same number of dimensions as input tensor"; + } else if (input->ndim() == 0) { + CHECK_LE_OR_RETURN(index->ndim(), 1) + << Error::RuntimeError() + << "Index tensor must have the same number of dimensions as input tensor"; + } else { + CHECK_LE_OR_RETURN(input->ndim(), 1) + << Error::RuntimeError() + << "Index tensor must have the same number of dimensions as input tensor"; + } + if (input->ndim() > 0 && index->ndim() > 0) { + FOR_RANGE(int32_t, i, 0, input->ndim()) { + if (i != dim) { + CHECK_LE_OR_RETURN(index->shape()->At(i), input->shape()->At(i)) + << Error::RuntimeError() << "Size does not match at dimension " << i + << " expected index " << *(index->shape()) << " to be smaller than self " + << *(input->shape()) << " apart from dimension " << dim; + } } } @@ -1282,11 +1290,8 @@ class NarrowFunctor { const int64_t ndim = input->shape()->NumAxes(); CHECK_GT_OR_RETURN(ndim, 0) << Error::RuntimeError() << "narrow() cannot be applied to a 0-dim tensor."; - CHECK_OR_RETURN((-ndim <= dim) && (dim <= ndim - 1)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dim << ")"; - if (narrow_dim < 0) { narrow_dim += ndim; } - const int64_t dim_length = input->shape()->At(narrow_dim); + narrow_dim = JUST(maybe_wrap_dim(narrow_dim, ndim)); + int64_t dim_length = input->shape()->At(narrow_dim); CHECK_OR_RETURN((-dim_length <= start) && (start <= dim_length)) << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim << ", " << ndim << "], but got " << start << ")"; @@ -1922,16 +1927,10 @@ class DiagonalFunctor { Maybe operator()(const std::shared_ptr& x, const int32_t& offset, const int32_t& dim1, const int32_t& dim2) const { int64_t ndims = x->shape()->NumAxes(); - - CHECK_OR_RETURN(dim1 >= -ndims && dim1 < ndims) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndims - << ", " << ndims - 1 << "], but got " << dim1 << ")"; - CHECK_OR_RETURN(dim2 >= -ndims && dim2 < ndims) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndims - << ", " << ndims - 1 << "], but got " << dim2 << ")"; - - const int32_t p_dim1 = dim1 >= 0 ? dim1 : dim1 + ndims; - const int32_t p_dim2 = dim2 >= 0 ? 
dim2 : dim2 + ndims; + int32_t p_dim1 = dim1; + int32_t p_dim2 = dim2; + p_dim1 = JUST(maybe_wrap_dim(p_dim1, ndims)); + p_dim2 = JUST(maybe_wrap_dim(p_dim2, ndims)); CHECK_NE_OR_RETURN(p_dim1, p_dim2) << Error::RuntimeError() << "diagonal dimensions cannot be identical " << dim1 << ", " << dim2; @@ -2344,10 +2343,7 @@ class SplitFunctor { Maybe operator()(const std::shared_ptr& x, const int64_t& split_size_or_sections, const int64_t& dim) const { int64_t axis = dim; - if (axis < 0) { axis += x->ndim(); } - CHECK_OR_RETURN(axis >= 0 && axis < x->ndim()) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -x->ndim() << ", " << x->ndim() - 1 << "], but got " << axis << ")"; + axis = JUST(maybe_wrap_dim(axis, x->ndim())); CHECK_GE_OR_RETURN(split_size_or_sections, 0) << Error::RuntimeError() << "split expects split_size be non-negative, but got split_size=" << split_size_or_sections; @@ -2371,10 +2367,7 @@ class UnbindFunctor { Maybe operator()(const std::shared_ptr& x, const int64_t& dim) const { int32_t axis = dim; const int32_t ndim = x->ndim(); - if (axis < 0) { axis += ndim; } - CHECK_OR_RETURN((dim >= -ndim) && (dim < ndim)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << "," << ndim - 1 << "], but got " << dim << ")"; + axis = JUST(maybe_wrap_dim(axis, ndim)); int32_t dim_size = x->shape()->At(axis); std::shared_ptr chunk_res = JUST(functional::Chunk(x, dim_size, axis)); TensorTuple unbinds(dim_size); @@ -2397,10 +2390,7 @@ class ChunkFunctor { << "chunk expects at least a 1-dimensional tensor."; CHECK_OR_RETURN(chunks > 0) << Error::RuntimeError() << "chunk expects `chunks` to be greater than 0, got: " << chunks; - CHECK_OR_RETURN(-ndim <= dim && dim <= (ndim - 1)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dim << ")"; - if (dim < 0) { infferd_dim += ndim; } + infferd_dim = JUST(maybe_wrap_dim(infferd_dim, ndim)); const auto dim_size = x->shape()->At(infferd_dim); int64_t split_size = (dim_size + chunks - 1) / chunks; @@ -2452,10 +2442,7 @@ class SplitWithSizeFunctor { const std::vector& split_size_or_sections, const int64_t& dim) const { int64_t axis = dim; - if (axis < 0) { axis += x->ndim(); } - CHECK_OR_RETURN(axis >= 0 && axis < x->ndim()) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -x->ndim() << ", " << x->ndim() - 1 << "], but got " << axis << ")"; + axis = JUST(maybe_wrap_dim(axis, x->ndim())); int64_t dim_size = x->shape()->At(axis); int64_t num_splits = split_size_or_sections.size(); TensorTuple splits(num_splits); @@ -2639,11 +2626,7 @@ class IndexSelectFunctor { CHECK_EQ_OR_RETURN(index_dtype_flag, true) << Error::RuntimeError() << "index_select(): Expected dtype int32 or int64 for index"; int64_t new_dim = dim; - if (dim < 0) { new_dim += input_num_axes; } - CHECK_LE_OR_RETURN(new_dim, input_num_axes) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -input_num_axes << ", " << input_num_axes - 1 << "], but got " << new_dim << ")"; - + new_dim = JUST(maybe_wrap_dim(new_dim, input_num_axes)); return JUST(functional::Gather(input, index, new_dim)); } }; @@ -2964,10 +2947,7 @@ class RepeatInterLeaveIntFunctor { int32_t dim_ = JUST(dim); const auto& input_shape = input->shape(); const int64_t& num_axes = input_shape->NumAxes(); - if (dim_ < 0) { dim_ += num_axes; } - CHECK_OR_RETURN(dim_ >= -num_axes && dim_ < num_axes) - 
<< Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -num_axes << ", " << num_axes - 1 << "], but got " << dim_ << ")"; + dim_ = JUST(maybe_wrap_dim(dim_, num_axes)); std::shared_ptr repeats_expand = JUST( Expand(JUST(Constant(Shape{1}, Scalar(repeats), DType::Int32(), JUST(input->device()))), Shape{input->shape()->At(dim_)})); @@ -3012,10 +2992,7 @@ class RepeatInterLeaveTensorFunctor { int32_t dim_ = dim; const auto& input_shape = input->shape(); const int64_t& num_axes = input_shape->NumAxes(); - if (dim_ < 0) { dim_ += num_axes; } - CHECK_OR_RETURN(dim_ >= -num_axes && dim_ < num_axes) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -num_axes << ", " << num_axes - 1 << "], but got " << dim_ << ")"; + dim_ = JUST(maybe_wrap_dim(dim_, num_axes)); CHECK_OR_RETURN(repeats_shape->At(0) == input->shape()->At(dim_)) << Error::RuntimeError() << "repeats must have the same size as input along dim"; std::shared_ptr cumsum = JUST(Cumsum(repeats, 0, DType::Int32())); diff --git a/oneflow/core/functional/impl/common.cpp b/oneflow/core/functional/impl/common.cpp index b24e18cd121..11cf67a2ab9 100644 --- a/oneflow/core/functional/impl/common.cpp +++ b/oneflow/core/functional/impl/common.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include "oneflow/core/functional/impl/common.h" #include "oneflow/core/autograd/autograd_mode.h" +#include "oneflow/core/common/wrap_dim_utils.h" namespace oneflow { namespace one { @@ -39,14 +40,7 @@ Maybe> CheckAxis(const std::vector& axis, const in std::vector reduce_axis(naxis); std::vector axis_num(ndim); for (int32_t i = 0; i < naxis; i++) { - CHECK_OR_RETURN(axis[i] >= -ndim && axis[i] < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << axis[i] << ")"; - if (axis[i] < 0) { - reduce_axis[i] = axis[i] + ndim; - } else { - reduce_axis[i] = axis[i]; - } + reduce_axis[i] = JUST(maybe_wrap_dim(axis[i], ndim)); axis_num[reduce_axis[i]]++; CHECK_OR_RETURN(axis_num[reduce_axis[i]] < 2) << Error::RuntimeError() << "dim " << reduce_axis[i] diff --git a/oneflow/core/functional/impl/math_functor.cpp b/oneflow/core/functional/impl/math_functor.cpp index 3d8b67b6db8..ab22624934a 100644 --- a/oneflow/core/functional/impl/math_functor.cpp +++ b/oneflow/core/functional/impl/math_functor.cpp @@ -372,12 +372,7 @@ class Max2Functor { const bool& keepdims) const { auto outputs = std::make_shared(2); int32_t axis = dim; - if (axis < -x->ndim() || axis >= x->ndim()) { - return Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -x->ndim() << ", " << x->ndim() - 1 << "], but got " << axis - << ")"; - } - if (axis < 0) { axis += x->ndim(); } + axis = JUST(maybe_wrap_dim(axis, x->ndim())); (*outputs)[0] = JUST(ReduceMax(x, {axis}, keepdims)); (*outputs)[1] = JUST(ArgMax(x, dim, keepdims, NullOpt)); return outputs; @@ -399,12 +394,7 @@ class Min2Functor { const bool& keepdims) const { auto outputs = std::make_shared(2); int32_t axis = dim; - if (axis < -x->ndim() || axis >= x->ndim()) { - return Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -x->ndim() << ", " << x->ndim() - 1 << "], but got " << axis - << ")"; - } - if (axis < 0) { axis += x->ndim(); } + axis = JUST(maybe_wrap_dim(axis, x->ndim())); (*outputs)[0] = JUST(ReduceMin(x, {axis}, keepdims)); (*outputs)[1] = JUST(ArgMin(x, dim, keepdims, NullOpt)); return outputs; @@ -419,13 +409,7 @@ class 
AminFunctor { const int32_t ndim = x->ndim(); std::vector& dims = *JUST(dim); - for (int i = 0; i < dims.size(); i++) { - if (dims[i] < -ndim || dims[i] >= ndim) { - return Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -ndim << ", " << ndim - 1 << "], but got " << dims[i] << ")"; - } - if (dims[i] < 0) { dims[i] += ndim; } - } + for (int i = 0; i < dims.size(); i++) { dims[i] = JUST(maybe_wrap_dim(dims[i], ndim)); } return ReduceMin(x, dims, keepdim); } }; @@ -438,13 +422,7 @@ class AmaxFunctor { const int32_t ndim = x->ndim(); std::vector& dims = *JUST(dim); - for (int i = 0; i < dims.size(); i++) { - if (dims[i] < -ndim || dims[i] >= ndim) { - return Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -ndim << ", " << ndim - 1 << "], but got " << dims[i] << ")"; - } - if (dims[i] < 0) { dims[i] += ndim; } - } + for (int i = 0; i < dims.size(); i++) { dims[i] = JUST(maybe_wrap_dim(dims[i], ndim)); } return ReduceMax(x, dims, keepdim); } }; @@ -458,6 +436,7 @@ class ReduceSumWholeFunctor { Maybe operator()(const std::shared_ptr& x) const { MutableAttrMap attrs; const int32_t naxis = x->ndim(); + if (naxis == 0) { return x; } // for 0-dim Tensor std::vector axis(naxis); std::iota(axis.begin(), axis.end(), 0); JUST(attrs.SetAttr>("axis", axis)); @@ -819,18 +798,14 @@ class MedianWithIndicesFunctor { const bool& keepdim) const { MutableAttrMap attrs; int32_t axis = dim; - if (axis < -x->ndim() || axis >= x->ndim()) { - return Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -x->ndim() << ", " << x->ndim() - 1 << "], but got " << axis - << ")"; - } - if (axis < 0) { axis += x->ndim(); } + const int64_t ndim = x->ndim(); + axis = JUST(maybe_wrap_dim(axis, ndim)); std::shared_ptr tensor = x; if (x->dim(axis) == 0) { return Error::IndexError() << "IndexError: Expected reduction dim " << axis << " to have non-zero size."; } - if (axis != x->ndim() - 1) { + if (axis != ndim - 1) { tensor = JUST(functional::Squeeze( JUST(functional::Transpose2dim(JUST(functional::Unsqueeze(x, -1)), axis, -1)), std::vector({axis}))); @@ -897,10 +872,7 @@ class TransposeFunctor { // so copy it to local var and do modification. 
auto positive_perm = permute; for (auto i = 0; i < positive_perm.size(); i++) { - if (positive_perm[i] < 0) { positive_perm[i] += ndim; } - CHECK_OR_RETURN(positive_perm[i] >= 0 && positive_perm[i] < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << "," << ndim << " ) but got " << positive_perm[i] << ")"; + positive_perm[i] = JUST(maybe_wrap_dim(positive_perm[i], ndim)); } // currently, view only support eager and local mode if (view::IsViewApplicable(input)) { return JUST(view::Transpose(input, positive_perm)); } @@ -926,15 +898,8 @@ class Transpose2dimFunctor { int32_t dim_0 = dim0; int32_t dim_1 = dim1; - if (dim0 < 0) { dim_0 += ndim; } - if (dim1 < 0) { dim_1 += ndim; } - - CHECK_OR_RETURN(dim_0 >= 0 && dim0 < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dim_0 << ")"; - CHECK_OR_RETURN(dim_1 >= 0 && dim1 < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dim_1 << ")"; + dim_0 = JUST(maybe_wrap_dim(dim_0, ndim)); + dim_1 = JUST(maybe_wrap_dim(dim_1, ndim)); for (int32_t i = 0; i < ndim; ++i) { permute.emplace_back(i); } std::swap(permute[dim_0], permute[dim_1]); Shape shape(DimVector(permute.begin(), permute.end())); @@ -1681,10 +1646,7 @@ class SelectFunctor { const int32_t& index) const { int32_t ndim = input->ndim(); CHECK_OR_RETURN(ndim > 0) << "select() cannot be applied to a 0-dim tensor."; - CHECK_OR_RETURN((dim >= -ndim) && (dim < ndim)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << "," << ndim - 1 << "], but got " << dim << ")"; - int32_t pos_dim = dim >= 0 ? 
dim : dim + ndim; + int32_t pos_dim = JUST(maybe_wrap_dim(dim, ndim)); auto size = input->dim(pos_dim); CHECK_OR_RETURN((index >= -size) && (index < size)) << "Index out of range (expected to be in range of [" << -size << "," << size - 1 @@ -1952,7 +1914,12 @@ class StandardDeviationFunctor { Maybe operator()(const std::shared_ptr& input, const Optional>& dim, const Optional& unbiased, const Optional& keepdim) const { - std::vector axis = *JUST(CheckAxis(*JUST(dim), input->ndim())); + std::vector axis; + if (!dim) { + for (int i = 0; i < input->ndim(); i++) { axis.emplace_back(i); } + } else { + axis = *JUST(CheckAxis(*JUST(dim), input->ndim())); + } bool unbias = true; bool keepdims = false; if (unbiased.has_value()) { unbias = JUST(unbiased); } @@ -2043,9 +2010,7 @@ class VarianceFunctor { for (int i = 0; i < ndim; i++) { axis.emplace_back(i); } } else { std::vector& dims = *JUST(dim); - CHECK_GE_OR_RETURN(ndim, dims.size()) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << dims.size() << ")"; + JUST(maybe_wrap_dim(dims.size(), ndim)); std::sort(dims.begin(), dims.end()); axis.assign(dims.begin(), dims.end()); } @@ -2083,10 +2048,7 @@ class MovedimVecFunctor { std::vector is_used(ndim, false); FOR_RANGE(size_t, i, 0, perm.size()) { int32_t item = perm[i]; - if (item < 0) { item += ndim; } - CHECK_OR_RETURN(item >= -ndim && item < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << ", " << ndim - 1 << "], but got " << item << ")"; + item = JUST(maybe_wrap_dim(item, ndim)); CHECK_EQ_OR_RETURN(is_used[item], false) << "repeated dim in " << desc; is_used[item] = true; @@ -2153,10 +2115,7 @@ class TensorSplitVecFunctor { const std::vector& indices_or_sections, const int32_t& dim) const { int32_t ndim = input->ndim(); - CHECK_OR_RETURN((dim >= -ndim) && (dim < ndim)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << "," << ndim - 1 << "], but got " << dim << ")"; - int32_t pos_dim = dim >= 0 ? dim : dim + ndim; + int32_t pos_dim = JUST(maybe_wrap_dim(dim, ndim)); std::vector start(ndim, 0); std::vector stop(ndim); @@ -2184,12 +2143,9 @@ class TensorSplitIntFunctor { Maybe operator()(const std::shared_ptr& input, const int32_t& indices_or_sections, const int32_t& dim) const { int32_t ndim = input->ndim(); - CHECK_OR_RETURN((dim >= -ndim) && (dim < ndim)) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << "," << ndim - 1 << "], but got " << dim << ")"; + int32_t pos_dim = JUST(maybe_wrap_dim(dim, ndim)); CHECK_OR_RETURN(indices_or_sections > 0) << "number of sections must be larger than 0, got ," << indices_or_sections << ");"; - int32_t pos_dim = dim >= 0 ? 
dim : dim + ndim; const auto dim_size = input->dim(pos_dim); int64_t min_split_size = dim_size / indices_or_sections; @@ -2313,10 +2269,7 @@ class CumBaseFunctor { Maybe operator()(const std::shared_ptr& input, int64_t dim, const Optional>& dtype) const { auto ndim = input->ndim(); - if (dim < 0) { dim += ndim; } - CHECK_OR_RETURN(dim >= 0 && dim < ndim) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" << -ndim - << "," << ndim << " ) but got " << dim << ")"; + dim = JUST(maybe_wrap_dim(dim, ndim)); MutableAttrMap attrs; JUST(attrs.SetAttr("dim", dim)); diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 204b6b266db..a453cdb4dfe 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -61,6 +61,14 @@ class BiasAddFunctor { const int64_t num_axes = x->shape()->NumAxes(); axis_val += num_axes; } + CHECK_LT_OR_RETURN(axis_val, x->shape()->NumAxes()) + << Error::IndexError() << "Dimension out of range (expected to be in range of [-" + << x->shape()->NumAxes() << "," << x->shape()->NumAxes() - 1 << "], but got " << axis_val + << ")"; + CHECK_EQ_OR_RETURN(x->shape()->At(axis_val), bias->shape()->At(0)) + << Error::RuntimeError() << "The size of tensor x " << x->shape()->ToString() + << " must match the size of tensor b " << bias->shape()->ToString() << " at dimension " + << axis_val; JUST(attrs.SetAttr("axis", axis_val)); return OpInterpUtil::Dispatch(*op_, {x, bias}, attrs); } @@ -83,8 +91,12 @@ class ConvBaseFunctor { const std::string& channel_pos) const { MutableAttrMap conv_attrs; std::vector kernel_size_vec(num_spatial_dims_); + int32_t channel_idx = 1; int32_t kernel_idx_offset = 2; - if (channel_pos == "channels_last") { kernel_idx_offset = 1; } + if (channel_pos == "channels_last") { + kernel_idx_offset = 1; + channel_idx = kernel_idx_offset + num_spatial_dims_; + } for (int i = 0; i < num_spatial_dims_; i++) { kernel_size_vec.at(i) = ((weight->shape())->At(i + kernel_idx_offset)); @@ -99,9 +111,7 @@ class ConvBaseFunctor { const std::shared_ptr& conv_out = JUST(OpInterpUtil::Dispatch(*conv_op_, {x, weight}, conv_attrs)); if (bias) { - MutableAttrMap bias_attrs; - JUST(bias_attrs.SetAttr("axis", 1)); - return OpInterpUtil::Dispatch(*bias_op_, {conv_out, JUST(bias)}, bias_attrs); + return functional::BiasAdd(conv_out, JUST(bias), channel_idx); } else { return conv_out; } @@ -273,8 +283,10 @@ class MatMulFunctor { const auto& b_shape = b->shape(); // TODO(): Support 1-d tensor by dot. 
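// [Editor's note] In the ConvBaseFunctor hunk above, the bias axis is no longer hard-wired to 1:
// it now follows the data format and is forwarded to functional::BiasAdd. The helper below is a
// hypothetical stand-in that only illustrates how that index is derived; the name and signature
// are not part of the patch.
#include <cassert>
#include <string>

int32_t ChannelAxisSketch(const std::string& channel_pos, int32_t num_spatial_dims) {
  assert(channel_pos == "channels_first" || channel_pos == "channels_last");
  // channels_first (e.g. NCHW): the channel sits right after the batch axis, so bias adds on axis 1.
  // channels_last  (e.g. NHWC): the channel is the trailing axis, after the spatial dims.
  return channel_pos == "channels_last" ? 1 + num_spatial_dims : 1;
}
// e.g. for a 2-d conv: channels_first -> 1, channels_last -> 3.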
- CHECK_GE_OR_RETURN(a_shape->NumAxes(), 2) << "Tensor a's dim should >= 2"; - CHECK_GE_OR_RETURN(b_shape->NumAxes(), 2) << "Tensor b's dim should >= 2"; + CHECK_GE_OR_RETURN(a_shape->NumAxes(), 2) + << Error::RuntimeError() << "Tensor a's dim should >= 2"; + CHECK_GE_OR_RETURN(b_shape->NumAxes(), 2) + << Error::RuntimeError() << "Tensor b's dim should >= 2"; MutableAttrMap attrs; JUST(attrs.SetAttr("transpose_a", transpose_a)); @@ -282,6 +294,7 @@ class MatMulFunctor { JUST(attrs.SetAttr("alpha", alpha)); if (a_shape->NumAxes() != b_shape->NumAxes()) { CHECK_EQ_OR_RETURN(b_shape->NumAxes(), 2) + << Error::RuntimeError() << "Not support number of dimensions of a being less than number of dimensions of b!"; return OpInterpUtil::Dispatch(*bcast_matmul_op_, {a, b}, attrs); } @@ -374,14 +387,8 @@ class TensorDotFunctor { std::vector dot_dims_a(dims_a.begin(), dims_a.end()); std::vector dot_dims_b(dims_b.begin(), dims_b.end()); for (int64_t i = 0; i < dot_dims_a.size(); i++) { - CHECK_OR_RETURN(dot_dims_a[i] >= -a->ndim() && dot_dims_a[i] < a->ndim()) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -a->ndim() << ", " << a->ndim() - 1 << "], but got " << dot_dims_a[i] << ")"; - CHECK_OR_RETURN(dot_dims_b[i] >= -b->ndim() && dot_dims_b[i] < b->ndim()) - << Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -b->ndim() << ", " << b->ndim() - 1 << "], but got " << dot_dims_b[i] << ")"; - dot_dims_a[i] = dot_dims_a[i] < 0 ? dot_dims_a[i] + a->ndim() : dot_dims_a[i]; - dot_dims_b[i] = dot_dims_b[i] < 0 ? dot_dims_b[i] + b->ndim() : dot_dims_b[i]; + dot_dims_a[i] = JUST(maybe_wrap_dim(dot_dims_a[i], a->ndim())); + dot_dims_b[i] = JUST(maybe_wrap_dim(dot_dims_b[i], b->ndim())); } std::vector if_dot_dims_a(a->ndim(), false); std::vector if_dot_dims_b(b->ndim(), false); @@ -466,7 +473,7 @@ class TensorDotFunctor { class FusedMLPFunctor { public: FusedMLPFunctor() { -#if CUDA_VERSION >= 11040 +#if CUDA_VERSION >= 11060 fused_op_.resize(kMaxInputCount /*the maximum number of inputs*/); for (int n = 1; n < fused_op_.size(); ++n) { fused_op_[n] = CHECK_JUST(one::OpBuilder("cublas_fused_mlp") @@ -484,9 +491,10 @@ class FusedMLPFunctor { const TensorTuple& biases, bool skip_final_activation) const { const int64_t weight_size = weights.size(); const int64_t bias_size = biases.size(); - CHECK_GE_OR_RETURN(weight_size, 1) << "The number of weights should be greater equal than 1. "; + CHECK_GE_OR_RETURN(weight_size, 1) + << Error::RuntimeError() << "The number of weights should be greater equal than 1. "; CHECK_EQ_OR_RETURN(weight_size, bias_size) - << "The number of weights should be equal to biases. "; + << Error::RuntimeError() << "The number of weights should be equal to biases. "; int64_t n = 0, k = 0; /* x: (m, k) @@ -500,13 +508,16 @@ class FusedMLPFunctor { const auto& bias_shape = biases[i]->shape(); // TODO(): Support Fused batch/broadcast matmul. - CHECK_EQ_OR_RETURN(weight_shape->NumAxes(), 2) << "Weight's dim should == 2"; - CHECK_EQ_OR_RETURN(bias_shape->NumAxes(), 1) << "Bias's dim should == 1"; + CHECK_EQ_OR_RETURN(weight_shape->NumAxes(), 2) + << Error::RuntimeError() << "Weight's dim size should == 2"; + CHECK_EQ_OR_RETURN(bias_shape->NumAxes(), 1) + << Error::RuntimeError() << "Bias's dim size should == 1"; n = weight_shape->At(0); - CHECK_EQ_OR_RETURN(bias_shape->At(0), n) << "Bias's dim is not equal to weight's last dim. 
"; + CHECK_EQ_OR_RETURN(bias_shape->At(0), n) + << Error::RuntimeError() << "Bias's dim is not equal to weight's first dim. "; CHECK_EQ_OR_RETURN(weight_shape->At(1), k) - << "weight's first dim should be equal to input's last dim. "; + << Error::RuntimeError() << "weight's second dim should be equal to input's second dim. "; // Set for next layer. k = n; @@ -541,7 +552,7 @@ class FusedMLPFunctor { biases[layer_idx], 1)); if ((layer_idx != weight_size - 1) || (!skip_final_activation)) { /* - When it is not finaly dense layer, or it is final dense layer and skip_final_activate=False, + When it is not last dense layer, or it is last dense layer and skip_final_activate=False, we add relu Layer. */ out = JUST(functional::Relu(out, false)); @@ -551,7 +562,115 @@ class FusedMLPFunctor { } private: -#if CUDA_VERSION >= 11040 +#if CUDA_VERSION >= 11060 + std::vector> fused_op_; +#endif +}; + +class FusedMatmulBiasAddReluDropoutFunctor { + public: + FusedMatmulBiasAddReluDropoutFunctor() { +#if CUDA_VERSION >= 11060 + fused_op_.resize(kMaxInputCount /*the maximum number of inputs*/); + for (int n = 1; n < fused_op_.size(); ++n) { + fused_op_[n] = CHECK_JUST(one::OpBuilder("fused_matmul_bias_add_relu_dropout") + .Input("x") + .Input("weights", n) + .Input("biases", n) + .Output("out") + .Output("cublas_aux", n) + .Output("hidden", n) + .Build()); + } +#endif + } + Maybe operator()(const std::shared_ptr& x, const TensorTuple& weights, + const TensorTuple& biases, bool skip_final_activation, + const std::vector& dropout_rate_list, + const Optional& generator) const { + const int64_t weight_size = weights.size(); + const int64_t bias_size = biases.size(); + CHECK_GE_OR_RETURN(weight_size, 1) + << Error::RuntimeError() << "The number of weights should be greater equal than 1. "; + CHECK_EQ_OR_RETURN(weight_size, bias_size) + << Error::RuntimeError() << "The number of weights should be equal to biases. "; + CHECK_EQ_OR_RETURN(weight_size, dropout_rate_list.size()) + << Error::RuntimeError() + << "The dropout rate list length should be equal to the number of weights. "; + int64_t n = 0, k = 0; + /* + x: (m, k) + weight: (n, k) need transpose + bias: (n) + */ + const auto& x_shape = x->shape(); + k = x_shape->At(1); + const auto gen = generator.value_or(JUST(one::DefaultAutoGenerator())); + const auto& dropout_state = std::make_shared(gen); + for (int64_t i = 0; i < weight_size; i++) { + CHECK_GE_OR_RETURN(dropout_rate_list[i], 0.0f) + << Error::RuntimeError() << "Dropout rate should be >= 0.0"; + + const auto& weight_shape = weights[i]->shape(); + const auto& bias_shape = biases[i]->shape(); + // TODO(): Support Fused batch/broadcast matmul. + CHECK_EQ_OR_RETURN(weight_shape->NumAxes(), 2) << "Weight's dim should == 2"; + CHECK_EQ_OR_RETURN(bias_shape->NumAxes(), 1) << "Bias's dim should == 1"; + + n = weight_shape->At(0); + CHECK_EQ_OR_RETURN(bias_shape->At(0), n) << "Bias's dim is not equal to weight's last dim. "; + CHECK_EQ_OR_RETURN(weight_shape->At(1), k) + << "weight's first dim should be equal to input's last dim. "; + // Set for next layer. 
+ k = n; + } + +#if CUDA_VERSION >= 11060 + DeviceType device_type{}; + if (x->is_consistent()) { + device_type = JUST(x->parallel_desc())->device_type(); + } else { + device_type = JUST(x->device())->enum_type(); + } + + if ((device_type == DeviceType::kCUDA) && (weight_size <= kMaxInputCount) + && (!ParseBooleanFromEnv("ONEFLOW_FUNCTOR_DISABLE_FUSED_MLP", false))) { + TensorTuple input(2 * weight_size + 1); + input[0] = x; + std::copy(weights.begin(), weights.end(), input.begin() + 1); + std::copy(biases.begin(), biases.end(), input.begin() + 1 + weight_size); + MutableAttrMap attrs; + JUST(attrs.SetAttr("skip_final_activation", skip_final_activation)); + JUST(attrs.SetAttr>("dropout_rate_list", dropout_rate_list)); + return OpInterpUtil::Dispatch(*fused_op_[weight_size], input, + OpExprInterpContext(attrs, dropout_state)); + } +#endif // CUDA_VERSION >= 11060 + + // Fall back to Naive matmul + bias_add + relu + dropout + std::shared_ptr out = x; + for (int32_t layer_idx = 0; layer_idx < weight_size; layer_idx++) { + out = JUST( + functional::BiasAdd(JUST(functional::MatMul(out, weights[layer_idx], false, true, 1.0)), + biases[layer_idx], 1)); + if ((layer_idx != weight_size - 1) || !skip_final_activation) { + out = JUST(functional::Relu(out, false)); + out = JUST(functional::Dropout(out, JUST(VectorAt(dropout_rate_list, layer_idx)), + /*training=*/true, + /*inplace=*/false, + /*generator=*/gen, /*addend=*/NullOpt)); + } else { + out = JUST(functional::Dropout(out, JUST(VectorAt(dropout_rate_list, layer_idx)), + /*training=*/true, + /*inplace=*/false, + /*generator=*/gen, /*addend=*/NullOpt)); + } + } + return out; + } + + private: +#if CUDA_VERSION >= 11060 std::vector> fused_op_; #endif }; @@ -615,13 +734,14 @@ class PixelShuffleFunctor { PixelShuffleFunctor() {} Maybe operator()(const std::shared_ptr& x, const int64_t& h_upscale_factor, const int64_t& w_upscale_factor) const { - CHECK_OR_RETURN(x->ndim() == 4) << "Only Accept 4D Tensor"; + CHECK_OR_RETURN(x->ndim() == 4) << Error::RuntimeError() << "Only Accept 4D Tensor"; const int64_t batch = x->shape()->At(0); const int64_t channel = x->shape()->At(1); const int64_t height = x->shape()->At(2); const int64_t width = x->shape()->At(3); std::shared_ptr out; CHECK_OR_RETURN(channel % (h_upscale_factor * w_upscale_factor) == 0) + << Error::RuntimeError() << "The channels of input tensor must be divisible by (upscale_factor * upscale_factor) or " "(h_upscale_factor * w_upscale_factor)"; const int64_t new_c = static_cast(channel / (h_upscale_factor * w_upscale_factor)); @@ -793,7 +913,7 @@ class LossFunctorBase { public: Maybe apply_reduction(const Maybe& x, const std::string& reduction) const { CHECK_OR_RETURN(reduction == "none" || reduction == "sum" || reduction == "mean") - << "Reduction should be none, sum or mean."; + << Error::RuntimeError() << "Reduction should be none, sum or mean."; if (reduction == "sum") { return functional::ReduceSum(JUST(x), {}, false); } if (reduction == "mean") { return functional::ReduceMean(JUST(x), {}, false); } return x; @@ -1001,12 +1121,15 @@ class NllLossFunctor { const Optional& weight, const int64_t& ignore_index, const std::string& reduction) const { CHECK_OR_RETURN(reduction == "none" || reduction == "sum" || reduction == "mean") - << "Reduction should be none, sum or mean."; + << Error::RuntimeError() << "Reduction should be none, sum or mean."; const auto& input_shape = input->shape(); const auto& target_shape = target->shape(); - CHECK_LE_OR_RETURN(input_shape->NumAxes(), 5); - 
CHECK_EQ_OR_RETURN(input_shape->NumAxes() - 1, target_shape->NumAxes()); + CHECK_LE_OR_RETURN(input_shape->NumAxes(), 5) + << Error::RuntimeError() << "The number of input's axis should be less equal to 5. "; + CHECK_EQ_OR_RETURN(input_shape->NumAxes() - 1, target_shape->NumAxes()) + << Error::RuntimeError() + << "The number of input's axis should be equal to the number of target's axis - 1. "; MutableAttrMap attrs; JUST(attrs.SetAttr("ignore_index", ignore_index)); @@ -1067,7 +1190,7 @@ class CrossEntropyFunctor { const Optional& weight, const int64_t& ignore_index, const std::string& reduction) const { CHECK_OR_RETURN(reduction == "none" || reduction == "sum" || reduction == "mean") - << "Reduction should be none, sum or mean."; + << Error::RuntimeError() << "Reduction should be none, sum or mean."; const auto& input_shape = input->shape(); const auto& target_shape = target->shape(); MutableAttrMap attrs; @@ -1278,7 +1401,8 @@ class SparseSoftmaxCrossEntropyFunctor { sbp.mutable_broadcast_parallel(); new_sbp_parallels.emplace_back(sbp); } else { - CHECK_EQ_OR_RETURN(split_axis, 0); + CHECK_EQ_OR_RETURN(split_axis, 0) + << Error::RuntimeError() << "Split axis must equal to 0. "; new_sbp_parallels.emplace_back(sbp_parallel); } } else { @@ -1457,7 +1581,8 @@ class CtcLossFunctor { CHECK_OR_RETURN([&]() -> bool { if ((reduction != "none") && (reduction != "sum") && (reduction != "mean")) return false; return true; - }()); + }()) << Error::RuntimeError() + << "Reduction should be none, sum or mean."; if (reduction == "sum") { return functional::ReduceSum(out, {}, false); } if (reduction == "mean") { return sequence_function(functional::Clamp) @@ -1492,7 +1617,8 @@ class TripletMarginLossFunctor { CHECK_OR_RETURN([&]() -> bool { if ((reduction != "none") && (reduction != "sum") && (reduction != "mean")) return false; return true; - }()); + }()) << Error::RuntimeError() + << "Reduction should be none, sum or mean."; auto da_p = JUST(VectorNorm( JUST(ScalarAdd(eps, JUST(Sub(anchor, positive, /*alpha=*/1.0, /*inplace=*/false)), /*alpha=*/1)), @@ -1590,14 +1716,14 @@ class NormalFunctor { if (optional_dtype.has_value()) { CHECK_OR_RETURN(output_tensor_dtype == dtype) << Error::RuntimeError() << "data type " << dtype->name() - << " does not match data type of out parameter (" << output_tensor_dtype->name(); + << " does not match data type of out parameter " << output_tensor_dtype->name(); } dtype = output_tensor_dtype; Symbol out_tensor_device = JUST(out_tensor->device()); if (optional_device.has_value()) { CHECK_OR_RETURN(out_tensor_device == JUST(optional_device)) << Error::RuntimeError() << "device type " << device->ToString() - << " does not match device type of out parameter (" << out_tensor_device->ToString(); + << " does not match device type of out parameter " << out_tensor_device->ToString(); } device = out_tensor_device; } @@ -1738,13 +1864,14 @@ class NormalizationFunctor { JUST(attrs.SetAttr("momentum", 1.0 - momentum)); CHECK_OR_RETURN((moving_mean && moving_variance) || (!moving_mean && !moving_variance)) + << Error::RuntimeError() << "Both moving_mean and moving_variance should be None or Tensor."; std::shared_ptr gamma_val; std::shared_ptr beta_val; CHECK_GE_OR_RETURN(x->shape()->NumAxes(), 2) - << "NumAxes of x should be greater or equal than 2. "; + << Error::RuntimeError() << "NumAxes of x should be greater or equal than 2. 
"; if (gamma.has_value() && beta.has_value()) { gamma_val = JUST(gamma); beta_val = JUST(beta); @@ -1756,7 +1883,7 @@ class NormalizationFunctor { if (!training) { CHECK_OR_RETURN(moving_mean && moving_variance) - << "Must have moving_mean and moving_variance in eval mode."; + << Error::RuntimeError() << "Must have moving_mean and moving_variance in eval mode."; return OpInterpUtil::Dispatch( *norm_eval_op_, {x, JUST(moving_mean), JUST(moving_variance), gamma_val, beta_val}, attrs); @@ -1852,10 +1979,11 @@ class NormalizationAddReluFunctor { JUST(attrs.SetAttr("momentum", 1.0f - momentum)); CHECK_OR_RETURN((moving_mean && moving_variance) || (!moving_mean && !moving_variance)) + << Error::RuntimeError() << "Both moving_mean and moving_variance should be None or Tensor."; if (!is_training) { CHECK_OR_RETURN(moving_mean && moving_variance) - << "Must have moving_mean and moving_variance in eval mode."; + << Error::RuntimeError() << "Must have moving_mean and moving_variance in eval mode."; const auto& normalize_result = JUST(OpInterpUtil::Dispatch( *norm_eval_op_, {x, JUST(moving_mean), JUST(moving_variance), gamma, beta}, attrs)); if (addend) { @@ -1907,12 +2035,13 @@ class PadFunctor { const std::string& mode, const Scalar& value) const { const int64_t ndim = x->shape()->NumAxes(); CHECK_LE_OR_RETURN(pad.size(), 2 * ndim) - << "Pad size should less than or equal to input axes * 2."; + << Error::RuntimeError() << "Pad size should less than or equal to input axes * 2."; MutableAttrMap attrs; JUST(attrs.SetAttr>("padding", pad)); if (mode == "constant") { CHECK_EQ_OR_RETURN(pad.size() % 2, 0) - << "Length of pad must be even but instead it equals " << pad.size(); + << Error::RuntimeError() << "Length of pad must be even but instead it equals " + << pad.size(); if (IsFloatingDataType(x->dtype()->data_type()) || x->dtype()->data_type() == DataType::kFloat16) { JUST(attrs.SetAttr("floating_constant_value", value.As())); @@ -1939,6 +2068,7 @@ class PadFunctor { const int64_t pad_h = x->shape()->dim_vec().at(2); const int64_t pad_w = x->shape()->dim_vec().at(3); CHECK_OR_RETURN(pad[2] < pad_h && pad[3] < pad_h && pad[0] < pad_w && pad[1] < pad_w) + << Error::RuntimeError() << "padding size should be less than the corresponding input dimension!"; return OpInterpUtil::Dispatch(*reflect_pad_, {x}, attrs); } else if (mode == "replicate") { @@ -2087,7 +2217,8 @@ class UnfoldFunctor { const std::vector& strides) const { const auto& x_shape = x->shape(); // Only Support 4d tensor now. - CHECK_EQ_OR_RETURN(x_shape->NumAxes(), 4) << "Input Tensor dim should == 4"; + CHECK_EQ_OR_RETURN(x_shape->NumAxes(), 4) + << Error::RuntimeError() << "Input Tensor dim should == 4"; MutableAttrMap attrs; JUST(attrs.SetAttr("data_format", data_format)); JUST(attrs.SetAttr>("kernel_size", kernel_size)); @@ -2113,7 +2244,8 @@ class FoldFunctor { const std::vector& strides) const { const auto& x_shape = x->shape(); // Only Support 3d tensor fold now. 
format is (N, C*K*K, L) - CHECK_EQ_OR_RETURN(x_shape->NumAxes(), 3) << "Input Tensor dim should == 3"; + CHECK_EQ_OR_RETURN(x_shape->NumAxes(), 3) + << Error::RuntimeError() << "Input Tensor dim should == 3"; MutableAttrMap attrs; JUST(attrs.SetAttr("data_format", data_format)); JUST(attrs.SetAttr>("output_size", output_size)); @@ -2136,9 +2268,8 @@ class OneHotFunctor { } Maybe operator()(const std::shared_ptr& input, const int64_t& num_classes, const Scalar& on_value, const Scalar& off_value) const { - if (IsFloatingDataType(input->dtype()->data_type())) { - OF_RUNTIME_ERROR() << "one_hot is only applicable to index tensor."; - } + CHECK_OR_RETURN(!IsFloatingDataType(input->dtype()->data_type())) + << Error::RuntimeError() << "one_hot is only applicable to index tensor."; MutableAttrMap attrs; if (num_classes == -1) { std::vector axis(input->ndim()); @@ -2241,9 +2372,10 @@ class L2NormalizeFunctor { const auto final_dim = ndims - 1; auto axis_ = axis >= 0 ? axis : axis + ndims; - CHECK_GE_OR_RETURN(axis_, 0) << "Axis should >=0 but axis is " << axis_ << " now."; - CHECK_LE_OR_RETURN(axis_, final_dim) - << "Axis should <" << ndims << " but axis is " << axis_ << " now."; + CHECK_GE_OR_RETURN(axis_, 0) << Error::RuntimeError() << "Axis should >=0 but axis is " << axis_ + << " now."; + CHECK_LE_OR_RETURN(axis_, final_dim) << Error::RuntimeError() << "Axis should < " << ndims + << " but axis is " << axis_ << " now."; MutableAttrMap attrs; JUST(attrs.SetAttr("epsilon", epsilon)); @@ -2761,6 +2893,37 @@ class FusedDotFeatureInteractionFunctor { std::vector> ops_no_output_concat_; }; +class FusedCrossFeatureInteractionFunctor { + public: + FusedCrossFeatureInteractionFunctor() { + op_ = CHECK_JUST(one::OpBuilder("fused_cross_feature_interaction") + .Input("x") + .Input("weight") + .Input("x0") + .Input("bias") + .Output("out") + .Output("matmul_result") + .Build()); + } + + Maybe operator()(const std::shared_ptr& x, + const std::shared_ptr& weight, + const std::shared_ptr& x0, + const std::shared_ptr& bias, + const std::string& interaction_mode) const { + if (interaction_mode != "vector" && interaction_mode != "matrix") { + UNIMPLEMENTED_THEN_RETURN() + << "Fused Cross Interaction mode only support `vector` and `matrix`. 
"; + } + MutableAttrMap attrs; + JUST(attrs.SetAttr("interaction_mode", interaction_mode)); + return OpInterpUtil::Dispatch(*op_, {x, weight, x0, bias}, attrs); + } + + private: + std::shared_ptr op_; +}; + class OneEmbeddingIdShuffleFunctor { public: OneEmbeddingIdShuffleFunctor() { @@ -3164,6 +3327,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("TensorDot"); m.add_functor("TensorDotIntDims"); m.add_functor("FusedMLP"); + m.add_functor("FusedMatmulBiasAddReluDropout"); m.add_functor("LayerNorm"); m.add_functor("LayerNormAffine"); m.add_functor("TFAvgPool2D"); @@ -3223,6 +3387,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("RoiAlign"); m.add_functor("RoiAlignGrad"); m.add_functor("FusedDotFeatureInteraction"); + m.add_functor("FusedCrossFeatureInteraction"); m.add_functor("OneEmbeddingIdShuffle"); m.add_functor("OneEmbeddingEmbeddingShuffle"); m.add_functor( diff --git a/oneflow/core/functional/impl/nn_grad_functor.cpp b/oneflow/core/functional/impl/nn_grad_functor.cpp index 949c3121e82..8e43b83ddb1 100644 --- a/oneflow/core/functional/impl/nn_grad_functor.cpp +++ b/oneflow/core/functional/impl/nn_grad_functor.cpp @@ -949,8 +949,11 @@ class CublasBiasAddReluMatmulGradFunctor { } Maybe operator()(const std::shared_ptr& dy, const std::shared_ptr& weight, - const std::shared_ptr& aux) const { - return OpInterpUtil::Dispatch(*op_, {dy, weight, aux}); + const std::shared_ptr& aux, + const double& alpha) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("alpha", alpha)); + return OpInterpUtil::Dispatch(*op_, {dy, weight, aux}, attrs); } private: @@ -976,6 +979,23 @@ class CublasMatmulBiasAddGradFunctor { std::shared_ptr op_; }; +class FusedReluDropoutGradFunctor { + public: + FusedReluDropoutGradFunctor() { + op_ = CHECK_JUST( + one::OpBuilder("fused_relu_dropout_grad").Input("dy").Input("mask").Output("dx").Build()); + } + Maybe operator()(const std::shared_ptr& dy, + const std::shared_ptr& mask, const float& scale) const { + MutableAttrMap attr_map; + JUST(attr_map.SetAttr("scale", scale)); + return OpInterpUtil::Dispatch(*op_, {dy, mask}, attr_map); + } + + private: + std::shared_ptr op_; +}; + class FusedDotFeatureInteractionGradFunctor { public: FusedDotFeatureInteractionGradFunctor() { @@ -1031,6 +1051,65 @@ class FusedDotFeatureInteractionGradFunctor { std::vector> ops_no_output_concat_grad_; }; +class FusedCrossFeatureInteractionV1GradFunctor { + public: + FusedCrossFeatureInteractionV1GradFunctor() { + v1_grad_op_ = CHECK_JUST(one::OpBuilder("fused_cross_feature_interaction_v1_grad") + .Input("dy") + .Input("weight") + .Input("x") + .Input("x0") + .Input("matmul_result") + .Output("dx") + .Output("dw") + .Output("dx0") + .Output("dbias") + .Build()); + } + + Maybe operator()(const std::shared_ptr& dy, + const std::shared_ptr& weight, + const std::shared_ptr& x, + const std::shared_ptr& x0, + const std::shared_ptr& matmul_result) const { + return OpInterpUtil::Dispatch(*v1_grad_op_, {dy, weight, x, x0, matmul_result}); + } + + private: + std::shared_ptr v1_grad_op_; +}; + +class FusedCrossFeatureInteractionV2GradFunctor { + public: + FusedCrossFeatureInteractionV2GradFunctor() { + v2_grad_op_ = CHECK_JUST(one::OpBuilder("fused_cross_feature_interaction_v2_grad") + .Input("dy") + .Input("weight") + .Input("bias") + .Input("x") + .Input("x0") + .Input("matmul_result") + .Output("dx") + .Output("dw") + .Output("dx0") + .Output("dbias") + .Build()); + } + + Maybe operator()(const std::shared_ptr& dy, + const std::shared_ptr& weight, + const std::shared_ptr& bias, + const 
std::shared_ptr& x, + const std::shared_ptr& x0, + const std::shared_ptr& matmul_result) const { + return OpInterpUtil::Dispatch(*v2_grad_op_, + {dy, weight, bias, x, x0, matmul_result}); + } + + private: + std::shared_ptr v2_grad_op_; +}; + } // namespace impl ONEFLOW_FUNCTION_LIBRARY(m) { @@ -1068,7 +1147,12 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("FusedScaleMaskSoftmaxDropoutGrad"); m.add_functor("CublasBiasAddReluMatmulGrad"); m.add_functor("CublasMatmulBiasAddGrad"); + m.add_functor("FusedReluDropoutGrad"); m.add_functor("FusedDotFeatureInteractionGrad"); + m.add_functor( + "FusedCrossFeatureInteractionV1Grad"); + m.add_functor( + "FusedCrossFeatureInteractionV2Grad"); }; } // namespace functional diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index 03e7c6529e7..0845a1b5b02 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -26,6 +26,9 @@ limitations under the License. #include "oneflow/core/graph/boxing/one_to_one_sub_task_graph_builder.h" #include "oneflow/core/graph/boxing/sub_task_graph_builder_util.h" #include "oneflow/core/job/sbp_parallel.h" +#include "oneflow/core/graph/nccl_send_recv_boxing_task_node.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/core/graph/task_stream_id.h" namespace oneflow { @@ -117,6 +120,27 @@ std::shared_ptr Make1DSubTskGphBuilder() { return std::make_shared(builders); } +void MergeParallelConf(const ParallelDesc& parallel_desc_0, const ParallelDesc& parallel_desc_1, + ParallelConf* parallel_conf) { + CHECK_EQ(parallel_desc_0.device_tag(), parallel_desc_1.device_tag()); + std::set> machine_device_ids; + for (int64_t machine_id : parallel_desc_0.sorted_machine_ids()) { + for (int64_t device_id : parallel_desc_0.sorted_dev_phy_ids(machine_id)) { + machine_device_ids.insert(std::make_pair(machine_id, device_id)); + } + } + for (int64_t machine_id : parallel_desc_1.sorted_machine_ids()) { + for (int64_t device_id : parallel_desc_1.sorted_dev_phy_ids(machine_id)) { + machine_device_ids.insert(std::make_pair(machine_id, device_id)); + } + } + parallel_conf->set_device_tag(parallel_desc_0.device_tag()); + for (const auto& pair : machine_device_ids) { + parallel_conf->add_device_name("@" + std::to_string(pair.first) + ":" + + std::to_string(pair.second)); + } +} + } // namespace void InOutParallelDimReduce(const ParallelDesc& in_parallel_desc, @@ -171,6 +195,66 @@ class FlatSubTskGphBuilder final : public HierarchicalSubTskGphBuilder { std::shared_ptr sub_tsk_gph_builder_; }; +class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphBuilder { + public: + OF_DISALLOW_COPY_AND_MOVE(NDNcclSendRecvBoxingSubTskGphBuilder); + NDNcclSendRecvBoxingSubTskGphBuilder() {} + ~NDNcclSendRecvBoxingSubTskGphBuilder() override = default; + + Maybe Build(SubTskGphBuilderCtx* ctx, + const std::vector& sorted_in_tasks, + std::vector* sorted_out_tasks, + std::vector>* sorted_ctrl_tasks, + const ParallelDesc& in_parallel_desc, + const ParallelDesc& out_parallel_desc, + const LogicalBlobId& lbi, const BlobDesc& logical_blob_desc, + const NdSbp& in_nd_sbp, const NdSbp& out_nd_sbp, + const Shape& time_shape) const override { + if (in_parallel_desc.device_type() == DeviceType::kCUDA + && out_parallel_desc.device_type() == DeviceType::kCUDA + && !NdSbpHasPartialParallel(out_nd_sbp)) { +#if defined(WITH_CUDA) && 
NCCL_VERSION_CODE > 2700 + ParallelConf merged_parallel_conf; + MergeParallelConf(in_parallel_desc.parallel_conf(), out_parallel_desc.parallel_conf(), + &merged_parallel_conf); + ParallelDesc merged_parallel_desc(merged_parallel_conf); + TaskNode* first_in_node = sorted_in_tasks.front(); + sorted_ctrl_tasks->resize(out_parallel_desc.parallel_num()); + FOR_RANGE(int64_t, id, 0, merged_parallel_desc.parallel_num()) { + NcclSendRecvBoxingTaskNode* node = ctx->task_graph()->NewNode(); + const int64_t machine_id = JUST(merged_parallel_desc.MachineId4ParallelId(id)); + int64_t device_index = JUST(merged_parallel_desc.DeviceId4ParallelId(id)); + int64_t thrd_id = EncodeStreamIdToInt64(GenerateNamedTaskStreamId( + machine_id, merged_parallel_desc.device_type(), device_index, "NCCL_SEND_RECV_BOXING")); + bool has_input = in_parallel_desc.Containing(machine_id, device_index); + bool has_output = out_parallel_desc.Containing(machine_id, device_index); + node->Init(machine_id, thrd_id, lbi, logical_blob_desc.shape(), + logical_blob_desc.data_type(), in_nd_sbp, out_nd_sbp, in_parallel_desc, + out_parallel_desc, id, merged_parallel_desc, has_input, has_output); + if (has_input) { + int64_t in_id = + JUST(in_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + ctx->task_graph()->ConnectWithLbi(sorted_in_tasks.at(in_id), node, lbi); + } else { + // TODO: find nearest + std::string regst_desc_name; + first_in_node->BuildCtrlRegstDesc(node, ®st_desc_name); + TaskEdge* edge = ctx->task_graph()->NewEdge(); + Connect(first_in_node, edge, node); + first_in_node->BindEdgeWithProducedRegst(edge, regst_desc_name); + } + if (has_output) { sorted_out_tasks->push_back(node); } + } + return BuildSubTskGphBuilderStatus("NDNcclSendRecvBoxingSubTskGphBuilder", ""); +#else + return Error::BoxingNotSupportedError(); +#endif + } else { + return Error::BoxingNotSupportedError(); + } + } +}; + class IntraGroupSubTskGphBuilder final : public HierarchicalSubTskGphBuilder { public: OF_DISALLOW_COPY_AND_MOVE(IntraGroupSubTskGphBuilder); @@ -350,21 +434,22 @@ class Dim0NdSbpMismatchedSubTskGphBuilder final : public HierarchicalSubTskGphBu if (in_parallel_desc.hierarchy()->NumAxes() == 2 && (*in_parallel_desc.hierarchy() == *out_parallel_desc.hierarchy()) && in_nd_sbp.sbp_parallel(0) != out_nd_sbp.sbp_parallel(0) - && in_nd_sbp.sbp_parallel(1) == out_nd_sbp.sbp_parallel(1)) { - if (!(NdSbpAllSameSplitParallel(in_nd_sbp) || NdSbpAllSameSplitParallel(out_nd_sbp))) { - return inter_group_sub_tsk_gph_builder_->Build( - ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, in_parallel_desc, - out_parallel_desc, lbi, logical_blob_desc, in_nd_sbp, out_nd_sbp, time_shape); - } else { - return Error::BoxingNotSupportedError(); - } + && in_nd_sbp.sbp_parallel(1) == out_nd_sbp.sbp_parallel(1) + && !(NdSbpAllSameSplitParallel(in_nd_sbp) || NdSbpAllSameSplitParallel(out_nd_sbp))) { + return inter_group_sub_tsk_gph_builder_->Build( + ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, in_parallel_desc, + out_parallel_desc, lbi, logical_blob_desc, in_nd_sbp, out_nd_sbp, time_shape); } else { - return Error::BoxingNotSupportedError(); + return nd_nccl_send_recv_boxing_sub_tsk_gph_builder_->Build( + ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, in_parallel_desc, + out_parallel_desc, lbi, logical_blob_desc, in_nd_sbp, out_nd_sbp, time_shape); } } private: std::unique_ptr inter_group_sub_tsk_gph_builder_; + std::unique_ptr + nd_nccl_send_recv_boxing_sub_tsk_gph_builder_; }; class 
Same2DHierarchySubTskGphBuilder final : public HierarchicalSubTskGphBuilder { @@ -391,12 +476,10 @@ class Same2DHierarchySubTskGphBuilder final : public HierarchicalSubTskGphBuilde return intra_group_sub_tsk_gph_builder_->Build( ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, in_parallel_desc, out_parallel_desc, lbi, logical_blob_desc, in_nd_sbp, out_nd_sbp, time_shape); - } else if (in_nd_sbp.sbp_parallel(1) == out_nd_sbp.sbp_parallel(1)) { + } else { return dim0_nd_sbp_mismatched_sub_tsk_gph_builder_->Build( ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, in_parallel_desc, out_parallel_desc, lbi, logical_blob_desc, in_nd_sbp, out_nd_sbp, time_shape); - } else { - return Error::BoxingNotSupportedError(); } } else { return Error::BoxingNotSupportedError(); @@ -464,6 +547,8 @@ struct DispatchHierarchicalSubTskGphBuilder::Impl { std::unique_ptr same_2d_hierarchy_sub_tsk_gph_builder_; std::unique_ptr expand_to_same_2d_hierarchy_sub_tsk_gph_builder_; + std::unique_ptr + nd_nccl_send_recv_boxing_sub_tsk_gph_builder_; }; DispatchHierarchicalSubTskGphBuilder::Impl::Impl() { @@ -471,6 +556,7 @@ DispatchHierarchicalSubTskGphBuilder::Impl::Impl() { same_2d_hierarchy_sub_tsk_gph_builder_.reset(new Same2DHierarchySubTskGphBuilder()); expand_to_same_2d_hierarchy_sub_tsk_gph_builder_.reset( new ExpandToSame2DHierarchySubTskGphBuilder()); + nd_nccl_send_recv_boxing_sub_tsk_gph_builder_.reset(new NDNcclSendRecvBoxingSubTskGphBuilder()); } DispatchHierarchicalSubTskGphBuilder::DispatchHierarchicalSubTskGphBuilder() { @@ -495,6 +581,14 @@ Maybe DispatchHierarchicalSubTskGphBuilder::Build( &reduced_out_nd_sbp); const auto& in_hierarchy = reduced_in_parallel_desc.hierarchy(); const auto& out_hierarchy = reduced_out_parallel_desc.hierarchy(); + if ((in_hierarchy->NumAxes() > 2 || out_hierarchy->NumAxes() > 2) + && reduced_in_parallel_desc.device_type() == DeviceType::kCUDA + && reduced_out_parallel_desc.device_type() == DeviceType::kCUDA) { + return impl_->nd_nccl_send_recv_boxing_sub_tsk_gph_builder_->Build( + ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, reduced_in_parallel_desc, + reduced_out_parallel_desc, lbi, logical_blob_desc, reduced_in_nd_sbp, reduced_out_nd_sbp, + time_shape); + } if (in_hierarchy->NumAxes() <= 2 && out_hierarchy->NumAxes() <= 2) { if (in_hierarchy->NumAxes() == 1 && out_hierarchy->NumAxes() == 1) { return impl_->flat_sub_tsk_gph_builder_->Build( @@ -513,6 +607,12 @@ Maybe DispatchHierarchicalSubTskGphBuilder::Build( ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, reduced_in_parallel_desc, reduced_out_parallel_desc, lbi, logical_blob_desc, reduced_in_nd_sbp, reduced_out_nd_sbp, time_shape); + } else if (reduced_in_parallel_desc.device_type() == DeviceType::kCUDA + && reduced_out_parallel_desc.device_type() == DeviceType::kCUDA) { + return impl_->nd_nccl_send_recv_boxing_sub_tsk_gph_builder_->Build( + ctx, sorted_in_tasks, sorted_out_tasks, sorted_ctrl_tasks, reduced_in_parallel_desc, + reduced_out_parallel_desc, lbi, logical_blob_desc, reduced_in_nd_sbp, reduced_out_nd_sbp, + time_shape); } else { return Error::BoxingNotSupportedError(); } diff --git a/oneflow/core/graph/exec_graph.cpp b/oneflow/core/graph/exec_graph.cpp index 6fa2777a1e8..b530f135b28 100644 --- a/oneflow/core/graph/exec_graph.cpp +++ b/oneflow/core/graph/exec_graph.cpp @@ -14,6 +14,8 @@ See the License for the specific language governing permissions and limitations under the License. 
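// [Editor's note] A rough decision table for the dispatcher changes above: hierarchies deeper
// than two axes, and CUDA cases that the existing 1-d/2-d builders decline, are now routed to
// the new NDNcclSendRecvBoxingSubTskGphBuilder. The enum and function below are illustrative
// stand-ins, not OneFlow types, and they gloss over the same-hierarchy / same-element-count
// preconditions spelled out in the real code.
enum class BuilderChoice { kFlat, kSame2DHierarchy, kExpandToSame2D, kNdNcclSendRecv, kNotSupported };

BuilderChoice DispatchSketch(int in_axes, int out_axes, bool both_cuda) {
  if (in_axes > 2 || out_axes > 2) {
    return both_cuda ? BuilderChoice::kNdNcclSendRecv : BuilderChoice::kNotSupported;
  }
  if (in_axes == 1 && out_axes == 1) { return BuilderChoice::kFlat; }
  if (in_axes == 2 && out_axes == 2) { return BuilderChoice::kSame2DHierarchy; }  // same hierarchy assumed
  if (in_axes == 1 || out_axes == 1) { return BuilderChoice::kExpandToSame2D; }   // equal elem_cnt assumed
  return both_cuda ? BuilderChoice::kNdNcclSendRecv : BuilderChoice::kNotSupported;
}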
*/ #include "oneflow/core/graph/exec_graph.h" +#include +#include "oneflow/core/common/just.h" #include "oneflow/core/graph/op_graph.h" namespace oneflow { @@ -92,9 +94,10 @@ Maybe CheckPhysicalBlobDesc( continue; } if (*JUST(op.GetParallelDesc4BnInOp(bn)) == *op_parallel_desc) { - JUST(CheckPhysicalBlobDesc(*JUST(GetLogicalBlobDesc(bn)), - nd_sbp_signature->bn_in_op2nd_sbp().at(bn), *op_parallel_desc, - parallel_ctx, *physical_blob_desc)); + JUST_MSG(CheckPhysicalBlobDesc(*JUST(GetLogicalBlobDesc(bn)), + nd_sbp_signature->bn_in_op2nd_sbp().at(bn), *op_parallel_desc, + parallel_ctx, *physical_blob_desc), + std::stringstream() << " check physical shape failed, op name " << op.op_loc()); } } return Maybe::Ok(); @@ -114,15 +117,18 @@ void ExecNode::InferBlobDescs(const ParallelContext* parallel_ctx) { std::bind(&Operator::GetLogicalBlobDesc4Ibn, op().get(), std::placeholders::_1), nd_sbp_signature, parallel_ctx, GetBlobDesc4BnInOp)); } - CHECK_JUST(op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc())); + CHECK_JUST_MSG(op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()), + std::stringstream() << " infer blob descs if failed, op name " << op_->op_loc()); if (op_node != nullptr && parallel_ctx->parallel_num() > 1 && nd_sbp_signature != nullptr) { CHECK_JUST(CheckPhysicalBlobDesc( *op(), op()->output_bns(), std::bind(&Operator::GetLogicalBlobDesc4Obn, op().get(), std::placeholders::_1), nd_sbp_signature, parallel_ctx, GetBlobDesc4BnInOp)); } - CHECK_JUST(op_->InferInplaceObn2IbnIf(&mut_inplace_obn2ibn_, &con_inplace_obn2ibn_, - GetBlobDesc4BnInOp, parallel_ctx)); + CHECK_JUST_MSG(op_->InferInplaceObn2IbnIf(&mut_inplace_obn2ibn_, &con_inplace_obn2ibn_, + GetBlobDesc4BnInOp, parallel_ctx), + std::stringstream() + << " infer inplace obn to ibn if failed, op name " << op_->op_loc()); } std::function ExecNode::GetBlobDesc4BnInOpFunc() const { diff --git a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp new file mode 100644 index 00000000000..95438c6d2b2 --- /dev/null +++ b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp @@ -0,0 +1,92 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/to_string.h" +#include "oneflow/core/graph/nccl_send_recv_boxing_task_node.h" + +namespace oneflow { + +void NcclSendRecvBoxingTaskNode::Init(int64_t machine_id, int64_t thrd_id, const LogicalBlobId& lbi, + const Shape& logical_shape, const DataType& data_type, + const NdSbp& src_nd_sbp, const NdSbp& dst_nd_sbp, + const ParallelDesc& src_parallel_desc, + const ParallelDesc& dst_parallel_desc, + const int64_t parallel_id, const ParallelDesc& parallel_desc, + const bool has_input, const bool has_output) { + set_machine_id(machine_id); + set_thrd_id(thrd_id); + set_lbi(lbi); + logical_shape_ = logical_shape; + src_nd_sbp_ = src_nd_sbp; + dst_nd_sbp_ = dst_nd_sbp; + src_parallel_conf_ = src_parallel_desc.parallel_conf(); + dst_parallel_conf_ = dst_parallel_desc.parallel_conf(); + parallel_conf_ = parallel_desc.parallel_conf(); + parallel_ctx_.set_parallel_id(parallel_id); + parallel_ctx_.set_parallel_num(parallel_desc.parallel_num()); + has_input_ = has_input; + has_output_ = has_output; + data_type_ = data_type; +} + +void NcclSendRecvBoxingTaskNode::ProduceAllRegstsAndBindEdges() { + if (has_output_) { + std::shared_ptr out_regst = ProduceRegst("out", true, 1, 1); + this->ForEachOutDataEdge([&](TaskEdge* out_dege) { out_dege->AddRegst("out", out_regst); }); + } + ProduceRegst("tmp", true); +} + +void NcclSendRecvBoxingTaskNode::ConsumeAllRegsts() { + this->ForEachInDataEdge( + [&](TaskEdge* in_edge) { ConsumeRegst("in", SoleInDataEdge()->GetSoleRegst()); }); +} + +void NcclSendRecvBoxingTaskNode::BuildExecGphAndRegst() { + ExecNode* node = mut_exec_gph().NewNode(); + OperatorConf op_conf; + op_conf.set_name("System-Nccl-Send-Recv-Boxing-" + NewUniqueId()); + op_conf.set_device_tag(*CHECK_JUST(DeviceTag4DeviceType(this->device_type()))); + auto* nccl_send_recv_boxing_conf = op_conf.mutable_nccl_send_recv_boxing_conf(); + *nccl_send_recv_boxing_conf->mutable_lbi() = lbi(); + logical_shape_.ToProto(nccl_send_recv_boxing_conf->mutable_logical_shape()); + nccl_send_recv_boxing_conf->set_data_type(data_type_); + *nccl_send_recv_boxing_conf->mutable_src_nd_sbp() = src_nd_sbp_; + *nccl_send_recv_boxing_conf->mutable_dst_nd_sbp() = dst_nd_sbp_; + *nccl_send_recv_boxing_conf->mutable_parallel_conf() = parallel_conf_; + *nccl_send_recv_boxing_conf->mutable_src_parallel_conf() = src_parallel_conf_; + *nccl_send_recv_boxing_conf->mutable_dst_parallel_conf() = dst_parallel_conf_; + nccl_send_recv_boxing_conf->set_has_input(has_input_); + nccl_send_recv_boxing_conf->set_has_output(has_output_); + std::shared_ptr sole_op = CHECK_JUST(ConstructOp(op_conf)); + node->mut_op() = sole_op; + if (has_input_) { node->BindBnWithRegst(sole_op->SoleIbn(), GetSoleConsumedRegst("in")); } + if (has_output_) { + std::shared_ptr out_regst = GetProducedRegst("out"); + out_regst->AddLbi(sole_op->BnInOp2Lbi(sole_op->SoleObn())); + node->BindBnWithRegst(sole_op->SoleObn(), out_regst); + } + node->AddBnToRegstAndBindIt(&Operator::tmp_bns, GetProducedRegst("tmp")); + node->InferBlobDescs(parallel_ctx()); +} + +void NcclSendRecvBoxingTaskNode::InferProducedDataRegstTimeShape() { + auto out_regst = GetProducedRegst("out"); + if (out_regst != nullptr) { out_regst->mut_data_regst_time_shape()->reset(new Shape({1, 1})); } + auto tmp_regst = GetProducedRegst("tmp"); + tmp_regst->mut_data_regst_time_shape()->reset(new Shape({1, 1})); +} + +} // namespace oneflow diff --git a/oneflow/core/graph/nccl_send_recv_boxing_task_node.h b/oneflow/core/graph/nccl_send_recv_boxing_task_node.h new file mode 
100644 index 00000000000..fee688222ca --- /dev/null +++ b/oneflow/core/graph/nccl_send_recv_boxing_task_node.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_GRAPH_NCCL_SEND_RECV_BOXING_TASK_NODE_H_ +#define ONEFLOW_CORE_GRAPH_NCCL_SEND_RECV_BOXING_TASK_NODE_H_ + +#include "oneflow/core/graph/transport_task_node.h" + +namespace oneflow { + +class NcclSendRecvBoxingTaskNode : public TransportTaskNode { + public: + OF_DISALLOW_COPY_AND_MOVE(NcclSendRecvBoxingTaskNode); + NcclSendRecvBoxingTaskNode() = default; + ~NcclSendRecvBoxingTaskNode() override = default; + + void Init(int64_t machine_id, int64_t thrd_id, const LogicalBlobId& lbi, + const Shape& logical_shape, const DataType& data_type, const NdSbp& src_nd_sbp, + const NdSbp& dst_nd_sbp, const ParallelDesc& src_parallel_desc, + const ParallelDesc& dst_parallel_desc, const int64_t parallel_id, + const ParallelDesc& parallel_desc, const bool has_input, const bool has_output); + TaskType GetTaskType() const override { return TaskType::kNcclSendRecvBoxing; } + const ParallelContext* parallel_ctx() const override { return ¶llel_ctx_; } + + private: + void BuildExecGphAndRegst() override; + void ProduceAllRegstsAndBindEdges() override; + void ConsumeAllRegsts() final; + void InferProducedDataRegstTimeShape() final; + + Shape logical_shape_; + DataType data_type_; + NdSbp src_nd_sbp_; + NdSbp dst_nd_sbp_; + ParallelConf src_parallel_conf_; + ParallelConf dst_parallel_conf_; + ParallelConf parallel_conf_; + ParallelContext parallel_ctx_; + bool has_input_; + bool has_output_; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_GRAPH_NCCL_SEND_RECV_BOXING_TASK_NODE_H_ diff --git a/oneflow/core/graph/op_graph.cpp b/oneflow/core/graph/op_graph.cpp index 09df4962fcb..4bd88e55f5f 100644 --- a/oneflow/core/graph/op_graph.cpp +++ b/oneflow/core/graph/op_graph.cpp @@ -443,6 +443,9 @@ void OpGraph::ForEachDataAndCtrlInNode(OpNode* node, const std::function& Handler) const { node->ForEachNodeOnInEdge(Handler); for (const auto& ctrl_in_op_name : node->op().op_conf().ctrl_in_op_name()) { + CHECK(op_name2op_node_.find(ctrl_in_op_name) != op_name2op_node_.end()) + << " cannot find ctrl_in_op_name: [" << ctrl_in_op_name << "] of op: [" + << node->op().op_name() << "] in OpGraph. 
"; Handler(op_name2op_node_.at(ctrl_in_op_name)); } } @@ -453,6 +456,9 @@ void OpGraph::ForEachDataAndCtrlOutNode(OpNode* node, const auto& op_name_it = producer_op_name2ctrl_consumer_op_names_.find(node->op().op_name()); if (op_name_it == producer_op_name2ctrl_consumer_op_names_.end()) { return; } for (const std::string& ctrl_consumer_op_name : op_name_it->second) { + CHECK(op_name2op_node_.find(ctrl_consumer_op_name) != op_name2op_node_.end()) + << " cannot find ctrl_consumer_op_name: [" << ctrl_consumer_op_name << "] of op: [" + << node->op().op_name() << "] in OpGraph."; Handler(op_name2op_node_.at(ctrl_consumer_op_name)); } } diff --git a/oneflow/core/graph/task_graph.cpp b/oneflow/core/graph/task_graph.cpp index 5fd69c40274..040e113ad14 100644 --- a/oneflow/core/graph/task_graph.cpp +++ b/oneflow/core/graph/task_graph.cpp @@ -721,6 +721,12 @@ DEFINE_BLD_SUB_TASK_GRAPH_METHOD(BldSubTskGphByBoxing) { const ParallelDesc& src_parallel_desc = src_op_node->parallel_desc(); const ParallelDesc& dst_parallel_desc = dst_op_node->parallel_desc(); const BlobDesc& blob_desc = src_op_node->LogicalBlobDesc4Lbi(lbi); + VLOG(3) << "src op: " << src_op_node->op().op_name() + << " dst op: " << dst_op_node->op().op_name() + << " src_parallel_conf: " << src_parallel_desc.parallel_conf().DebugString() + << " dst parallel conf: " << dst_parallel_desc.parallel_conf().DebugString() + << " src_nd_sbp " << src_nd_sbp.DebugString() << " dst nd_sbp " + << dst_nd_sbp.DebugString(); auto status = CHECK_JUST(hierarchical_sub_tsk_gph_builder_->Build( sub_tsk_gph_builder_ctx_.get(), in_nodes, &out_nodes, &sorted_ctrl_tasks, src_parallel_desc, dst_parallel_desc, lbi, blob_desc, src_nd_sbp, dst_nd_sbp, diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index d8b77cdbb72..959a7837010 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -71,6 +71,9 @@ void CreateNcclComm(ncclComm_t* comm, const int dev, const std::string& key, << ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank << ", key = {" << key << "}\n"; OF_NCCL_CHECK(ncclCommInitRank(comm, device_vec.size(), nccl_unique_id, rank)); + VLOG(2) << " EagerNcclCommMgr::ncclCommInitRank succeed device_vec.size() = " << device_vec.size() + << ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank + << ", key = {" << key << "}\n"; } } // namespace diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 5afc24192da..8ae659fd541 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -997,13 +997,19 @@ Maybe LazyJobBuildAndInferCtx::Complete() { } }; int32_t pass_cnt = 0; + const int64_t prev_v = FLAGS_v; auto DoPass = [&](const std::string& pass_name, int32_t cnt = 0) -> Maybe { + VLOG(1) << job_name << " is compiling with pass" + << " pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + << (cnt > 0 ? std::to_string(cnt) : ""); if (unlikely(NeedLogJob(pass_name))) { std::string cnt_str = cnt > 0 ? std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-before"); + FLAGS_v = 3; } JUST(JobPass4Name(pass_name)(mut_job(), &job_pass_ctx)); if (unlikely(NeedLogJob(pass_name))) { + FLAGS_v = prev_v; std::string cnt_str = cnt > 0 ? 
std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-after"); } diff --git a/oneflow/core/job/job_builder.cpp b/oneflow/core/job/job_builder.cpp index a7f81384376..b13bd8a67fd 100644 --- a/oneflow/core/job/job_builder.cpp +++ b/oneflow/core/job/job_builder.cpp @@ -18,6 +18,7 @@ limitations under the License. #include "oneflow/core/common/util.h" #include "oneflow/core/common/container_util.h" #include "oneflow/core/job/job.pb.h" +#include "oneflow/core/job/sbp_parallel.pb.h" #include "oneflow/core/operator/operator.h" namespace oneflow { diff --git a/oneflow/core/job/job_conf.proto b/oneflow/core/job/job_conf.proto index 69aa7ad29f0..03638feec30 100644 --- a/oneflow/core/job/job_conf.proto +++ b/oneflow/core/job/job_conf.proto @@ -211,6 +211,7 @@ message JobConfigProto { optional bool enable_gradients_stats_aggregation = 106 [default = true]; optional string optimizer_placement_optimization_mode = 107; optional int64 optimizer_placement_optimization_threshold = 108 [default = 1024]; + optional int64 optimizer_placement_optimization_shard_restore_level = 110 [default = 2]; optional QatConfig qat_config = 109; diff --git a/oneflow/core/job/job_ir.cpp b/oneflow/core/job/job_ir.cpp index 792735a0354..f5552b92514 100644 --- a/oneflow/core/job/job_ir.cpp +++ b/oneflow/core/job/job_ir.cpp @@ -19,6 +19,10 @@ namespace oneflow { #ifndef WITH_MLIR +Maybe ConvertJobToTosaIR(Job* job) { + UNIMPLEMENTED_THEN_RETURN() << "ConvertJobToTosaIR is only supported WITH_MLIR"; +} + Maybe SaveJobToIR(Job* job, const std::string& path) { UNIMPLEMENTED_THEN_RETURN() << "SaveJobToIR is only supported WITH_MLIR"; } diff --git a/oneflow/core/job/job_ir.h b/oneflow/core/job/job_ir.h index c57d0eebeb8..7dbd8da0c31 100644 --- a/oneflow/core/job/job_ir.h +++ b/oneflow/core/job/job_ir.h @@ -21,6 +21,7 @@ limitations under the License. 
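// [Editor's note] The DoPass lambda above raises FLAGS_v to 3 while a pass selected for logging
// runs and restores the previous value afterwards. A scoped guard is one way to express the same
// save/restore idea; this sketch is illustrative only (the patch toggles the flag inline) and
// takes the flag by reference rather than naming the real glog/gflags symbol.
#include <cstdint>

class ScopedVerbositySketch {
 public:
  ScopedVerbositySketch(int32_t& flag, int32_t level) : flag_(flag), saved_(flag) { flag_ = level; }
  ~ScopedVerbositySketch() { flag_ = saved_; }
  ScopedVerbositySketch(const ScopedVerbositySketch&) = delete;
  ScopedVerbositySketch& operator=(const ScopedVerbositySketch&) = delete;

 private:
  int32_t& flag_;
  int32_t saved_;
};
// hypothetical usage: { ScopedVerbositySketch guard(verbosity_flag, 3); /* run and log the pass */ }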
namespace oneflow { +Maybe ConvertJobToTosaIR(Job* job); Maybe SaveJobToIR(Job* job, const std::string& path); Maybe LoadJobFromIR(Job* job, const std::string& path); diff --git a/oneflow/core/job/plan_util.cpp b/oneflow/core/job/plan_util.cpp index 44adaf07d6a..dff5faa8065 100644 --- a/oneflow/core/job/plan_util.cpp +++ b/oneflow/core/job/plan_util.cpp @@ -898,7 +898,6 @@ void PlanUtil::PlanMemoryLog(Plan* plan, const std::string& plan_name) { }; std::sort(ordered_tasks.begin(), ordered_tasks.end(), CompTask); - // HashMap rank2memory_info; std::vector rank_device_memory_infos(GlobalProcessCtx::WorldSize(), RankDeviceMemoryInfo()); HashMap mem_block_id2info; @@ -985,14 +984,135 @@ void PlanUtil::PlanMemoryLog(Plan* plan, const std::string& plan_name) { CHECK(mem_block_id2info.find(mem_block_id) != mem_block_id2info.end()); const auto& mem_block_info = mem_block_id2info.at(mem_block_id); for (int64_t i = 0; i < mem_block_info.ordered_op_names.size(); ++i) { - VLOG(3) << " In MemBlock id: " << mem_block_id << " order: " << i - << " op_name: " << mem_block_info.ordered_op_names.at(i); + VLOG(3) << " In Chunk id: " << chunk_id << " MemBlock id: " << mem_block_id + << " order: " << i << " op_name: " << mem_block_info.ordered_op_names.at(i); } } } } } +void PlanUtil::GenLightPlan(Plan* plan, const std::string& plan_name) { + std::vector ordered_tasks; + for (const TaskProto& task : plan->task()) { ordered_tasks.push_back(&task); } + auto CompTask = [](const TaskProto* a, const TaskProto* b) { + return a->task_set_info().order_in_graph() < b->task_set_info().order_in_graph(); + }; + std::sort(ordered_tasks.begin(), ordered_tasks.end(), CompTask); + + HashMap task_id2name; + HashMap task_id2proto; + HashMap regst_id2name; + HashMap regst_id2proto; + for (const auto* task : ordered_tasks) { + const auto& exec_seq = task->exec_sequence(); + std::string name; + if (exec_seq.exec_node_size() >= 1) { + const auto& kernel_conf = task->exec_sequence().exec_node(0).kernel_conf(); + if (kernel_conf.has_op_attribute_ref()) { + name = kernel_conf.op_attribute_ref(); + } else { + name = kernel_conf.op_attribute().op_conf().name(); + } + } else { + name = TaskType_Name(task->task_type()); + } + task_id2name.emplace(task->task_id(), name); + task_id2proto.emplace(task->task_id(), task); + CHECK(!name.empty()); + for (const auto& pair : task->produced_regst_desc()) { + std::string regst_name = name + "/" + pair.first; + regst_id2name.emplace(pair.second.regst_desc_id(), regst_name); + regst_id2proto.emplace(pair.second.regst_desc_id(), pair.second); + } + } + + auto RegstId2TensorStr = [&](int64_t regst_id) { + std::string ret; + CHECK(regst_id2proto.find(regst_id) != regst_id2proto.end()) + << " regst_id2proto cannot find: " << regst_id; + const RegstDescProto& regst = regst_id2proto.at(regst_id); + ret += " regst_num: " + std::to_string(regst.register_num()); + std::string mem = ", cpu "; + if (regst.mem_case().has_device_cuda_mem()) { mem = ", cuda "; } + ret += mem; + if (regst.regst_desc_type().has_data_regst_desc()) { + const DataRegstDesc& data = regst.regst_desc_type().data_regst_desc(); + ret += ", time_shape: " + Shape(data.time_shape()).ToString(); + const BlobDescProto& blob = data.lbi2blob_desc(0).blob_desc(); + ret += ", shape: " + Shape(blob.shape()).ToString() + + " , dtype: " + DataType_Name(blob.data_type()); + } else { + ret += ", ctrl "; + } + return ret; + }; + std::vector> rank2ordered_task(GlobalProcessCtx::WorldSize(), + std::vector()); + for (const auto* task : ordered_tasks) { + 
CHECK_LT(task->machine_id(), rank2ordered_task.size()); + rank2ordered_task.at(task->machine_id()).push_back(task); + } + for (int64_t rank = 0; rank < GlobalProcessCtx::WorldSize(); ++rank) { + auto file_stream = + TeePersistentLogStream::Create(plan_name + "_rank_" + std::to_string(rank) + "_light_plan"); + file_stream << "rank : " << std::to_string(rank) << "\n"; + CHECK_LT(rank, rank2ordered_task.size()); + const auto& ordered_task_in_rank = rank2ordered_task.at(rank); + for (int64_t i = 0; i < ordered_task_in_rank.size(); ++i) { + CHECK_LT(i, ordered_task_in_rank.size()); + const auto* task = ordered_task_in_rank.at(i); + int64_t task_id = task->task_id(); + CHECK(task_id2name.find(task_id) != task_id2name.end()) + << " task_id2name cannot find" << task_id; + int64_t thrd_id = task->thrd_id(); + StreamId stream_id = DecodeStreamIdFromInt64(thrd_id); + file_stream << "order : " << std::to_string(i) << " , actor id : " << std::to_string(task_id) + << " name : " << task_id2name.at(task_id) << " thrd : " << std::to_string(thrd_id) + << " device_type : " << DeviceType_Name(stream_id.device_type()) + << " stream_index : " << std::to_string(stream_id.stream_index()) << " {\n"; + for (const auto& key2consume_regst : task->consumed_regst_desc_id()) { + std::string key = key2consume_regst.first; + for (int64_t consume_regst_id : key2consume_regst.second.regst_desc_id()) { + std::string other_rank_str = ""; + CHECK(regst_id2proto.find(consume_regst_id) != regst_id2proto.end()) + << " regst_id2proto cannot find: " << consume_regst_id; + int64_t consume_task_id = regst_id2proto.at(consume_regst_id).producer_task_id(); + CHECK(task_id2proto.find(consume_task_id) != task_id2proto.end()) + << " task_id2proto cannot find: " << consume_task_id; + int64_t other_rank = task_id2proto.at(consume_task_id)->machine_id(); + if (other_rank != rank) { other_rank_str = " , rank: " + std::to_string(other_rank); } + CHECK(regst_id2name.find(consume_regst_id) != regst_id2name.end()) + << " regst_id2name cannot find: " << consume_regst_id; + file_stream << " consume : " << key << " : <- [ " << regst_id2name.at(consume_regst_id) + << " ] ( actor_id: " << std::to_string(consume_task_id) << other_rank_str + << ", regst: " << RegstId2TensorStr(consume_regst_id) << " )\n"; + } + } + for (const auto& key2produce_regst : task->produced_regst_desc()) { + const RegstDescProto& regst = key2produce_regst.second; + file_stream << " produce : " << key2produce_regst.first + << " regst: " << RegstId2TensorStr(regst.regst_desc_id()) << " {\n"; + for (int64_t consumer_task_id : regst.consumer_task_id()) { + std::string other_rank_str = ""; + CHECK(task_id2proto.find(consumer_task_id) != task_id2proto.end()) + << " task_id2proto cannot find " << consumer_task_id; + CHECK(task_id2name.find(consumer_task_id) != task_id2name.end()) + << " task_id2name cannot find " << consumer_task_id; + int64_t other_rank = task_id2proto.at(consumer_task_id)->machine_id(); + if (other_rank != rank) { other_rank_str = " , rank: " + std::to_string(other_rank); } + file_stream << " -> [ " << task_id2name.at(consumer_task_id) + << " ] ( actor_id: " << std::to_string(consumer_task_id) << other_rank_str + << " )\n"; + } + file_stream << " }\n"; + } + + file_stream << "}\n"; + } + } +} + const oneflow::OpAttribute& PlanUtil::GetOpAttribute(const Plan* plan, int64_t job_id, const oneflow::KernelConf& kernel_conf) { if (kernel_conf.has_op_attribute()) { diff --git a/oneflow/core/job/plan_util.h b/oneflow/core/job/plan_util.h index 2d58588be1e..6c45db300e0 
100644 --- a/oneflow/core/job/plan_util.h +++ b/oneflow/core/job/plan_util.h @@ -39,6 +39,7 @@ struct PlanUtil { static void DumpCtrlRegstInfoToPlan(Plan* plan); static void GenCollectiveBoxingPlan(Job* job, Plan* plan); static void GenRegisterHint(Plan* plan); + static void GenLightPlan(Plan* plan, const std::string& plan_name); static void PlanMemoryLog(Plan* plan, const std::string& plan_name); static const oneflow::OpAttribute& GetOpAttribute(const Plan* plan, int64_t job_id, const oneflow::KernelConf& kernel_conf); diff --git a/oneflow/core/job/task.proto b/oneflow/core/job/task.proto index e4df1c4a0db..2fb82cc1ab9 100644 --- a/oneflow/core/job/task.proto +++ b/oneflow/core/job/task.proto @@ -38,6 +38,7 @@ enum TaskType { kSspVariableProxy = 63; kBoxingZeros = 64; kCriticalSectionWaitTick = 65; + kNcclSendRecvBoxing = 66; }; message RegstDescIdSet { diff --git a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp index 4609bea6e69..d51c171df19 100644 --- a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp +++ b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp @@ -27,6 +27,7 @@ const AMPList& AutoMixedPrecisionLists::WhiteList() { "prelu", "tf_prelu", "cublas_fused_mlp", + "fused_matmul_bias_add_relu_dropout", "fused_dot_feature_interaction", "embedding_lookup_placeholder"}; return white_list; @@ -89,6 +90,7 @@ const AMPList& AutoMixedPrecisionLists::ClearList() { "identity", "flatten", "squeeze", + "embedding", "expand_dims", "cast_to_static_shape", "parallel_cast", diff --git a/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp b/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp index 1ca857fd11f..522cf44305a 100644 --- a/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp +++ b/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp @@ -13,10 +13,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include "oneflow/core/common/util.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/framework/user_op_conf.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/core/job/sbp_parallel.h" +#include "oneflow/core/job/sbp_parallel.pb.h" #include "oneflow/core/job_rewriter/job_pass.h" #include "oneflow/core/graph/op_graph.h" #include "oneflow/core/job/job_desc.h" +#include "oneflow/core/operator/op_conf.pb.h" +#include "oneflow/core/operator/operator.h" namespace oneflow { @@ -31,7 +40,7 @@ int64_t GetSoleOutBlobSize(const OpNode* node) { class DataParallelNodeSequence final { public: DataParallelNodeSequence(std::vector nodes, int64_t order) - : nodes_(std::move(nodes)), order_(order) { + : nodes_(std::move(nodes)), order_(order), len_(nodes_.size()) { const OpNode* var_node = nodes_.front(); CHECK(var_node->op().op_conf().has_variable_conf()); model_size_ = GetSoleOutBlobSize(var_node); @@ -50,13 +59,23 @@ class DataParallelNodeSequence final { int64_t model_size() const { return model_size_; } + int64_t len() const { return len_; } + + void resize(const int64_t size) { + CHECK(size <= len_); + CHECK(size > 1); + nodes_.resize(size); + len_ = nodes().size(); + } + private: std::vector nodes_; int64_t order_; int64_t model_size_; + int64_t len_; }; -using SequencePtr = std::shared_ptr; +using SequencePtr = std::shared_ptr; ParallelConf NonDistributedParallelConf4ParallelId(const ParallelDesc& pd, const int64_t parallel_id) { @@ -76,7 +95,6 @@ Maybe GetDataParallelVariableAndNaiveSuccNode( // Find sequence like: vairable -> cast_fp32_to_fp16 if (!start->op().op_conf().has_variable_conf()) { return Maybe::Ok(); } const ParallelDesc& pd = start->parallel_desc(); - if (pd.device_type() != DeviceType::kCUDA) { return Maybe::Ok(); } if (pd.parallel_num() == 1) { return Maybe::Ok(); } const OpNode* cur_node = start; while (cur_node != nullptr) { @@ -85,12 +103,21 @@ Maybe GetDataParallelVariableAndNaiveSuccNode( if (cur_node->in_edges().size() > 1) { break; } if (cur_node->op().input_bns().size() != 1) { break; } const std::string& sole_ibn = cur_node->op().SoleIbn(); - if (!cur_node->SbpParallel4BnInOp(sole_ibn).has_broadcast_parallel()) { break; } + const NdSbp& ibn_nd_sbp = cur_node->NdSbp4BnInOp(sole_ibn); + bool has_broadcast = false; + FOR_RANGE(int, i, 0, ibn_nd_sbp.sbp_parallel_size()) { + if (ibn_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { has_broadcast = true; }; + } + if (!has_broadcast) { break; } } - if (!IsAllowed(cur_node)) { break; } if (cur_node->op().output_bns().size() != 1) { break; } const std::string& sole_obn = cur_node->op().SoleObn(); - if (!cur_node->SbpParallel4BnInOp(sole_obn).has_broadcast_parallel()) { break; } + const NdSbp& obn_nd_sbp = cur_node->NdSbp4BnInOp(sole_obn); + bool has_broadcast = false; + FOR_RANGE(int, i, 0, obn_nd_sbp.sbp_parallel_size()) { + if (obn_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { has_broadcast = true; }; + } + if (!has_broadcast) { break; } out->emplace_back(cur_node); if (cur_node->out_edges().size() == 1) { cur_node = cur_node->SoleOutEdge()->dst_node(); @@ -123,6 +150,79 @@ void SetBroadcastParallel4Consumers(JobBuilder* builder, const SequencePtr& sequ }); } +void SetNdSbp4OpNodeIbn(JobBuilder* builder, const OpNode* node, const std::string& ibn, + const NdSbp& nd_sbp) { + OpBlobArg op_blob_arg; + op_blob_arg.set_op_name(node->op().op_name()); + op_blob_arg.set_bn_in_op(ibn); + builder->SetNdSbp4Oba(op_blob_arg, nd_sbp); +} + +void SetNdSbp4Consumers(JobBuilder* 
builder, const SequencePtr& sequence, const NdSbp& nd_sbp) {
+ const OpNode* node = sequence->GetLastNode();
+ const LogicalBlobId& lbi = node->op().BnInOp2Lbi(node->op().SoleObn());
+ const int64_t shard_restore_level =
+ builder->job().job_conf().optimizer_placement_optimization_shard_restore_level();
+ // If shard_restore_level == 0, no limit on consumers
+ if (shard_restore_level == 1) {
+ // Input lbn for parallel cast op
+ std::string parallel_cast_input_lbn = GenLogicalBlobName(lbi);
+ // Add identity to enable mem reuse of boxing op when there is no op between var op and boxing.
+ if (sequence->len() == 1) {
+ VLOG(3) << "ZeRO found a data-parallel sequence with only one variable "
+ << sequence->GetVariableNode()->op().op_name();
+ const auto var_identity_op =
+ user_op::UserOpConfWrapperBuilder("System-ZeRO-Identity-" + node->op().op_name() + "-"
+ + NewUniqueId())
+ .Op("identity")
+ .Input("in", GenLogicalBlobName(lbi))
+ .Output("out")
+ .ScopeSymbolId(node->op().op_conf().scope_symbol_id())
+ .Build();
+ builder->AddOps(node->parallel_desc().parallel_conf(), {var_identity_op.op_conf()});
+ parallel_cast_input_lbn = var_identity_op.output("out", 0);
+ }
+ // Add parallel cast op to put a soft limit on consumers to consume the weight with Broadcast SBP.
+ const auto parallel_cast_op =
+ user_op::UserOpConfWrapperBuilder("System-ZeRO-ParallelCast-" + node->op().op_name() + "-"
+ + NewUniqueId())
+ .Op("hierarchical_parallel_cast")
+ .Input("in", parallel_cast_input_lbn)
+ .Output("out")
+ .Attr<std::vector<std::string>>("nd_sbp", NdSbpToStringList(nd_sbp))
+ .Attr<std::string>("grad_mode", "identity") // don't do nd_sbp cast at backward
+ .Attr<std::vector<std::string>>("grad_nd_sbp", std::vector<std::string>())
+ .ScopeSymbolId(node->op().op_conf().scope_symbol_id())
+ .Build();
+ builder->AddOps(node->parallel_desc().parallel_conf(), {parallel_cast_op.op_conf()});
+
+ // Make consumers consume the parallel cast op's output
+ auto out_lbn = parallel_cast_op.output("out", 0);
+ node->ForEachNodeOnOutEdge([&](const OpNode* out_node) {
+ for (const std::string& ibn : out_node->op().input_bns()) {
+ if (out_node->op().BnInOp2Lbi(ibn) == lbi) {
+ if (!CHECK_JUST(builder->IsInMutOpTransaction(out_node->op().op_name()))) {
+ CHECK_JUST(builder->MutOpTransactionMut(out_node->op().op_conf()));
+ }
+ OperatorConf& mut_consumer_op =
+ CHECK_JUST(builder->MutOpTransactionGet(out_node->op().op_name()));
+ const auto& old_lbn = ReplaceInputLbnInOpCustomizedConf(&mut_consumer_op, ibn, out_lbn);
+ CHECK_EQ(old_lbn, GenLogicalBlobName(lbi));
+ }
+ }
+ });
+ } else if (shard_restore_level == 2) {
+ // Hard limit consumers to consume the weight as Broadcast.
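// Rough summary of the three shard_restore_level values handled here (default is 2 in job_conf.proto):
//   0: no constraint, consumers' SBP is left entirely to inference;
//   1: soft constraint, a hierarchical_parallel_cast (plus an identity op when the sequence is a
//      bare variable) is inserted and consumers are rewired to read its output;
//   2: hard constraint, each consumer input's nd_sbp is pinned directly via SetNdSbp4OpNodeIbn below.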
+ node->ForEachNodeOnOutEdge([&](const OpNode* out_node) { + for (const std::string& ibn : out_node->op().input_bns()) { + if (out_node->op().BnInOp2Lbi(ibn) == lbi) { + SetNdSbp4OpNodeIbn(builder, out_node, ibn, nd_sbp); + } + } + }); + } +} + std::function MakeGetterOpNode2TopoOrder(const OpGraph& op_graph) { HashMap op_node2topo_order; int64_t node_cnt = 0; @@ -152,7 +252,7 @@ void ForEachDataParallelNodeSequence(const OpGraph& op_graph, CHECK_JUST(GetDataParallelVariableAndNaiveSuccNode(node, IsAllowed, &nodes)); if (nodes.empty()) { return; } const int64_t order = GetMinConsumerOrder(op_graph, nodes.back(), OpNode2Order); - Handler(std::make_shared(std::move(nodes), order)); + Handler(std::make_shared(std::move(nodes), order)); }); } @@ -188,6 +288,24 @@ bool IsS0Parallel(const SbpSignature& signature, const std::string& bn) { return IsS0Parallel(signature.bn_in_op2sbp_parallel().at(bn)); } +bool IsNdSbpMatch(const NdSbpSignature& signature, const std::string& bn, const NdSbp& nd_sbp) { + return signature.bn_in_op2nd_sbp().at(bn) == nd_sbp; +} + +bool IsNdSbpSupported4Op(const OpNode* node, const NdSbp& nd_sbp) { + if (node->op().input_bns().size() != 1 || node->op().output_bns().size() != 1) { return false; } + std::vector list; + auto LogicalBlobDesc4Ibn = [&](const std::string& bn) -> Maybe { + return Maybe(node->LogicalBlobDesc4Lbi(node->op().BnInOp2Lbi(bn))); + }; + CHECK_JUST(node->op().GetNdSbpSignatureList(LogicalBlobDesc4Ibn, node->parallel_desc(), &list)); + const auto IsInAndOutMatch = [&](const NdSbpSignature& signature) { + return IsNdSbpMatch(signature, node->op().SoleIbn(), nd_sbp) + && IsNdSbpMatch(signature, node->op().SoleObn(), nd_sbp); + }; + return std::any_of(list.cbegin(), list.cend(), IsInAndOutMatch); +} + bool IsS0SignatureSupported(const OpNode* node) { if (node->op().input_bns().size() != 1 || node->op().output_bns().size() != 1) { return false; } SbpSignatureList list; @@ -222,42 +340,143 @@ void ForEachModelSizeBalancedPartition( } } -Maybe RewriteDistributedSplit(const OpGraph& op_graph, JobBuilder* builder) { - const int64_t threshold = builder->job().job_conf().optimizer_placement_optimization_threshold(); - const auto IsAllowed = [threshold](const OpNode* n) -> bool { - if (n->op().op_conf().has_variable_conf()) { - const Shape shape(n->op().op_conf().variable_conf().shape()); - const int64_t parallel_num = n->parallel_desc().parallel_num(); - // Parameter needs to be able to evenly splited and one slice size >= threshold - return shape.At(0) % parallel_num == 0 && shape.elem_cnt() >= threshold * parallel_num; +namespace { +bool IsSplitValid(const Shape& shape, const NdSbp& nd_sbp, const Shape& hierachy, + int64_t min_size) { + if (shape.NumAxes() < 1 || shape.elem_cnt() < 1) { return false; } + CHECK_EQ(nd_sbp.sbp_parallel_size(), hierachy.NumAxes()); + Shape cur_shape = shape; + if (cur_shape.elem_cnt() < min_size) { return false; } + FOR_RANGE(int64_t, i, 0, hierachy.NumAxes()) { + const auto& sbp = nd_sbp.sbp_parallel(i); + if (sbp.has_split_parallel()) { + const int64_t dim = sbp.split_parallel().axis(); + if (dim >= cur_shape.NumAxes()) { return false; } + // Evenly split. + if (cur_shape.At(dim) % hierachy.At(i) != 0) { return false; } + cur_shape.Set(dim, cur_shape.At(dim) / hierachy.At(i)); + // Larger then min size. 
+ if (cur_shape.elem_cnt() < min_size) { return false; }
+ }
+ }
+ return true;
+}
+
+void GenerateSplitSignature(const NdSbp& var_nd_sbp, const OperatorConf& new_var_op_conf,
+ std::string& new_split_signature, int64_t& split_dim) {
+ if (new_var_op_conf.variable_conf().nd_sbp_size() > 0 && NdSbpIsAllBroadcast(var_nd_sbp)) {
+ // split last dim
+ split_dim = new_var_op_conf.variable_conf().nd_sbp_size() - 1;
+ // All B, B -> S0
+ new_split_signature = "S(0)";
+ } else {
+ // ND sbp, (*, B, S, *) -> (*, S, S, *)
+ // ND sbp, (*, S, B, *) -> (*, S, S, *)
+ FOR_RANGE(int64_t, j, 0, new_var_op_conf.variable_conf().nd_sbp_size()) {
+ if (new_var_op_conf.variable_conf().nd_sbp(j) == "B") {
+ std::vector<int64_t> adjacent_dim{j - 1, j + 1};
+ for (auto const& dim_to_try : adjacent_dim) {
+ if (dim_to_try >= 0 && dim_to_try < new_var_op_conf.variable_conf().nd_sbp_size()) {
+ SbpParallel sbp;
+ if (ParseSbpParallelFromString(new_var_op_conf.variable_conf().nd_sbp(dim_to_try), &sbp)
+ && sbp.has_split_parallel()) {
+ new_split_signature = new_var_op_conf.variable_conf().nd_sbp(dim_to_try);
+ split_dim = j;
+ }
+ }
+ if (new_split_signature != "") break;
+ }
+ }
+ // Only split one more dim.
+ if (new_split_signature != "") break;
+ }
+ }
+}
+void ShardSequence(JobBuilder* builder, const int64_t threshold, const ParallelDesc& pd,
+ std::vector<SequencePtr>&& sorted_sequences) {
+ // For each sorted sequence, set the variable op in the sequence to S
+ // and add a ctrl edge to control the execution order between variable ops.
+ // A sequence is a variable op and its cast(fp32 to fp16) op. This is because the forward pass
+ // consumes the fp16 variable and the optimizer consumes the fp32 variable.
+ std::string prev_allowed_op_name = "";
+ for (int64_t i = 0; i < sorted_sequences.size(); ++i) {
+ const OpNode* var_node = sorted_sequences.at(i)->GetVariableNode();
+ OperatorConf new_var_op_conf = var_node->op().op_conf();
+ const std::string& sole_obn = var_node->op().SoleObn();
+ const NdSbp& var_nd_sbp = var_node->NdSbp4BnInOp(sole_obn);
+ const Shape& logical_shape = Shape(new_var_op_conf.variable_conf().shape());
+
+ std::string new_split_signature = "";
+ int64_t split_dim = 0;
+ GenerateSplitSignature(var_nd_sbp, new_var_op_conf, new_split_signature, split_dim);
+ if (new_split_signature != "") {
+ *new_var_op_conf.mutable_variable_conf()->mutable_nd_sbp(split_dim) = new_split_signature;
} else {
- return IsS0SignatureSupported(n);
+ continue;
}
+
+ bool split_is_allowed = true;
+ if (split_is_allowed) {
+ NdSbp new_nd_sbp;
+ std::vector<std::string> nd_sbp_str_vec;
+ for (const auto& sbp_str : new_var_op_conf.variable_conf().nd_sbp()) {
+ nd_sbp_str_vec.push_back(sbp_str);
+ }
+ ParseNdSbpFromStringList(nd_sbp_str_vec, &new_nd_sbp);
+ // check allowed by min shard size and evenly split
+ if (split_is_allowed) {
+ split_is_allowed = IsSplitValid(logical_shape, new_nd_sbp, *pd.hierarchy(), threshold);
+ }
+ if (split_is_allowed) {
+ // resize sequence by new nd sbp limit
+ auto& cur_seq = sorted_sequences.at(i);
+ int64_t max_len = 1;
+ if (cur_seq->len() > 1) {
+ FOR_RANGE(int64_t, node_idx, 1, cur_seq->len()) {
+ if (IsNdSbpSupported4Op(cur_seq->nodes().at(node_idx), new_nd_sbp)) {
+ ++max_len;
+ } else {
+ break;
+ }
+ }
+ }
+ if (max_len < cur_seq->len()) { cur_seq->resize(max_len); }
+ }
+ }
+ if (!split_is_allowed) {
+ VLOG(3) << var_node->op().op_name() << " failed to change from B to S "
+ << " with op conf " << new_var_op_conf.variable_conf().DebugString();
+ continue;
+ }
+ if (!prev_allowed_op_name.empty()) {
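// Chain this variable op after the previously sharded one with a ctrl edge, keeping the
// execution order between variable ops deterministic (as the comment at the top of
// ShardSequence describes).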
new_var_op_conf.add_ctrl_in_op_name(prev_allowed_op_name);
+ }
+ builder->MutOpsOnlyOnce({new_var_op_conf});
+ // Set consumers to consume this variable op's cast op's output as Broadcast.
+ if (new_split_signature != "") {
+ SetNdSbp4Consumers(builder, sorted_sequences.at(i), var_nd_sbp);
+ }
+ prev_allowed_op_name = var_node->op().op_name();
+ VLOG(3) << var_node->op().op_name() << " succeeded to change from B to " << new_split_signature
+ << " on ranks dim " << split_dim << " with op conf "
+ << new_var_op_conf.variable_conf().DebugString();
+ }
+}
+} // namespace
+
+Maybe<void> RewriteDistributedSplit(const OpGraph& op_graph, JobBuilder* builder) {
+ const int64_t threshold = builder->job().job_conf().optimizer_placement_optimization_threshold();
+ const auto IsAllowed = [](const OpNode* n) -> bool {
+ // No need to limit here.
+ return true;
};
const auto PlacementSequencesAsSplitParallel = [&](const ParallelDesc& pd,
std::vector<SequencePtr>&& sorted_sequences) {
- // For all sorted sequnence, set the variable op in the sequence to S(0)
- // and add ctrl edge to control the exectuion order between variable ops.
- // A sequence is a variable op and its cast(fp32 to fp16) op. This is because the forward pass
- // consume the fp16 variable and the optimizer consume the fp32 variable.
- for (int64_t i = 0; i < sorted_sequences.size(); ++i) {
- const OpNode* var_node = sorted_sequences.at(i)->GetVariableNode();
- OperatorConf new_var_op_conf = var_node->op().op_conf();
- CHECK_EQ(pd.hierarchy()->NumAxes(), 1);
- new_var_op_conf.mutable_variable_conf()->clear_nd_sbp();
- *new_var_op_conf.mutable_variable_conf()->add_nd_sbp() = "S(0)";
- if (i != 0) {
- const std::string& prev_op_name =
- sorted_sequences.at(i - 1)->GetVariableNode()->op().op_name();
- new_var_op_conf.add_ctrl_in_op_name(prev_op_name);
- }
- builder->MutOpsOnlyOnce({new_var_op_conf});
- // Set consumers to consum this variable op's cast op's output as Broadcast.
- SetBroadcastParallel4Consumers(builder, sorted_sequences.at(i));
- }
+ ShardSequence(builder, threshold, pd, std::forward<std::vector<SequencePtr>>(sorted_sequences));
};
ForEachParallelSortedNodeSequence(op_graph, IsAllowed, SequenceCompSortedByOrderAsc,
PlacementSequencesAsSplitParallel);
+ JUST(builder->MutOpTransactionCommit());
return Maybe<void>::Ok();
}
@@ -313,7 +532,8 @@ class OptimizerPlacementOptimizationPass final : public JobPass {
Maybe<void> Apply(Job* job, JobPassCtx* ctx) const override {
if (!(ctx->job_desc().IsTrain()
- && ctx->job_desc().job_conf().has_optimizer_placement_optimization_mode())) {
+ && ctx->job_desc().job_conf().has_optimizer_placement_optimization_mode()
+ && ctx->job_desc().job_conf().optimizer_placement_optimization_mode() != "none")) {
return Maybe<void>::Ok();
}
const std::string& mode = ctx->job_desc().job_conf().optimizer_placement_optimization_mode();
diff --git a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp
new file mode 100644
index 00000000000..c573f9bf0ad
--- /dev/null
+++ b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp
@@ -0,0 +1,258 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/job/eager_nccl_comm_manager.h" +#include "oneflow/core/register/tensor_slice_copier.h" +#include "oneflow/core/ep/include/primitive/memset.h" +#include "oneflow/core/ep/include/primitive/add.h" +#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h" + +#if defined(WITH_CUDA) && NCCL_VERSION_CODE > 2700 + +namespace oneflow { + +class NcclSendRecvBoxingKernel final : public Kernel { + public: + OF_DISALLOW_COPY_AND_MOVE(NcclSendRecvBoxingKernel); + NcclSendRecvBoxingKernel() = default; + ~NcclSendRecvBoxingKernel() override = default; + + const std::vector>& in_tensor_slice_copier_vec() const { + return in_tensor_slice_copier_vec_; + } + const std::vector>& out_tensor_slice_copier_vec() const { + return out_tensor_slice_copier_vec_; + } + const std::vector& send_elem_cnts() const { return send_elem_cnts_; } + const std::vector& recv_elem_cnts() const { return recv_elem_cnts_; } + const bool has_input() const { return has_input_; } + const bool has_output() const { return has_output_; } + ncclComm_t comm() const { return GetOrCreate().comm; } + + private: + struct Comm { + Comm(ncclComm_t comm) : comm(comm) {} + ncclComm_t comm; + }; + + void Init() const { + ParallelDesc parallel_desc(parallel_conf_); + std::set> device_set; + for (int64_t parallel_id = 0; parallel_id < parallel_desc.parallel_num(); ++parallel_id) { + int64_t machine_id = CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_id)); + int64_t device_id = CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_id)); + device_set.emplace(std::make_pair(machine_id, device_id)); + } + EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); + ncclComm_t comm; + if (has_independent_stream_) { + comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); + } else { + comm = comm_mgr->GetCommForDevice(device_set); + } + comm_.reset(new Comm(comm)); + } + + const Comm& GetOrCreate() const { + if (!comm_) { Init(); } + return *comm_; + } + + void VirtualKernelInit(KernelContext* ctx) override; + void ForwardDataContent(KernelContext* ctx) const override; + + bool has_independent_stream_; + std::string stream_name_; + ParallelConf parallel_conf_; + mutable std::unique_ptr comm_; + bool src_nd_sbp_no_partial_parallel_; + std::vector> in_tensor_slice_copier_vec_; + std::vector> out_tensor_slice_copier_vec_; + std::vector send_elem_cnts_; + std::vector recv_elem_cnts_; + bool has_input_; + bool has_output_; +}; + +void NcclSendRecvBoxingKernel::ForwardDataContent(KernelContext* ctx) const { + Blob* buf = ctx->BnInOp2Blob("buf"); + ncclComm_t comm = this->comm(); + cudaStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + const std::vector& send_elem_cnts = this->send_elem_cnts(); + const std::vector& recv_elem_cnts = this->recv_elem_cnts(); + const int64_t parallel_num = this->kernel_conf().parallel_ctx().parallel_num(); + const DataType data_type = buf->data_type(); + std::vector send_in_ptr; + std::vector recv_out_ptr; + char* buf_ptr = buf->mut_dptr(); + int64_t offset = 0; + if (this->has_input()) { 
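// The single "buf" blob is laid out as per-destination send regions followed by per-source recv
// regions; the offsets below walk it using the element counts computed in VirtualKernelInit.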
+ for (int64_t i = 0; i < parallel_num; ++i) { + void* send_ptr = reinterpret_cast(buf_ptr + offset); + send_in_ptr.push_back(send_ptr); + offset += send_elem_cnts.at(i) * GetSizeOfDataType(data_type); + } + } + if (this->has_output()) { + for (int64_t i = 0; i < parallel_num; ++i) { + void* recv_ptr = reinterpret_cast(buf_ptr + offset); + recv_out_ptr.push_back(recv_ptr); + offset += recv_elem_cnts.at(i) * GetSizeOfDataType(data_type); + } + } + if (this->has_input()) { + const Blob* in = ctx->BnInOp2Blob("in"); + const std::vector>& in_tensor_slice_copier_vec = + this->in_tensor_slice_copier_vec(); + for (int64_t i = 0; i < parallel_num; ++i) { + if (in_tensor_slice_copier_vec.at(i)) { + in_tensor_slice_copier_vec.at(i)->Copy(ctx->stream(), send_in_ptr.at(i), in->dptr()); + } + } + } + const int64_t parallel_id = this->kernel_conf().parallel_ctx().parallel_id(); + OF_NCCL_CHECK(ncclGroupStart()); + for (int64_t i = 0; i < parallel_num; ++i) { + if (this->has_input() && send_elem_cnts.at(i) != 0) { + OF_NCCL_CHECK(ncclSend(send_in_ptr.at(i), send_elem_cnts.at(i), GetNcclDataType(data_type), i, + comm, cuda_stream)); + } + if (this->has_output() && recv_elem_cnts.at(i) != 0) { + OF_NCCL_CHECK(ncclRecv(recv_out_ptr.at(i), recv_elem_cnts.at(i), GetNcclDataType(data_type), + i, comm, cuda_stream)); + } + } + OF_NCCL_CHECK(ncclGroupEnd()); + if (!this->has_output()) { return; } + Blob* out = ctx->BnInOp2Blob("out"); + const std::vector>& out_tensor_slice_copier_vec = + this->out_tensor_slice_copier_vec(); + + if (src_nd_sbp_no_partial_parallel_) { + for (int64_t i = 0; i < parallel_num; ++i) { + if (out_tensor_slice_copier_vec.at(i)) { + out_tensor_slice_copier_vec.at(i)->Copy(ctx->stream(), out->mut_dptr(), recv_out_ptr.at(i)); + } + } + } else { + std::unique_ptr primitive = + ep::primitive::NewPrimitive(ctx->stream()->device_type(), + out->data_type()); + CHECK(primitive); + std::unique_ptr memset_primitive = + ep::primitive::NewPrimitive(ctx->stream()->device_type()); + CHECK(memset_primitive); + bool is_first_slice = true; + for (int64_t i = 0; i < parallel_num; ++i) { + if (out_tensor_slice_copier_vec.at(i)) { + if (is_first_slice) { + is_first_slice = false; + if (recv_elem_cnts.at(i) != out->shape().elem_cnt()) { + // if not same shape, memset out + memset_primitive->Launch(ctx->stream(), out->mut_dptr(), 0, + out->shape().elem_cnt() * GetSizeOfDataType(data_type)); + } + out_tensor_slice_copier_vec.at(i)->Copy(ctx->stream(), out->mut_dptr(), + recv_out_ptr.at(i)); + } else { + if (recv_elem_cnts.at(i) == out->shape().elem_cnt()) { + primitive->Launch(ctx->stream(), out->dptr(), recv_out_ptr.at(i), out->mut_dptr(), + out->shape().elem_cnt()); + } else { + void* out_buf = reinterpret_cast(buf_ptr + offset); + memset_primitive->Launch(ctx->stream(), out_buf, 0, + out->shape().elem_cnt() * GetSizeOfDataType(data_type)); + out_tensor_slice_copier_vec.at(i)->Copy(ctx->stream(), out_buf, recv_out_ptr.at(i)); + primitive->Launch(ctx->stream(), out->dptr(), out_buf, out->mut_dptr(), + out->shape().elem_cnt()); + } + } + } + } + } +} + +void NcclSendRecvBoxingKernel::VirtualKernelInit(KernelContext* ctx) { + const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); + has_independent_stream_ = this->op_conf().has_stream_name_hint(); + if (has_independent_stream_) { stream_name_ = this->op_conf().stream_name_hint(); } + parallel_conf_ = conf.parallel_conf(); + const int64_t parallel_id = this->kernel_conf().parallel_ctx().parallel_id(); + ParallelDesc 
parallel_desc(parallel_conf_);
+ ParallelDesc src_parallel_desc(conf.src_parallel_conf());
+ ParallelDesc dst_parallel_desc(conf.dst_parallel_conf());
+ const NdSbp& src_nd_sbp = conf.src_nd_sbp();
+ const NdSbp& dst_nd_sbp = conf.dst_nd_sbp();
+ has_input_ = conf.has_input();
+ has_output_ = conf.has_output();
+ src_nd_sbp_no_partial_parallel_ = !NdSbpHasPartialParallel(src_nd_sbp);
+ const DataType data_type = this->kernel_conf().data_type();
+ const DeviceType device_type = parallel_desc.device_type();
+ const Shape& logical_shape = Shape(conf.logical_shape());
+ const int64_t parallel_num = parallel_desc.parallel_num();
+
+ std::vector<TensorSliceView> src_send_intersections;
+ std::vector<TensorSliceView> dst_recv_intersections;
+ GetRankSendRecvIntersection(parallel_id, parallel_desc, src_parallel_desc, dst_parallel_desc,
+ src_nd_sbp, dst_nd_sbp, logical_shape, &src_send_intersections,
+ &dst_recv_intersections);
+ // if parallel_id exists in src parallel desc, has send
+ int64_t src_parallel_id = GetMappedParallelId(parallel_id, parallel_desc, src_parallel_desc);
+ if (src_parallel_id != -1) {
+ CHECK_EQ(src_send_intersections.size(), parallel_num);
+ send_elem_cnts_.resize(parallel_num);
+ in_tensor_slice_copier_vec_.resize(parallel_num);
+ const TensorSliceView& cur_rank_in_slice = GetTensorSliceView4ParallelId(
+ *src_parallel_desc.hierarchy(), src_nd_sbp, logical_shape, src_parallel_id);
+ for (int64_t i = 0; i < parallel_num; ++i) {
+ const TensorSliceView& intersection = src_send_intersections.at(i);
+ if (!intersection.IsEmpty()) {
+ send_elem_cnts_.at(i) = intersection.shape().elem_cnt();
+ in_tensor_slice_copier_vec_.at(i).reset(
+ new TensorSliceCopier(intersection, cur_rank_in_slice, data_type, device_type));
+ }
+ }
+ } else {
+ CHECK_EQ(src_send_intersections.size(), 0);
+ }
+
+ // if parallel_id exists in dst parallel desc, has recv
+ int64_t dst_parallel_id = GetMappedParallelId(parallel_id, parallel_desc, dst_parallel_desc);
+ if (dst_parallel_id != -1) {
+ CHECK_EQ(dst_recv_intersections.size(), parallel_num);
+ recv_elem_cnts_.resize(parallel_num);
+ out_tensor_slice_copier_vec_.resize(parallel_num);
+ const TensorSliceView& cur_rank_out_slice = GetTensorSliceView4ParallelId(
+ *dst_parallel_desc.hierarchy(), dst_nd_sbp, logical_shape, dst_parallel_id);
+ for (int64_t i = 0; i < parallel_num; ++i) {
+ const TensorSliceView& intersection = dst_recv_intersections.at(i);
+ if (!intersection.IsEmpty()) {
+ recv_elem_cnts_.at(i) = intersection.shape().elem_cnt();
+ out_tensor_slice_copier_vec_.at(i).reset(
+ new TensorSliceCopier(cur_rank_out_slice, intersection, data_type, device_type));
+ }
+ }
+ } else {
+ CHECK_EQ(dst_recv_intersections.size(), 0);
+ }
+}
+
+REGISTER_KERNEL(OperatorConf::kNcclSendRecvBoxingConf, NcclSendRecvBoxingKernel);
+
+} // namespace oneflow
+
+#endif // WITH_CUDA && NCCL_VERSION_CODE > 2700
diff --git a/oneflow/core/lazy/actor/naive_actor.cpp b/oneflow/core/lazy/actor/naive_actor.cpp
index ac557618b74..59abdb3437b 100644
--- a/oneflow/core/lazy/actor/naive_actor.cpp
+++ b/oneflow/core/lazy/actor/naive_actor.cpp
@@ -34,6 +34,7 @@ REGISTER_ACTOR(TaskType::kSliceBoxing, NaiveActor);
REGISTER_ACTOR(TaskType::kBoxingIdentity, NaiveActor);
REGISTER_ACTOR(TaskType::kCollectiveBoxingPack, NaiveActor);
REGISTER_ACTOR(TaskType::kCollectiveBoxingUnpack, NaiveActor);
+REGISTER_ACTOR(TaskType::kNcclSendRecvBoxing, NaiveActor);
REGISTER_ACTOR(TaskType::kDecodeH2D, NaiveActor);
REGISTER_ACTOR(TaskType::kCriticalSectionWaitTick, NaiveActor);
#ifdef WITH_CUDA
diff --git
a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp new file mode 100644 index 00000000000..9e1481fa7aa --- /dev/null +++ b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp @@ -0,0 +1,133 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/operator/operator.h" +#include "oneflow/core/common/protobuf.h" +#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h" + +namespace oneflow { + +class NcclSendRecvBoxingOp : public Operator { + public: + OF_DISALLOW_COPY_AND_MOVE(NcclSendRecvBoxingOp); + NcclSendRecvBoxingOp() = default; + ~NcclSendRecvBoxingOp() override = default; + + Maybe InitFromOpConf() override; + Maybe InferInternalBlobDescs( + const std::function& GetBlobDesc4BnInOp, + const ParallelContext* parallel_ctx, const JobDesc* job_desc) const override; + Maybe InferLogicalOutBlobDescs( + const std::function& BlobDesc4BnInOp, + const ParallelDesc& parallel_desc) const override { + UNIMPLEMENTED_THEN_RETURN(); + } + Maybe InferOutBlobDescs( + const std::function& GetBlobDesc4BnInOp, + const ParallelContext* parallel_ctx) const override; + + private: + LogicalBlobId lbi4ibn(const std::string& input_bn) const override; + LogicalBlobId lbi4obn(const std::string& output_bn) const override; +}; + +Maybe NcclSendRecvBoxingOp::InitFromOpConf() { + const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); + if (conf.has_input()) { EnrollInputBn("in", false); } + if (conf.has_output()) { EnrollOutputBn("out", false); } + EnrollTmpBn("buf"); + return Maybe::Ok(); +} + +Maybe NcclSendRecvBoxingOp::InferInternalBlobDescs( + const std::function& GetBlobDesc4BnInOp, + const ParallelContext* parallel_ctx, const JobDesc* job_desc) const { + BlobDesc* buf = GetBlobDesc4BnInOp("buf"); + const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); + const NdSbp& src_nd_sbp = conf.src_nd_sbp(); + const NdSbp& dst_nd_sbp = conf.dst_nd_sbp(); + ParallelDesc parallel_desc(conf.parallel_conf()); + ParallelDesc in_parallel_desc(conf.src_parallel_conf()); + ParallelDesc out_parallel_desc(conf.dst_parallel_conf()); + const int64_t parallel_num = parallel_desc.parallel_num(); + const int64_t parallel_id = parallel_ctx->parallel_id(); + const Shape& logical_shape = Shape(conf.logical_shape()); + std::vector src_send_intersections; + std::vector dst_recv_intersections; + GetRankSendRecvIntersection(parallel_id, parallel_desc, in_parallel_desc, out_parallel_desc, + src_nd_sbp, dst_nd_sbp, logical_shape, &src_send_intersections, + &dst_recv_intersections); + int64_t buf_count = 0; + if (conf.has_input()) { + const BlobDesc* in = GetBlobDesc4BnInOp("in"); + buf->set_data_type(in->data_type()); + CHECK_EQ(src_send_intersections.size(), parallel_num); + for (int64_t i = 0; i < parallel_num; ++i) { + const TensorSliceView& intersection = src_send_intersections.at(i); + if (!intersection.IsEmpty()) { buf_count += 
intersection.shape().elem_cnt(); } + } + } + if (conf.has_output()) { + const BlobDesc* out = GetBlobDesc4BnInOp("out"); + buf->set_data_type(out->data_type()); + for (int64_t i = 0; i < parallel_num; ++i) { + const TensorSliceView& intersection = dst_recv_intersections.at(i); + if (!intersection.IsEmpty()) { buf_count += intersection.shape().elem_cnt(); } + } + if (NdSbpHasPartialParallel(src_nd_sbp)) { + // Note: when src_nd_sbp has partial_sum, need a out_size buffer to copy and add to out. + buf_count += out->shape().elem_cnt(); + } + } + buf->mut_shape() = Shape({buf_count}); + return Maybe::Ok(); +} + +LogicalBlobId NcclSendRecvBoxingOp::lbi4ibn(const std::string& input_bn) const { + return this->op_conf().nccl_send_recv_boxing_conf().lbi(); +} + +LogicalBlobId NcclSendRecvBoxingOp::lbi4obn(const std::string& output_bn) const { + return this->op_conf().nccl_send_recv_boxing_conf().lbi(); +} + +Maybe NcclSendRecvBoxingOp::InferOutBlobDescs( + const std::function& GetBlobDesc4BnInOp, + const ParallelContext* parallel_ctx) const { + const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); + const Shape& logical_shape = Shape(conf.logical_shape()); + if (conf.has_input()) { + const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); + const NdSbp& src_nd_sbp = conf.src_nd_sbp(); + const ParallelDesc& src_parallel_desc = ParallelDesc(conf.src_parallel_conf()); + std::shared_ptr in_shape = + JUST(GetPhysicalShape(logical_shape, src_nd_sbp, src_parallel_desc, 0)); + CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); + } + if (conf.has_output()) { + BlobDesc* out_blob_desc = GetBlobDesc4BnInOp("out"); + const NdSbp& dst_nd_sbp = conf.dst_nd_sbp(); + const ParallelDesc& dst_parallel_desc = ParallelDesc(conf.dst_parallel_conf()); + std::shared_ptr out_shape = + JUST(GetPhysicalShape(logical_shape, dst_nd_sbp, dst_parallel_desc, 0)); + out_blob_desc->mut_shape() = *out_shape; + out_blob_desc->set_data_type(conf.data_type()); + } + return Maybe::Ok(); +} + +REGISTER_OP(OperatorConf::kNcclSendRecvBoxingConf, NcclSendRecvBoxingOp); + +} // namespace oneflow diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op_util.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op_util.cpp new file mode 100644 index 00000000000..a0be3320256 --- /dev/null +++ b/oneflow/core/operator/nccl_send_recv_boxing_op_util.cpp @@ -0,0 +1,170 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h" + +namespace oneflow { + +namespace { +// Go through all the ranks while transfer between two nd sbps with no PartialSum under the same +// placement. +// NOTE: We need to make sure no partial sums in the sbps of the producer and consumer. 
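// Illustrative example (not part of the original patch): with producer hierarchy [2, 2] and
// in_nd_sbp = (B, S(0)), the broadcast axis is pinned to the consumer's corresponding index while
// the split axis is enumerated, so for a consumer whose corresponding producer nd-index is (1, 0)
// the DFS below visits nd-indices (1, 0) and (1, 1), i.e. flat producer ranks 2 and 3.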
+void DfsTraverseRanks4NdSbp( + int32_t depth, std::vector& in_parallel_ids, + const std::vector& out_parallel_ids, const Shape& in_parallel_hierarchy, + const NdIndexOffsetHelper& in_hierarchy_index_helper, + const NdSbp& in_nd_sbp, const std::function& visit) { + if (depth >= in_parallel_hierarchy.NumAxes()) { + visit(in_hierarchy_index_helper.NdIndexToOffset(in_parallel_ids.data(), + in_parallel_hierarchy.NumAxes())); + return; + } + if (in_nd_sbp.sbp_parallel(depth).has_broadcast_parallel()) { + // If Broadcast in the sbp of the producer, only visit those ranks with the same id as the + // current rank along the depth-dimension. + in_parallel_ids[depth] = out_parallel_ids[depth]; + DfsTraverseRanks4NdSbp(depth + 1, in_parallel_ids, out_parallel_ids, in_parallel_hierarchy, + in_hierarchy_index_helper, in_nd_sbp, visit); + } else { + // If Split or PartialSum, go through all the ranks along the depth-dimension. + for (int64_t i = 0; i < in_parallel_hierarchy.dim_vec().at(depth); i++) { + in_parallel_ids[depth] = i; + DfsTraverseRanks4NdSbp(depth + 1, in_parallel_ids, out_parallel_ids, in_parallel_hierarchy, + in_hierarchy_index_helper, in_nd_sbp, visit); + } + } +} + +bool NdSbpNoPartialParallel(const NdSbp& nd_sbp) { + CHECK_GT(nd_sbp.sbp_parallel_size(), 0); + FOR_RANGE(int64_t, i, 0, nd_sbp.sbp_parallel_size()) { + if (nd_sbp.sbp_parallel(i).has_partial_sum_parallel()) { return false; } + } + return true; +} + +} // namespace + +int64_t GetMappedParallelId(const int64_t from_parallel_id, const ParallelDesc& from_parallel_desc, + const ParallelDesc& to_parallel_desc) { + const int64_t machine_id = CHECK_JUST(from_parallel_desc.MachineId4ParallelId(from_parallel_id)); + const int64_t device_index = CHECK_JUST(from_parallel_desc.DeviceId4ParallelId(from_parallel_id)); + if (to_parallel_desc.Containing(machine_id, device_index)) { + return CHECK_JUST(to_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + } else { + return -1; + } +} + +void GetRankSendRecvIntersection(int64_t parallel_id, const ParallelDesc& parallel_desc, + const ParallelDesc& in_parallel_desc, + const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, + const NdSbp& out_nd_sbp, const Shape& logical_shape, + std::vector* send_intersections, + std::vector* recv_intersections) { + const int64_t parallel_num = parallel_desc.parallel_num(); + CHECK_LT(parallel_id, parallel_num); + + const std::vector& in_slices = + GetTensorSliceView(*in_parallel_desc.hierarchy(), in_nd_sbp, logical_shape); + const std::vector& out_slices = + GetTensorSliceView(*out_parallel_desc.hierarchy(), out_nd_sbp, logical_shape); + + const auto& in_parallel_hierarchy = in_parallel_desc.hierarchy(); + int32_t in_hierarchy_dimension = in_parallel_hierarchy->NumAxes(); + const NdIndexOffsetHelper in_hierarchy_index_helper( + in_parallel_hierarchy->dim_vec().data(), in_hierarchy_dimension); + + const int64_t machine_id = CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_id)); + const int64_t device_index = CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_id)); + const int64_t in_parallel_num = in_parallel_desc.parallel_num(); + const int64_t out_parallel_num = out_parallel_desc.parallel_num(); + // cur rank recv from + // cur rank has output + if (out_parallel_desc.Containing(machine_id, device_index)) { + recv_intersections->resize(parallel_num); + int64_t out_id = + CHECK_JUST(out_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + const TensorSliceView& cur_rank_out_slice = out_slices.at(out_id); + 
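// Receive side: intersect this rank's output slice with the input slice of every producer rank
// visited by the DFS; non-empty intersections become recv entries keyed by the producer's id in
// the merged parallel_desc.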
const auto& add_to_recv_intersections = [&](int32_t send_id) { + const TensorSliceView& in_slice = in_slices.at(send_id); + const TensorSliceView& intersection = cur_rank_out_slice.Intersect(in_slice); + if (intersection.IsEmpty()) { return; } + const int64_t merged_id = GetMappedParallelId(send_id, in_parallel_desc, parallel_desc); + recv_intersections->at(merged_id) = intersection; + }; + int64_t corresponding_in_id = 0; + // For example [[0, 1], [2, 3]] -> [[1, 3], [5, 6]] + if (in_parallel_desc.Containing(machine_id, device_index)) { + // 1 and 3 are in [[0, 1], [2, 3]], use the same id in the producer parallel description + // The id of 1 is (0, 1), the id of 3 is (1, 1) + corresponding_in_id = + CHECK_JUST(in_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + } else { + // 5 and 7 are not in [[0, 1], [2, 3]] + // Then the id does not matter + corresponding_in_id = out_id % in_parallel_num; + } + std::vector in_parallel_ids(in_hierarchy_dimension); + // The corresponding parallel id of a consumer rank in the producer parallel description + std::vector out_parallel_ids(in_hierarchy_dimension); + in_hierarchy_index_helper.OffsetToNdIndex(corresponding_in_id, out_parallel_ids.data(), + in_hierarchy_dimension); + DfsTraverseRanks4NdSbp(0, in_parallel_ids, out_parallel_ids, *in_parallel_hierarchy, + in_hierarchy_index_helper, in_nd_sbp, add_to_recv_intersections); + } + + // cur rank send to + if (in_parallel_desc.Containing(machine_id, device_index)) { + send_intersections->resize(parallel_num); + int64_t in_id = + CHECK_JUST(in_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + const TensorSliceView& cur_rank_in_slice = in_slices.at(in_id); + for (int64_t recv_i = 0; recv_i < out_parallel_num; ++recv_i) { + const auto& add_to_send_intersections = [&](int32_t send_id) { + if (send_id != in_id) { return; } + const TensorSliceView& out_slice = out_slices.at(recv_i); + const TensorSliceView& intersection = out_slice.Intersect(cur_rank_in_slice); + if (intersection.IsEmpty()) { return; } + const int64_t merged_id = GetMappedParallelId(recv_i, out_parallel_desc, parallel_desc); + send_intersections->at(merged_id) = intersection; + }; + int64_t out_device_id = CHECK_JUST(out_parallel_desc.DeviceId4ParallelId(recv_i)); + int64_t out_machine_id = CHECK_JUST(out_parallel_desc.MachineId4ParallelId(recv_i)); + int64_t corresponding_in_id = 0; + // For example [[0, 1], [2, 3]] -> [[1, 3], [5, 6]] + if (in_parallel_desc.Containing(out_machine_id, out_device_id)) { + // 1 and 3 are in [[0, 1], [2, 3]], use the same id in the producer parallel description + // The id of 1 is (0, 1), the id of 3 is (1, 1) + corresponding_in_id = + CHECK_JUST(in_parallel_desc.ParallelId4MachineDeviceId(out_machine_id, out_device_id)); + } else { + // 5 and 7 are not in [[0, 1], [2, 3]] + // Then the id does not matter + corresponding_in_id = recv_i % in_parallel_num; + } + std::vector in_parallel_ids(in_hierarchy_dimension); + // The corresponding parallel id of a consumer rank in the producer parallel description + std::vector out_parallel_ids(in_hierarchy_dimension); + in_hierarchy_index_helper.OffsetToNdIndex(corresponding_in_id, out_parallel_ids.data(), + in_hierarchy_dimension); + DfsTraverseRanks4NdSbp(0, in_parallel_ids, out_parallel_ids, *in_parallel_hierarchy, + in_hierarchy_index_helper, in_nd_sbp, add_to_send_intersections); + } + } +} + +} // namespace oneflow diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op_util.h 
b/oneflow/core/operator/nccl_send_recv_boxing_op_util.h new file mode 100644 index 00000000000..f491a50e91b --- /dev/null +++ b/oneflow/core/operator/nccl_send_recv_boxing_op_util.h @@ -0,0 +1,31 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/register/tensor_slice_view.h" +#include "oneflow/core/job/nd_sbp_util.h" + +namespace oneflow { + +int64_t GetMappedParallelId(const int64_t from_parallel_id, const ParallelDesc& from_parallel_desc, + const ParallelDesc& to_parallel_desc); + +void GetRankSendRecvIntersection(int64_t parallel_id, const ParallelDesc& parallel_desc, + const ParallelDesc& in_parallel_desc, + const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, + const NdSbp& out_nd_sbp, const Shape& logical_shape, + std::vector* send_intersections, + std::vector* recv_intersections); + +} // namespace oneflow diff --git a/oneflow/core/operator/op_conf.proto b/oneflow/core/operator/op_conf.proto index 4589ae3507e..cb6cc5d80a3 100644 --- a/oneflow/core/operator/op_conf.proto +++ b/oneflow/core/operator/op_conf.proto @@ -13,6 +13,7 @@ import "oneflow/core/job/sbp_parallel.proto"; import "oneflow/core/graph/boxing/collective_boxing.proto"; import "oneflow/core/job/initializer_conf.proto"; import "oneflow/core/job/regularizer_conf.proto"; +import "oneflow/core/job/placement.proto"; import "oneflow/core/job/learning_rate_schedule_conf.proto"; import "oneflow/core/operator/interface_blob_conf.proto"; import "oneflow/core/register/blob_desc.proto"; @@ -401,6 +402,19 @@ message BoxingZerosOpConf { required DataType data_type = 3; } +message NcclSendRecvBoxingOpConf { + required LogicalBlobId lbi = 1; + required NdSbp src_nd_sbp = 2; + required NdSbp dst_nd_sbp = 3; + required ParallelConf parallel_conf = 4; + required ParallelConf src_parallel_conf = 5; + required ParallelConf dst_parallel_conf = 6; + required ShapeProto logical_shape = 7; + required DataType data_type = 8; + required bool has_input = 9; + required bool has_output = 10; +} + message OperatorConf { required string name = 1; optional string device_tag = 4 [default = "invalid_device"]; @@ -446,6 +460,7 @@ message OperatorConf { CollectiveBoxingPackOpConf collective_boxing_pack_conf = 174; CollectiveBoxingUnpackOpConf collective_boxing_unpack_conf = 175; BoxingZerosOpConf boxing_zeros_conf = 176; + NcclSendRecvBoxingOpConf nccl_send_recv_boxing_conf = 177; UserOpConf user_conf = 199; // domain op diff --git a/oneflow/extension/python/numpy_internal.h b/oneflow/extension/python/numpy_internal.h index fd3e9594034..84590a38990 100644 --- a/oneflow/extension/python/numpy_internal.h +++ b/oneflow/extension/python/numpy_internal.h @@ -22,7 +22,7 @@ limitations under the License. 
// ************************ #include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/common/shape_vec.h" // PyArrayObject cannot be forward declared, or a compile error will occur diff --git a/oneflow/ir/include/OneFlow/OneFlowOps.td b/oneflow/ir/include/OneFlow/OneFlowOps.td index 441271e33dd..405ff4499e0 100644 --- a/oneflow/ir/include/OneFlow/OneFlowOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowOps.td @@ -282,6 +282,10 @@ def LowerOneFlowToTosaPass : Pass<"lower-oneflow-to-tosa", "ModuleOp"> { let summary = ""; let constructor = "mlir::oneflow::createLowerOneFlowToTosaPass()"; let dependentDialects = ["tosa::TosaDialect", "memref::MemRefDialect", "mlir::func::FuncDialect"]; + let options = [ + Option<"variableAsConstant", "variable-as-constant", "int", "0", + "convert variable op as const op of tosa">, + ]; } def MapSCFToGPUPass : Pass<"gpu-greedy-parallel-loop-mapping", "ModuleOp"> { diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 6a94d35cafc..60d13342c1e 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -2191,8 +2191,8 @@ def OneFlow_EagerSymmetricSToPOp : OneFlow_BaseOp<"eager_symmetric_s_to_p", [NoS #endif // GET_ONEFLOW_EAGER_OP_DEFINITIONS // Group: FUSED -// cudnn_fused_normalization_add_relu, cudnn_fused_normalization_add_relu_grad, fused_bias_add_gelu, fused_bias_add_gelu_grad, fused_bias_add_mask_scale, fused_cast_scale, fused_scale_mask_softmax, fused_scale_mask_softmax_dropout, fused_scale_mask_softmax_dropout_grad, fused_scale_mask_softmax_grad, fused_scale_tril, fused_self_attention_query_mul_key_and_value, fused_self_attention_query_mul_key_and_value_grad, fused_tril_scale_softmax_mask_scale, fused_tril_scale_softmax_mask_scale_grad, normalization_add_relu_grad, fused_dot_feature_interaction, fused_dot_feature_interaction_grad -// Total: 18 +// cudnn_fused_normalization_add_relu, cudnn_fused_normalization_add_relu_grad, fused_bias_add_gelu, fused_bias_add_gelu_grad, fused_bias_add_mask_scale, fused_cast_scale, fused_scale_mask_softmax, fused_scale_mask_softmax_dropout, fused_scale_mask_softmax_dropout_grad, fused_scale_mask_softmax_grad, fused_scale_tril, fused_self_attention_query_mul_key_and_value, fused_self_attention_query_mul_key_and_value_grad, fused_tril_scale_softmax_mask_scale, fused_tril_scale_softmax_mask_scale_grad, normalization_add_relu_grad, fused_dot_feature_interaction, fused_dot_feature_interaction_grad, fused_cross_feature_interaction, fused_cross_feature_interaction_grad_v1, fused_cross_feature_interaction_grad_v2 +// Total: 21 #ifdef GET_ONEFLOW_FUSED_OP_DEFINITIONS @@ -2573,6 +2573,68 @@ def OneFlow_FusedDotFeatureInteractionGradOp : OneFlow_BaseOp<"fused_dot_feature let has_data_type_infer_fn = 1; } +def OneFlow_FusedCrossFeatureInteractionOp : OneFlow_BaseOp<"fused_cross_feature_interaction", [NoSideEffect, DeclareOpInterfaceMethods]> { + let input = (ins + OneFlow_Tensor:$x, + OneFlow_Tensor:$weight, + OneFlow_Tensor:$bias, + OneFlow_Tensor:$x0 + ); + let output = (outs + OneFlow_Tensor:$out, + OneFlow_Tensor:$matmul_result + ); + let attrs = (ins + StrAttr:$interaction_mode + ); + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + + +def OneFlow_FusedCrossFeatureInteractionV1GradOp : 
OneFlow_BaseOp<"fused_cross_feature_interaction_v1_grad", [NoSideEffect, DeclareOpInterfaceMethods, NoGrad]> { + let input = (ins + OneFlow_Tensor:$dy, + OneFlow_Tensor:$weight, + OneFlow_Tensor:$x0, + OneFlow_Tensor:$x, + OneFlow_Tensor:$matmul_result + ); + let output = (outs + OneFlow_Tensor:$dx0, + OneFlow_Tensor:$dw, + OneFlow_Tensor:$dx, + OneFlow_Tensor:$dbias + ); + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + +def OneFlow_FusedCrossFeatureInteractionV2GradOp : OneFlow_BaseOp<"fused_cross_feature_interaction_v2_grad", [NoSideEffect, DeclareOpInterfaceMethods, NoGrad]> { + let input = (ins + OneFlow_Tensor:$dy, + OneFlow_Tensor:$weight, + OneFlow_Tensor:$bias, + OneFlow_Tensor:$x0, + OneFlow_Tensor:$x, + OneFlow_Tensor:$matmul_result + ); + let output = (outs + OneFlow_Tensor:$dx0, + OneFlow_Tensor:$dw, + OneFlow_Tensor:$dx, + OneFlow_Tensor:$dbias + ); + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + #endif // GET_ONEFLOW_FUSED_OP_DEFINITIONS // Group: IDEMPOTENT @@ -4344,8 +4406,8 @@ def OneFlow_ErfInvOp : OneFlow_BaseOp<"erfinv", [NoSideEffect, DeclareOpInterfac #endif // GET_ONEFLOW_MATH_OP_DEFINITIONS // Group: MATMUL -// batch_matmul, broadcast_matmul, broadcast_matmul_grad_b, distributed_partial_fc_sample, distributed_partial_fc_sample_disable_boxing, erfc, erfc_grad, matmul, cublas_fused_mlp, cublas_bias_add_relu_matmul_grad, cublas_matmul_bias_add_grad -// Total: 11 +// batch_matmul, broadcast_matmul, broadcast_matmul_grad_b, distributed_partial_fc_sample, distributed_partial_fc_sample_disable_boxing, erfc, erfc_grad, matmul, cublas_fused_mlp, cublas_bias_add_relu_matmul_grad, cublas_matmul_bias_add_grad, fused_matmul_bias_add_relu_dropout, fused_relu_dropout_grad +// Total: 13 #ifdef GET_ONEFLOW_MATMUL_OP_DEFINITIONS @@ -4520,6 +4582,9 @@ def OneFlow_CublasBiasAddReluMatmulGradOp : OneFlow_BaseOp<"cublas_bias_add_relu OneFlow_Tensor:$d_grad, OneFlow_Tensor:$d_bias ); + let attrs = (ins + DefaultValuedAttr:$alpha + ); let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; @@ -4541,6 +4606,44 @@ def OneFlow_CublasMatmulBiasAddGradOp : OneFlow_BaseOp<"cublas_matmul_bias_add_g let has_data_type_infer_fn = 1; } +def OneFlow_FusedMatmulBiasAddReluDropoutOp : OneFlow_BaseOp<"fused_matmul_bias_add_relu_dropout", [NoSideEffect, AttrSizedOperandSegments, AttrSizedResultSegments, DeclareOpInterfaceMethods]> { + let input = (ins + OneFlow_Tensor:$x, + Variadic:$weights, + Variadic:$biases + ); + let output = (outs + OneFlow_Tensor:$out, + Variadic:$cublas_aux, + Variadic:$hidden + ); + let attrs = (ins + DefaultValuedAttr:$skip_final_activation, + F32ArrayAttr:$dropout_rate_list + ); + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + +def OneFlow_FusedReluDropoutGradOp : OneFlow_BaseOp<"fused_relu_dropout_grad", [NoSideEffect, NoGrad, DeclareOpInterfaceMethods]> { + let input = (ins + OneFlow_Tensor:$dy, + OneFlow_Tensor:$mask + ); + let output = (outs + OneFlow_Tensor:$dx + ); + let attrs = (ins + DefaultValuedAttr:$scale + ); + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + #endif // GET_ONEFLOW_MATMUL_OP_DEFINITIONS 
// Group: MISC diff --git a/oneflow/ir/install-llvm.cmake b/oneflow/ir/install-llvm.cmake index e7c09ba1aae..e01bba1b36d 100644 --- a/oneflow/ir/install-llvm.cmake +++ b/oneflow/ir/install-llvm.cmake @@ -24,6 +24,7 @@ if(NOT llvm_monorepo_POPULATED) -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_PROJECTS=mlir -DLLVM_APPEND_VC_REV=OFF -DLLVM_ENABLE_ZLIB=OFF -DLLVM_INSTALL_UTILS=ON -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DLLVM_ENABLE_OCAMLDOC=OFF -DLLVM_ENABLE_BINDINGS=OFF + -DLLVM_ENABLE_TERMINFO=OFF # Disable terminfo in llvm so that oneflow doesn't need to link against it -DMLIR_ENABLE_CUDA_RUNNER=${WITH_MLIR_CUDA_CODEGEN} -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} -DINJA_URL=${INJA_URL} -DINJA_URL_HASH=${INJA_URL_HASH} -DJSON_URL=${JSON_URL} -DJSON_URL_HASH=${JSON_URL_HASH} @@ -49,6 +50,8 @@ if(NOT llvm_monorepo_POPULATED) endif() endif() +set(LLVM_INCLUDE_DIRS ${llvm_monorepo_SOURCE_DIR}/llvm/include;${llvm_monorepo_BINARY_DIR}/include) + if(WITH_MLIR) set(LLVM_DIR ${LLVM_INSTALL_DIR}/lib/cmake/llvm) set(MLIR_DIR ${LLVM_INSTALL_DIR}/lib/cmake/mlir) diff --git a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp index 488198828ac..ec92bb352ec 100644 --- a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp +++ b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp @@ -14,21 +14,26 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "OneFlow/OneFlowOps.h" +#include #include #include #include "OneFlow/OneFlowDialect.h" #include "OneFlow/Passes.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/Passes.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Pass/Pass.h" @@ -36,11 +41,46 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/variable_tensor_mgr.h" + +#include namespace mlir { namespace oneflow { +Value CreateTranspose(Location& loc, ConversionPatternRewriter& rewriter, Value input, + ArrayRef perms) { + int perms_size = perms.size(); + auto transpose_perms = rewriter.create( + loc, RankedTensorType::get({perms_size}, rewriter.getI32Type()), + rewriter.getI32TensorAttr(perms)); + const auto shape_type = input.getType().cast(); + std::vector ranked_type; + for (const auto& index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return rewriter.create( + loc, RankedTensorType::get(ranked_type, shape_type.getElementType()), input, transpose_perms); +}; + +Value CreateBNOp(Location loc, ConversionPatternRewriter& rewriter, Value output, Value x, + Value mean, Value variance, Value epsilon, Value gamma, Value beta) { + const auto output_type = output.getType(); + // sub_op = sub(input, mean) + auto sub_op0 = rewriter.create(loc, output_type, x, mean); + // add_op0 = add(var, epsilon) + auto add_op0 = rewriter.create(loc, variance.getType(), variance, epsilon); + // rsqrt_op = rsqrt(add_op0) + auto rsqrt_op = rewriter.create(loc, variance.getType(), add_op0); + // op4 = mul(sub_op, rsqrt_op) + auto mul_op0 = rewriter.create(loc, output_type, sub_op0, rsqrt_op, 0); + // op5 = mul(mul_op0, gamma) + auto mul_op1 = rewriter.create(loc, output_type, mul_op0, gamma, 0); + // op6 = add(mul_op1, beta) + auto batch_norm = rewriter.create(loc, output_type, mul_op1, beta); + return batch_norm; +}; + struct ScalarMulByTensorOpLowering final : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -72,18 +112,456 @@ struct ScalarMulByTensorOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Job op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto func = + rewriter.create(op.getLoc(), op.getName(), op.getFunctionType()); + rewriter.inlineRegionBefore(op.getRegion(), func.getBody(), func.end()); + rewriter.eraseOp(op); + return success(); + } +}; + +struct ReturnOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReturnOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp(op, + /* operands */ op.operands()); + return success(); + } +}; + +struct InputOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(InputOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more choices to passing data between tosa and oneflow + const auto newValues = op.input(); + const auto is_block_arg = newValues.dyn_cast() != nullptr; + if (!is_block_arg) op->emitError("input is not block arg"); + rewriter.replaceOp(op, newValues); + return success(); + } +}; + +struct OutputOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(OutputOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more choices to passing data between tosa and oneflow + const auto newValues = op.input(); + 
rewriter.replaceOp(op, newValues); + return success(); + } +}; + +struct VariableOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(VariableOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto mgr = ::oneflow::Global<::oneflow::VariableTensorMgr>::Get(); + if (!mgr) op->emitError("global variable tensor manager miss"); + + const auto tensor = mgr->Get(op.op_name().str()); + if (!tensor) op->emitError("tensor is null"); + const auto value = support::TensorToDenseElementsAttr(tensor, rewriter.getContext()); + const auto output = op.output().getType(); + + rewriter.replaceOpWithNewOp(op, output, value); + return success(); + } +}; + +struct VariableOpToConstLowering final : public OpConversionPattern { + public: + VariableOpToConstLowering(TypeConverter& typeConverter, MLIRContext* context, int const_val) + : OpConversionPattern(typeConverter, context), const_val_(const_val){}; + + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(VariableOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.output().getType(); + const auto type = output.cast().getElementType(); + + // TODO: more control about this scope with flag + if (type.isa()) { + const auto float_attr = rewriter.getFloatAttr(type, const_val_); + auto value = DenseElementsAttr::get(output, float_attr); + + rewriter.replaceOpWithNewOp(op, output, value); + } else if (auto integerType = type.dyn_cast()) { + const auto int_attr = + rewriter.getIntegerAttr(type, APInt(type.cast().getWidth(), const_val_)); + auto value = DenseElementsAttr::get(output, int_attr); + + rewriter.replaceOpWithNewOp(op, output, value); + } else { + op->emitError( + "OneFlow variable op lower to TOSA const op only support integer and float value now"); + } + + return success(); + } + + private: + int const_val_; +}; + struct CastOpLowering final : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(CastOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const override { - rewriter.replaceOpWithNewOp(op, - /* output */ op.out().getType(), - /* input */ op.in()); + auto output = op.out().getType(); + auto input = op.in(); + rewriter.replaceOpWithNewOp(op, output, input); return success(); } }; +struct ReluOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReluOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto floatMax = std::numeric_limits::max(); + const auto intMax = std::numeric_limits::max(); + + const auto output = op.y().getType(); + auto input = op.x(); + auto max_int = static_cast(intMax); + auto max_fp = static_cast<::llvm::APFloat>(floatMax); + + rewriter.replaceOpWithNewOp(op, output, input, max_int, max_fp); + return success(); + } +}; + +struct BroadcastAddOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(BroadcastAddOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.z().getType(); + auto input1 = op.x(); + auto input2 = op.y(); + + rewriter.replaceOpWithNewOp(op, output, input1, input2); + return success(); + } +}; + +struct Add2OpLowering final : public OpConversionPattern { + public: + using 
OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Add2Op op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.out().getType(); + auto input1 = op.in0(); + auto input2 = op.in1(); + + rewriter.replaceOpWithNewOp(op, output, input1, input2); + return success(); + } +}; + +struct AvgPool2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(AvgPool2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + + auto stride_pairs = get_pair_int64_from_array(op.stride()); + auto pad_pairs = get_pair_int64_from_array(op.padding()); + auto kernel_pairs = get_pair_int64_from_array(op.kernel_size()); + + auto loc = op.getLoc(); + auto perms = {0, 2, 3, 1}; + + const auto kernel = rewriter.getI64ArrayAttr({kernel_pairs.first, kernel_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + + auto input = CreateTranspose(loc, rewriter, op.x(), perms); + auto output = reshape_type(op.y().getType().cast(), perms); + + auto avg_pool2d = rewriter.create(loc, output, input, kernel, stride, pad); + + auto out = CreateTranspose(loc, rewriter, avg_pool2d, {0, 3, 1, 2}); + rewriter.replaceOp(op, {out}); + return success(); + } +}; + +struct MaxPool2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(MaxPool2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + // TODO: support return indice + if (op.return_indices()) op->emitError("not support return indices now"); + auto stride_pairs = get_pair_int64_from_array(op.stride()); + auto kernel_pairs = get_pair_int64_from_array(op.kernel_size()); + auto pad_pairs = get_pair_int64_from_array(op.padding()); + + auto loc = op.getLoc(); + auto perms = {0, 2, 3, 1}; + + const auto kernel = rewriter.getI64ArrayAttr({kernel_pairs.first, kernel_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + + auto input = CreateTranspose(loc, rewriter, op.x(), perms); + auto output = reshape_type(op.y().getType().cast(), perms); + + auto max_pool2d = rewriter.create(loc, output, input, kernel, stride, pad); + + auto y = CreateTranspose(loc, rewriter, max_pool2d, {0, 3, 1, 2}); + + auto 
indice_output = op.indice().getType(); + auto value = DenseElementsAttr::get(indice_output, rewriter.getZeroAttr(rewriter.getI64Type())); + + auto indice = rewriter.create(loc, indice_output, value); + rewriter.replaceOp(op, {y, indice}); + return success(); + } +}; + +struct FlattenOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(FlattenOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto start_dim = op.start_dim(); + const auto end_dim = op.end_dim(); + const auto in_type = op.in().getType(); + + const auto in_shape = in_type.cast(); + const auto rank = in_type.dyn_cast().getRank(); + + // calculate reshape_vec + std::vector reshape_vec; + for (auto dim = 0; dim < start_dim; ++dim) { reshape_vec.push_back(in_shape.getDimSize(dim)); } + auto last_dim = end_dim < 0 ? rank : end_dim + 1; + int flatten_size = 1; + for (auto dim = start_dim; dim < last_dim; ++dim) { flatten_size *= in_shape.getDimSize(dim); } + reshape_vec.push_back(flatten_size); + if (end_dim > 0) { + for (auto dim = end_dim + 1; dim < rank; ++dim) { + reshape_vec.push_back(in_shape.getDimSize(dim)); + } + } + // generate reshape op + const auto output = RankedTensorType::get(reshape_vec, in_shape.getElementType()); + auto input1 = op.in(); + auto new_shape = rewriter.getI64ArrayAttr(reshape_vec); + + rewriter.replaceOpWithNewOp(op, output, input1, new_shape); + return success(); + } +}; + +struct MatmulOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(MatmulOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more throw for robust in matmul shape rank + auto loc = op.getLoc(); + + auto preprocess = [&](Value matrix, bool transpose) -> Value { + auto shape_type = matrix.getType().cast(); + if (transpose) { matrix = CreateTranspose(loc, rewriter, matrix, {1, 0}); } + + shape_type = matrix.getType().cast(); + auto reshape_type = RankedTensorType::get( + {1, shape_type.getDimSize(0), shape_type.getDimSize(1)}, shape_type.getElementType()); + + return rewriter.create( + op.getLoc(), reshape_type, matrix, + rewriter.getI64ArrayAttr({1, shape_type.getDimSize(0), shape_type.getDimSize(1)})); + }; + + auto a = preprocess(op.a(), op.transpose_a()); + auto b = preprocess(op.b(), op.transpose_b()); + + const auto out_shape_type = op.out().getType().cast(); + const auto out_reshape_type = + RankedTensorType::get({1, out_shape_type.getDimSize(0), out_shape_type.getDimSize(1)}, + out_shape_type.getElementType()); + + auto matmul = rewriter.create(loc, out_reshape_type, a, b); + const auto new_shape = + rewriter.getI64ArrayAttr({out_shape_type.getDimSize(0), out_shape_type.getDimSize(1)}); + + rewriter.replaceOpWithNewOp(op, out_shape_type, matmul, new_shape); + return success(); + } +}; + +struct NormalizationInferenceOpLowering final + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(NormalizationInferenceOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto reshape_dim = [&](Type type, Value value) -> Value { + RankedTensorType in_type = value.getType().dyn_cast(); + RankedTensorType out_type = type.cast(); + SmallVector new_shape = {in_type.getShape()[0]}; + for (auto i = 2; i < out_type.getRank(); ++i) new_shape.push_back(1); + auto new_type = 
RankedTensorType::get(new_shape, out_type.getElementType()); + return rewriter.create(op->getLoc(), new_type, value, + rewriter.getI64ArrayAttr(new_shape)); + }; + + auto loc = op->getLoc(); + const auto out_type = op.y().getType(); + + const auto epsilon_type = RankedTensorType::get({}, rewriter.getF32Type()); + // epsilon = reshape(epsilon, shape_1) + auto epsilon = rewriter.create( + loc, epsilon_type, DenseElementsAttr::get(epsilon_type, op.epsilon())); + // mean = reshape(mean, shape_0) + auto mean = reshape_dim(out_type, adaptor.moving_mean()); + // variance= reshape(variance, shape_0) + auto variance = reshape_dim(out_type, adaptor.moving_variance()); + // scale = reshape(scale, shape_0) + auto gamma = reshape_dim(out_type, adaptor.gamma()); + // beta = reshape(beta, shape_0) + auto beta = reshape_dim(out_type, adaptor.beta()); + auto output = op.y(); + auto x = op.x(); + + auto batch_norm = + oneflow::CreateBNOp(loc, rewriter, output, x, mean, variance, epsilon, gamma, beta); + rewriter.replaceOp(op, {batch_norm}); + return success(); + } +}; + +struct NormalizationOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(NormalizationOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto reshape_dim = [&](Type type, Value value) -> Value { + const RankedTensorType in_type = value.getType().dyn_cast(); + const RankedTensorType out_type = type.cast(); + SmallVector new_shape = {in_type.getShape()[0]}; + for (auto i = 2; i < out_type.getRank(); ++i) new_shape.push_back(1); + const auto new_type = RankedTensorType::get(new_shape, out_type.getElementType()); + return rewriter.create(op->getLoc(), new_type, value, + rewriter.getI64ArrayAttr(new_shape)); + }; + + auto loc = op->getLoc(); + const auto out_type = op.y().getType(); + + const auto epsilon_type = RankedTensorType::get({}, rewriter.getF32Type()); + // epsilon = reshape(epsilon, shape_1) + auto epsilon = rewriter.create( + loc, epsilon_type, DenseElementsAttr::get(epsilon_type, op.epsilon())); + // mean = reshape(mean, shape_0) + auto mean = reshape_dim(out_type, adaptor.moving_mean()); + // variance= reshape(variance, shape_0) + auto variance = reshape_dim(out_type, adaptor.moving_variance()); + // scale = reshape(scale, shape_0) + auto gamma = reshape_dim(out_type, adaptor.gamma()); + // beta = reshape(beta, shape_0) + auto beta = reshape_dim(out_type, adaptor.beta()); + auto output = op.y(); + auto x = op.x(); + + auto batch_norm = + oneflow::CreateBNOp(loc, rewriter, output, x, mean, variance, epsilon, gamma, beta); + auto moving_mean = op.moving_mean(); + auto moving_variance = op.moving_variance(); + + rewriter.replaceOp(op, {batch_norm, moving_mean, moving_variance}); + return success(); + } +}; + +struct Conv2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Conv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + + auto stride_pairs = 
get_pair_int64_from_array(op.strides()); + auto pad_pairs = get_pair_int64_from_array(op.padding_beforeAttr()); + auto dilation_pairs = get_pair_int64_from_array(op.dilation_rate()); + + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto dilation = rewriter.getI64ArrayAttr({dilation_pairs.first, dilation_pairs.second}); + + auto bias = op.bias(); + auto loc = op.getLoc(); + if (!bias) { + const auto output_shape = op.out().getType().cast(); + const auto output_channels = output_shape.getDimSize(1); + const auto bias_elem_type = output_shape.getElementType(); + const auto type = RankedTensorType::get(output_channels, bias_elem_type); + bias = rewriter.create( + op.getLoc(), type, DenseElementsAttr::get(type, rewriter.getZeroAttr(bias_elem_type))); + } + + auto perms = {0, 2, 3, 1}; + auto in = CreateTranspose(loc, rewriter, op.in(), perms); + auto weight = CreateTranspose(loc, rewriter, op.weight(), perms); + const auto output = reshape_type(op.out().getType().cast(), perms); + + auto conv2d = + rewriter.create(loc, output, in, weight, bias, pad, stride, dilation); + + auto res = CreateTranspose(loc, rewriter, conv2d, {0, 3, 1, 2}); + rewriter.replaceOp(op, {res}); + return success(); + getTypeConverter(); + } +}; + namespace { struct OneFlowLoweringToTosaPass : public LowerOneFlowToTosaPassBase { void runOnOperation() override; @@ -95,11 +573,29 @@ std::unique_ptr createLowerOneFlowToTosaPass() { } void OneFlowLoweringToTosaPass::runOnOperation() { - ConversionTarget target(getContext()); - target.addLegalDialect(); + MLIRContext* context = &getContext(); + ConversionTarget target(*context); + target.addLegalDialect(); target.addIllegalDialect(); - RewritePatternSet patterns(&getContext()); - patterns.insert(&getContext()); + + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) { return type; }); + RewritePatternSet patterns(context); + + const auto mgr = ::oneflow::Global<::oneflow::VariableTensorMgr>::Get(); + // judge whether the pass is trigger by python through the existence of variable tensor manger + if (mgr) { + patterns.add(typeConverter, context); + } else { + patterns.add(typeConverter, context, this->variableAsConstant); + } + patterns + .add( + typeConverter, context); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { getOperation()->dump(); signalPassFailure(); diff --git a/oneflow/ir/lib/OneFlow/OneFlowOps.cpp b/oneflow/ir/lib/OneFlow/OneFlowOps.cpp index 7e159585cb7..db388dcd150 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowOps.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowOps.cpp @@ -155,6 +155,7 @@ struct ConcreteUserOps : public OpRewritePattern { NamedAttrList attributes(op->getAttrDictionary()); attributes.erase(op.input_sizesAttrName()); attributes.erase(op.output_sizesAttrName()); + attributes.erase(op.output_lbnsAttrName()); attributes.erase(OpTrait::AttrSizedOperandSegments::getOperandSegmentSizeAttr()); attributes.erase(OpTrait::AttrSizedResultSegments::getResultSegmentSizeAttr()); llvm::SmallVector input_sizes, output_sizes; diff --git a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp index d4093babaf9..6a4e3bb380b 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp @@ -69,9 +69,9 @@ ::oneflow::Symbol<::oneflow::Device> MakeDevice(const mlir::Attribute& device_ta 
return ::oneflow::Device::ParseAndNew(device_info).GetOrThrow(); } -template +template mlir::DenseElementsAttr __TensorToDenseElementsAttr( - const std::shared_ptr<::oneflow::one::Tensor>& tensor, const mlir::FloatType& float_type) { + const std::shared_ptr<::oneflow::one::Tensor>& tensor, const MLIR_T& mlir_type) { ::oneflow::LazyMode::Guard guard{false}; const auto tensor_ = ::oneflow::one::functional::ToContiguous(tensor).GetPtrOrThrow(); auto shape = tensor_->shape(); @@ -81,7 +81,7 @@ mlir::DenseElementsAttr __TensorToDenseElementsAttr( CHECK_JUST(::oneflow::BlobBufferCopyUtil::To(ofblob_ptr, data.data(), data.size())); }; ::oneflow::one::SyncAccessTensorWithTimeOut(tensor_, callback, "const").GetOrThrow(); - return mlir::DenseElementsAttr::get(mlir::RankedTensorType::get(shape_vec, float_type), + return mlir::DenseElementsAttr::get(mlir::RankedTensorType::get(shape_vec, mlir_type), llvm::makeArrayRef(data)); } @@ -115,7 +115,12 @@ mlir::DenseElementsAttr TensorToDenseElementsAttr( const std::shared_ptr<::oneflow::one::Tensor>& tensor, MLIRContext* ctx) { const auto dtype = tensor->dtype()->data_type(); if (dtype == ::oneflow::DataType::kFloat) { - return __TensorToDenseElementsAttr(tensor, mlir::FloatType::getF32(ctx)); + return __TensorToDenseElementsAttr(tensor, + mlir::FloatType::getF32(ctx)); + } else if (dtype == ::oneflow::DataType::kInt64) { + auto mlir_type = mlir::IntegerType::IntegerType::get( + ctx, 64, mlir::IntegerType::SignednessSemantics::Signed); + return __TensorToDenseElementsAttr(tensor, mlir_type); } llvm::errs() << "Converting oneflow::Tensor to mlir::DenseElementsAttr only support float32 now." << "\n"; @@ -132,8 +137,9 @@ std::shared_ptr<::oneflow::one::Tensor> DenseElementsAttrToTensor( return __DenseElementsAttrToTensor(dense_attr_, device_tag_attr, device_name_attr, ::oneflow::DataType::kFloat); } - llvm::errs() << "Converting mlir::DenseElementsAttr to oneflow::Tensor only support float32 now." - << "\n"; + llvm::errs() + << "Converting mlir::DenseElementsAttr to oneflow::Tensor only support float32 and int64 now." 
+ << "\n"; exit(EXIT_FAILURE); } diff --git a/oneflow/ir/llvm-in-tree.cmake b/oneflow/ir/llvm-in-tree.cmake index 939f74b8685..cbde3c95709 100644 --- a/oneflow/ir/llvm-in-tree.cmake +++ b/oneflow/ir/llvm-in-tree.cmake @@ -13,6 +13,8 @@ endif() set(CMAKE_INSTALL_PREFIX ${LLVM_INSTALL_DIR} CACHE STRING "" FORCE) set(LLVM_ENABLE_RTTI ON CACHE BOOL "turn this on to make it compatible with protobuf") set(LLVM_ENABLE_EH ON CACHE BOOL "turn this on to make it compatible with half (the library)") +set(LLVM_ENABLE_TERMINFO OFF + CACHE BOOL "disable terminfo in llvm so that oneflow doesn't need to link against it") set(LLVM_BUILD_EXAMPLES OFF CACHE BOOL "") set(LLVM_BUILD_TOOLS OFF CACHE BOOL "") set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "") diff --git a/oneflow/ir/oneflow-extension/ir_pass.cpp b/oneflow/ir/oneflow-extension/ir_pass.cpp index e339b95f077..038c9578f55 100644 --- a/oneflow/ir/oneflow-extension/ir_pass.cpp +++ b/oneflow/ir/oneflow-extension/ir_pass.cpp @@ -177,6 +177,11 @@ Maybe IRRoundTrip::Apply(Job* job, JobPassCtx* ctx) const { template class IRRoundTrip; template class IRRoundTrip; +Maybe ConvertJobToTosaIR(Job* job) { + RoundTripOneFlowJobWrapper job_wrapper(job); + return ::mlir::oneflow::ConvertJobToTosaIR(job_wrapper); +} + Maybe SaveJobToIR(Job* job, const std::string& path) { // TODO: check path is valid dir if (IsInDebugMode()) { TeePersistentLogStream::Create("saved_job")->Write(*job); } diff --git a/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h b/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h index 7d8baa77f68..b2afa4fbae1 100644 --- a/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h +++ b/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h @@ -150,6 +150,7 @@ void RoundTripOneFlowJob( void registerFromOneFlowJobTranslation(); +std::string ConvertJobToTosaIR(RoundTripOneFlowJobWrapperInterface& job_wrapper); void SaveJobToIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path); void LoadJobFromIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path); diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp index fb876048a99..9491b715593 100644 --- a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp +++ b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp @@ -850,6 +850,35 @@ void RoundTripOneFlowJob( } } +std::string ConvertJobToTosaIR(RoundTripOneFlowJobWrapperInterface& job_wrapper) { + const ::oneflow::Job* job = job_wrapper.job(); + mlir::MLIRContext context; + context.getOrLoadDialect(); + context.loadDialect(); + + OwningOpRef module( + ModuleOp::create(FileLineColLoc::get(&context, "", /*line=*/0, /*column=*/0))); + JobImporter imp(job_wrapper, &context, module.get()); + if (succeeded(imp.ProcessJob())) { + mlir::PassManager pm(&context); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createLowerOneFlowToTosaPass()); + if (mlir::failed(pm.run(*module))) { + module->emitError("Failed to run oneflow-to-tosa pass"); + exit(EXIT_FAILURE); + } + + std::string mlir; + llvm::raw_string_ostream os_mlir(mlir); + module->print(os_mlir); + return mlir; + } else { + const auto& job_name = job->job_conf().job_name(); + llvm::errs() << "fail to convert job to IR, job_name: " << job_name << "\n"; + exit(EXIT_FAILURE); + } +} + void SaveJobToIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path) { 
const ::oneflow::Job* job = job_wrapper.job(); mlir::MLIRContext context; diff --git a/oneflow/ir/test/Frontend/OneFlowToIree.mlir b/oneflow/ir/test/Frontend/OneFlowToIree.mlir new file mode 100644 index 00000000000..834063b7a71 --- /dev/null +++ b/oneflow/ir/test/Frontend/OneFlowToIree.mlir @@ -0,0 +1,266 @@ +// RUN: oneflow-opt %s \ +// RUN: -split-input-file \ +// RUN: -lower-oneflow-to-tosa \ +// RUN: -verify-diagnostics -o - \ +// RUN: | ireec \ +// RUN: --iree-input-type=tosa \ +// RUN: --iree-vm-bytecode-module-output-format=flatbuffer-binary \ +// RUN: -iree-hal-target-backends=dylib-llvm-aot \ +// RUN: -iree-mlir-to-vm-bytecode-module - + + +oneflow.job @test_func(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + oneflow.return %arg0 : tensor<1xf32> +} + + +oneflow.job @test_input(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.input"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +oneflow.job @test_output(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.output"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +oneflow.job @test_variable() -> tensor<64x3x7x7xf32> +{ + %res = "oneflow.variable"() { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + nd_sbp = ["B"], + op_name = "fw.model.conv1.weight", + output_lbns = ["fw.model.conv1.weight/out"], + scope_symbol_id = 4611686018427432959 : i64, + shape = [64 : si64, 3 : si64, 7 : si64, 7 : si64] + } : () -> tensor<64x3x7x7xf32> + oneflow.return %res : tensor<64x3x7x7xf32> +} + + +oneflow.job @test_add_n2(%arg0: tensor<1x7x7xf32>, %arg1: tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +{ + %res = "oneflow.add_n2"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + op_type_name = "add_n", + output_lbns = [""], + scope_symbol_id = 4611686018431205375 : i64 + } : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> + oneflow.return %res: tensor<1x7x7xf32> +} + + +oneflow.job @test_broadcast_add(%arg0: tensor<1x1000xf32>, %arg1: tensor<1000xf32>) -> tensor<1x1000xf32> +{ + %res = "oneflow.broadcast_add"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64 + } : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +oneflow.job @test_max_pool_2d(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> +{ + %y, %indice = "oneflow.max_pool_2d"(%arg0) + { + ceil_mode = false, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation = [1 : si32, 1 : si32], + hierarchy = [1], kernel_size = [3 : si32, 3 : si32], + op_name = "", + output_lbns = ["", ""], + padding = [1 : si32, 1 : si32], + return_indices = false, + scope_symbol_id = 4611686018427502591 : i64, + stride = [2 : si32, 2 : si32] + } : (tensor<1x64x112x112xf32>) -> (tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>) + 
oneflow.return %y : tensor<1x64x56x56xf32> +} + + +oneflow.job @test_avg_pool_2d(%arg0: tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> +{ + %res = "oneflow.avg_pool_2d"(%arg0) + { + ceil_mode = false, + count_include_pad = true, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + divisor_override = 0 : si32, + hierarchy = [1], + kernel_size = [7 : si32, 7 : si32], + op_name = "model.avgpool-avg_pool_2d-172", + output_lbns = ["model.avgpool-avg_pool_2d-172/y_0"], + padding = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018430775295 : i64, + stride = [7 : si32, 7 : si32] + } : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> + oneflow.return %res: tensor<1x2048x1x1xf32> +} + + +oneflow.job @test_conv2d(%arg0: tensor<1x3x224x224xf32>, %arg1: tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> +{ + %res = "oneflow.conv2d"(%arg0, %arg1) + { + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation_rate = [1 : si32, 1 : si32], + filters = 512 : si32, + groups = 1 : si32, + hierarchy = [1], + kernel_size = [1 : si32, 1 : si32], + op_name = "", + operand_segment_sizes = dense<[1, 1, 0, 0]> : vector<4xi32>, + output_lbns = [""], + padding_before = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018431012863 : i64, + strides = [1 : si32, 1 : si32] + } : (tensor<1x3x224x224xf32>, tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> + oneflow.return %res : tensor<1x5x224x224xf32> +} + + +oneflow.job @test_flatten(%arg0: tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +{ + %res = "oneflow.flatten"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + end_dim = 2 : si32, + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431217663 : i64, + start_dim = 1 : si32 + } : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> + oneflow.return %res : tensor<4x6x1xf32> +} + + +oneflow.job @test_matmul(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>) ->tensor<1x1000xf32> +{ + %res = "oneflow.matmul"(%arg0, %arg1) + { + alpha = 1.000000e+00 : f64, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64, + transpose_a = false, + transpose_b = true + } : (tensor<1x2048xf32>, tensor<1000x2048xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +oneflow.job @test_relu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %res = "oneflow.relu"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427424767 : i64 + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + +oneflow.job @test_bn( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y, %mean, %inv_variance = "oneflow.normalization"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, 
tensor<64xf32>) + oneflow.return %y: tensor<1x64x112x112xf32> +} + +oneflow.job @test_bn_infer( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y = "oneflow.normalization_infer"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x64x112x112xf32> + oneflow.return %y: tensor<1x64x112x112xf32> +} diff --git a/oneflow/ir/test/Frontend/lit.local.cfg b/oneflow/ir/test/Frontend/lit.local.cfg new file mode 100644 index 00000000000..a63a9b31aa9 --- /dev/null +++ b/oneflow/ir/test/Frontend/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.WITH_ONEFLOW_IREE: + config.unsupported = True diff --git a/oneflow/ir/test/Frontend/test_iree_resnet.py b/oneflow/ir/test/Frontend/test_iree_resnet.py new file mode 100644 index 00000000000..885291f4251 --- /dev/null +++ b/oneflow/ir/test/Frontend/test_iree_resnet.py @@ -0,0 +1,107 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +# RUN: python3 %s + +from oneflow_iree.compiler import Runner +from flowvision.models import resnet50 +import oneflow as flow +import oneflow.unittest +import unittest +import os +import numpy as np +import time + +os.environ["ONEFLOW_MLIR_ENABLE_ROUND_TRIP"] = "1" +os.environ["ONEFLOW_MLIR_ENABLE_CODEGEN_FUSERS"] = "1" + + +def _test_iree_resnet_cpu(test_case): + model = resnet50(pretrained=True) + model.eval() + + class GraphModuleForIree(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + class GraphModuleForOFMLIR(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + func = Runner(GraphModuleForIree, return_numpy=True) + input = flow.ones([1, 3, 224, 224]) + f = GraphModuleForOFMLIR() + for iter in range(2): + iree_output = func(input) + graph_output = f(input) + graph_output = graph_output.cpu().detach().numpy() + # the rtol accumulate layer by layer + test_case.assertTrue( + np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) + ) + + +def _test_iree_resnet_cuda(test_case): + model = resnet50(pretrained=True).cuda() + model.eval() + + class GraphModuleForIree(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + class GraphModuleForOFMLIR(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + func = Runner(GraphModuleForIree, return_numpy=True) + input = flow.ones([1, 3, 224, 224]).cuda() + f = GraphModuleForOFMLIR() + for iter in range(2): + iree_output = func(input) + graph_output = f(input) + graph_output = graph_output.cpu().detach().numpy() + # the rtol accumulate layer by layer + test_case.assertTrue( + np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) + ) + + +@flow.unittest.skip_unless_1n1d() +class TestIreeResnet(oneflow.unittest.TestCase): + def test_iree_resnet_cpu(test_case): + _test_iree_resnet_cpu(test_case) + + def test_iree_resnet_cuda(test_case): + _test_iree_resnet_cuda(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/oneflow/ir/test/Frontend/test_iree_runner.py b/oneflow/ir/test/Frontend/test_iree_runner.py new file mode 100644 index 00000000000..a0caa90fecd --- /dev/null +++ b/oneflow/ir/test/Frontend/test_iree_runner.py @@ -0,0 +1,71 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +# RUN: python3 %s + +from oneflow_iree.compiler import Runner +import oneflow as flow +import oneflow.unittest +import unittest +import numpy as np + + +class RELU(flow.nn.Module): + def __init__(self): + super().__init__() + self.relu = flow.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + +class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.fw = RELU() + + def build(self, x): + return self.fw(x) + + +def _test_check_iree_runner(test_case): + func = Runner(GraphModule, return_numpy=True).cuda() + # run on iree cuda backend + input = flow.Tensor([-1.0, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0])) + # change input shape + input = flow.Tensor([-1.0, 1.0, -1]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0, 0.0])) + # change on iree cpu backend + func = func.cpu() + input = flow.Tensor([-1.0, 0.0, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 0.0, 1.0])) + # change input shape + input = flow.Tensor([-1, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0])) + + +@flow.unittest.skip_unless_1n1d() +class TestCheckIreeRunner(oneflow.unittest.TestCase): + def test_check_iree_runner(test_case): + _test_check_iree_runner(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir new file mode 100644 index 00000000000..34ee5b499dc --- /dev/null +++ b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir @@ -0,0 +1,16 @@ +// RUN: oneflow-opt %s \ +// RUN: -pass-pipeline="func.func(tosa-to-linalg)" -cse \ +// RUN: --linalg-fuse-elementwise-ops -linalg-bufferize \ +// RUN: -tensor-bufferize -func-bufferize -buffer-results-to-out-params \ +// RUN: -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm \ +// RUN: -convert-func-to-llvm -convert-memref-to-llvm -reconcile-unrealized-casts --print-after-all \ +// RUN: | oneflow-translate -mlir-to-llvmir | clang -x ir - -c -o test.o + +builtin.module { + func.func @Graph_0(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tosa.cast"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "tosa.reluN"(%0) {max_fp = 3.40282347E+38 : f32, max_int = 9223372036854775807 : i64} : (tensor<2xf32>) -> tensor<2xf32> + %2 = "tosa.cast"(%1) : (tensor<2xf32>) -> tensor<2xf32> + func.return %2 : tensor<2xf32> + } +} diff --git a/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir b/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir new file mode 100644 index 00000000000..3028b3c04bf --- /dev/null +++ b/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir @@ -0,0 +1,342 @@ +// RUN: oneflow-opt %s \ +// RUN: -split-input-file \ +// RUN: -lower-oneflow-to-tosa \ +// RUN: -verify-diagnostics -o - \ +// RUN: | FileCheck %s + + +// CHECK-LABEL: test_func +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_func(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + oneflow.return %arg0 : tensor<1xf32> +} + + +// CHECK-LABEL: test_input +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_input(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.input"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> 
+} + + +// CHECK-LABEL: test_output +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_output(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.output"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +// CHECK-LABEL: test_variable +// CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<64x3x7x7xf32>} : () -> tensor<64x3x7x7xf32> +// CHECK: return [[V0]] : tensor<64x3x7x7xf32> +oneflow.job @test_variable() -> tensor<64x3x7x7xf32> +{ + %res = "oneflow.variable"() { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + nd_sbp = ["B"], + op_name = "fw.model.conv1.weight", + output_lbns = ["fw.model.conv1.weight/out"], + scope_symbol_id = 4611686018427432959 : i64, + shape = [64 : si64, 3 : si64, 7 : si64, 7 : si64] + } : () -> tensor<64x3x7x7xf32> + oneflow.return %res : tensor<64x3x7x7xf32> +} + + +//CHECK-LABEL: test_add_n2 +//CHECK: [[V0:%.+]] = "tosa.add"(%arg0, %arg1) : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +//CHECK: return [[V0]] : tensor<1x7x7xf32> +oneflow.job @test_add_n2(%arg0: tensor<1x7x7xf32>, %arg1: tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +{ + %res = "oneflow.add_n2"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + op_type_name = "add_n", + output_lbns = [""], + scope_symbol_id = 4611686018431205375 : i64 + } : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> + oneflow.return %res: tensor<1x7x7xf32> +} + + +//CHECK-LABEL: test_broadcast_add +//CHECK: [[V0:%.+]] = "tosa.add"(%arg0, %arg1) : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> +//CHECK: return [[V0]] : tensor<1x1000xf32> +oneflow.job @test_broadcast_add(%arg0: tensor<1x1000xf32>, %arg1: tensor<1000xf32>) -> tensor<1x1000xf32> +{ + %res = "oneflow.broadcast_add"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64 + } : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +//CHECK-LABEL: test_max_pool_2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V1:%.+]] = "tosa.transpose"(%arg0, [[V0]]) : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> +//CHECK: [[V2:%.+]] = "tosa.max_pool2d"([[V1]]) {kernel = [3, 3], pad = [1, 1, 1, 1], stride = [2, 2]} : (tensor<1x112x112x64xf32>) -> tensor<1x56x56x64xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"([[V2]], [[V3]]) : (tensor<1x56x56x64xf32>, tensor<4xi32>) -> tensor<1x64x56x56xf32> +//CHECK: [[V5:%.+]] = "tosa.const"() {value = dense<0> : tensor<1x64x56x56xi64>} : () -> tensor<1x64x56x56xi64> +//CHECK: return [[V4]] : tensor<1x64x56x56xf32> +oneflow.job @test_max_pool_2d(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> +{ + %y, %indice = "oneflow.max_pool_2d"(%arg0) + { + ceil_mode = false, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation = [1 : si32, 1 : si32], + hierarchy = [1], kernel_size = [3 : si32, 3 : 
si32], + op_name = "", + output_lbns = ["", ""], + padding = [1 : si32, 1 : si32], + return_indices = false, + scope_symbol_id = 4611686018427502591 : i64, + stride = [2 : si32, 2 : si32] + } : (tensor<1x64x112x112xf32>) -> (tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>) + oneflow.return %y : tensor<1x64x56x56xf32> +} + + +//CHECK-LABEL: test_avg_pool_2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V1:%.+]] = "tosa.transpose"(%arg0, [[V0]]) : (tensor<1x2048x7x7xf32>, tensor<4xi32>) -> tensor<1x7x7x2048xf32> +//CHECK: [[V2:%.+]] = "tosa.avg_pool2d"([[V1]]) {kernel = [7, 7], pad = [0, 0, 0, 0], stride = [7, 7]} : (tensor<1x7x7x2048xf32>) -> tensor<1x1x1x2048xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"([[V2]], [[V3]]) : (tensor<1x1x1x2048xf32>, tensor<4xi32>) -> tensor<1x2048x1x1xf32> +//CHECK: return [[V4]] : tensor<1x2048x1x1xf32> +oneflow.job @test_avg_pool_2d(%arg0: tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> +{ + %res = "oneflow.avg_pool_2d"(%arg0) + { + ceil_mode = false, + count_include_pad = true, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + divisor_override = 0 : si32, + hierarchy = [1], + kernel_size = [7 : si32, 7 : si32], + op_name = "model.avgpool-avg_pool_2d-172", + output_lbns = ["model.avgpool-avg_pool_2d-172/y_0"], + padding = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018430775295 : i64, + stride = [7 : si32, 7 : si32] + } : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> + oneflow.return %res: tensor<1x2048x1x1xf32> +} + + +//CHECK-LABEL: test_conv2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<5xf32>} : () -> tensor<5xf32> +//CHECK: [[V1:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V2:%.+]] = "tosa.transpose"(%arg0, [[V1]]) : (tensor<1x3x224x224xf32>, tensor<4xi32>) -> tensor<1x224x224x3xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"(%arg1, [[V3]]) : (tensor<5x3x1x1xf32>, tensor<4xi32>) -> tensor<5x1x1x3xf32> +//CHECK: [[V5:%.+]] = "tosa.conv2d"([[V2]], [[V4]], [[V0]]) {dilation = [1, 1], pad = [0, 0, 0, 0], stride = [1, 1]} : (tensor<1x224x224x3xf32>, tensor<5x1x1x3xf32>, tensor<5xf32>) -> tensor<1x224x224x5xf32> +//CHECK: [[V6:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V7:%.+]] = "tosa.transpose"([[V5]], [[V6]]) : (tensor<1x224x224x5xf32>, tensor<4xi32>) -> tensor<1x5x224x224xf32> +//CHECK: return [[V7]] : tensor<1x5x224x224xf32> +oneflow.job @test_conv2d(%arg0: tensor<1x3x224x224xf32>, %arg1: tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> +{ + %res = "oneflow.conv2d"(%arg0, %arg1) + { + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation_rate = [1 : si32, 1 : si32], + filters = 512 : si32, + groups = 1 : si32, + hierarchy = [1], + kernel_size = [1 : si32, 1 : si32], + op_name = "", + operand_segment_sizes = dense<[1, 1, 0, 0]> : vector<4xi32>, + output_lbns = [""], + padding_before = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018431012863 : i64, + strides = [1 : si32, 1 : si32] + } : (tensor<1x3x224x224xf32>, tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> + oneflow.return %res : tensor<1x5x224x224xf32> +} + + +//CHECK-LABEL: test_flatten 
+//CHECK: [[V0:%.+]] = "tosa.reshape"(%arg0) {new_shape = [4, 6, 1]} : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +//CHECK: return [[V0]] : tensor<4x6x1xf32> +oneflow.job @test_flatten(%arg0: tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +{ + %res = "oneflow.flatten"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + end_dim = 2 : si32, + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431217663 : i64, + start_dim = 1 : si32 + } : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> + oneflow.return %res : tensor<4x6x1xf32> +} + + +//CHECK-LABEL: test_matmul +//CHECK: [[V0:%.+]] = "tosa.reshape"(%arg0) {new_shape = [1, 1, 2048]} : (tensor<1x2048xf32>) -> tensor<1x1x2048xf32> +//CHECK: [[V1:%.+]] = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> +//CHECK: [[V2:%.+]] = "tosa.transpose"(%arg1, [[V1]]) : (tensor<1000x2048xf32>, tensor<2xi32>) -> tensor<2048x1000xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"([[V2]]) {new_shape = [1, 2048, 1000]} : (tensor<2048x1000xf32>) -> tensor<1x2048x1000xf32> +//CHECK: [[V4:%.+]] = "tosa.matmul"([[V0]], [[V3]]) : (tensor<1x1x2048xf32>, tensor<1x2048x1000xf32>) -> tensor<1x1x1000xf32> +//CHECK: [[V5:%.+]] = "tosa.reshape"([[V4]]) {new_shape = [1, 1000]} : (tensor<1x1x1000xf32>) -> tensor<1x1000xf32> +//CHECK: return [[V5]] : tensor<1x1000xf32> +oneflow.job @test_matmul(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>) ->tensor<1x1000xf32> +{ + %res = "oneflow.matmul"(%arg0, %arg1) + { + alpha = 1.000000e+00 : f64, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64, + transpose_a = false, + transpose_b = true + } : (tensor<1x2048xf32>, tensor<1000x2048xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +//CHECK-LABEL: test_relu +//CHECK: [[V0:%.+]] = "tosa.reluN"(%arg0) {max_fp = 3.40282347E+38 : f32, max_int = 9223372036854775807 : i64} : (tensor<1xf32>) -> tensor<1xf32> +//CHECK: return [[V0]] : tensor<1xf32> +oneflow.job @test_relu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %res = "oneflow.relu"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427424767 : i64 + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + +//CHECK-LABEL: test_bn +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<9.99999974E-6> : tensor} : () -> tensor +//CHECK: [[V1:%.+]] = "tosa.reshape"(%arg1) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V2:%.+]] = "tosa.reshape"(%arg2) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"(%arg3) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V4:%.+]] = "tosa.reshape"(%arg4) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V5:%.+]] = "tosa.sub"(%arg0, [[V1]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V6:%.+]] = "tosa.add"([[V2]], [[V0]]) : (tensor<64x1x1xf32>, tensor) -> tensor<64x1x1xf32> +//CHECK: [[V7:%.+]] = "tosa.rsqrt"([[V6]]) : (tensor<64x1x1xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V8:%.+]] = "tosa.mul"([[V5]], [[V7]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V9:%.+]] = "tosa.mul"([[V8]], [[V3]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> 
tensor<1x64x112x112xf32> +//CHECK: [[V10:%.+]] = "tosa.add"([[V9]], [[V4]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: return [[V10]] : tensor<1x64x112x112xf32> +oneflow.job @test_bn( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y, %mean, %inv_variance = "oneflow.normalization"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + oneflow.return %y: tensor<1x64x112x112xf32> +} + +//CHECK-LABEL: test_bn_infer +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<9.99999974E-6> : tensor} : () -> tensor +//CHECK: [[V1:%.+]] = "tosa.reshape"(%arg1) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V2:%.+]] = "tosa.reshape"(%arg2) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"(%arg3) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V4:%.+]] = "tosa.reshape"(%arg4) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V5:%.+]] = "tosa.sub"(%arg0, [[V1]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V6:%.+]] = "tosa.add"([[V2]], [[V0]]) : (tensor<64x1x1xf32>, tensor) -> tensor<64x1x1xf32> +//CHECK: [[V7:%.+]] = "tosa.rsqrt"([[V6]]) : (tensor<64x1x1xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V8:%.+]] = "tosa.mul"([[V5]], [[V7]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V9:%.+]] = "tosa.mul"([[V8]], [[V3]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V10:%.+]] = "tosa.add"([[V9]], [[V4]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: return [[V10]] : tensor<1x64x112x112xf32> +oneflow.job @test_bn_infer( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y = "oneflow.normalization_infer"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x64x112x112xf32> + oneflow.return %y: tensor<1x64x112x112xf32> +} diff --git a/oneflow/ir/test/OneFlow/test_fuser_cast_scale.py b/oneflow/ir/test/OneFlow/cuda_code_gen/test_fuser_cast_scale.py similarity index 100% rename from oneflow/ir/test/OneFlow/test_fuser_cast_scale.py rename to 
oneflow/ir/test/OneFlow/cuda_code_gen/test_fuser_cast_scale.py diff --git a/oneflow/ir/test/lit.cfg.py b/oneflow/ir/test/lit.cfg.py index af8af5b28af..275f16893d1 100644 --- a/oneflow/ir/test/lit.cfg.py +++ b/oneflow/ir/test/lit.cfg.py @@ -105,3 +105,10 @@ ] ) llvm_config.add_tool_substitutions(tools, tool_dirs) + +try: + import oneflow_iree.compiler + + config.WITH_ONEFLOW_IREE = True +except ImportError: + config.WITH_ONEFLOW_IREE = False diff --git a/oneflow/user/kernels/arg_where_kernel_util.cpp b/oneflow/user/kernels/arg_where_kernel_util.cpp index 25af71e776d..cc85cad8ad0 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.cpp +++ b/oneflow/user/kernels/arg_where_kernel_util.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "oneflow/user/kernels/arg_where_kernel_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/kernel/kernel_util.h" namespace oneflow { diff --git a/oneflow/user/kernels/arg_where_kernel_util.cu b/oneflow/user/kernels/arg_where_kernel_util.cu index 61e6de4f543..522078e42ab 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.cu +++ b/oneflow/user/kernels/arg_where_kernel_util.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "oneflow/user/kernels/arg_where_kernel_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/cuda/elementwise.cuh" #include "oneflow/core/kernel/kernel_util.h" #include "oneflow/core/ep/cuda/cuda_stream.h" diff --git a/oneflow/user/kernels/avg_pool_kernel_util.h b/oneflow/user/kernels/avg_pool_kernel_util.h index d6586bb70bf..d0b0ab8aeab 100644 --- a/oneflow/user/kernels/avg_pool_kernel_util.h +++ b/oneflow/user/kernels/avg_pool_kernel_util.h @@ -65,7 +65,7 @@ struct XPUAdd { OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) -typedef fixed_vector FixedDimVector; +typedef small_vector FixedDimVector; class AvgPoolParams3D { public: diff --git a/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu b/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu index 70d6c2eacf2..6ba3e0e8d09 100644 --- a/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu +++ b/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu @@ -20,6 +20,8 @@ limitations under the License. 
namespace oneflow { +namespace { + template class CublasBiasAddReluMatmulGradKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { @@ -41,7 +43,6 @@ class CublasBiasAddReluMatmulGradKernel final : public user_op::OpKernel, const user_op::Tensor* aux = ctx->Tensor4ArgNameAndIndex("aux", 0); user_op::Tensor* d_bias = ctx->Tensor4ArgNameAndIndex("d_bias", 0); user_op::Tensor* d_grad = ctx->Tensor4ArgNameAndIndex("d_grad", 0); - const auto* matmul_grad_cache = CHECK_NOTNULL(dynamic_cast(cache)); auto* cuda_stream = ctx->stream()->As(); @@ -52,7 +53,7 @@ class CublasBiasAddReluMatmulGradKernel final : public user_op::OpKernel, size_t cublas_m = 0, cublas_n = 0, cublas_k = 0; int64_t cublas_lda = 0, cublas_ldb = 0, cublas_ldc = 0; - const double alpha = 1.0; + const double alpha = ctx->Attr("alpha"); const auto sp_alpha = GetCublasScalarParameter(alpha, cublas_compute_dtype); const double beta = 0.0; const auto sp_beta = GetCublasScalarParameter(beta, cublas_compute_dtype); @@ -99,6 +100,8 @@ REGISTER_CUBLAS_BIAS_ADD_RELU_MATMUL_GRAD_KERNEL(float) REGISTER_CUBLAS_BIAS_ADD_RELU_MATMUL_GRAD_KERNEL(double) REGISTER_CUBLAS_BIAS_ADD_RELU_MATMUL_GRAD_KERNEL(half) +} // namespace + } // namespace oneflow #endif // CUDA_VERSION >= 11060 diff --git a/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu b/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu index e4df4aae01e..95a25fcd525 100644 --- a/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu +++ b/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu @@ -18,10 +18,14 @@ limitations under the License. #include "oneflow/core/ep/include/primitive/memcpy.h" #include "oneflow/core/ep/cuda/cuda_device.h" // CUBLASLT_EPILOGUE_BGRADB only support in cuda11.4.2 or higher version. +// TODO(zhengzekang): In cuda11.6 version, CUBLASLT_EPILOGUE_BGRADB may occur illegal memory access +// error in some shapes. 
#if CUDA_VERSION >= 11060 namespace oneflow { +namespace { + cudaDataType_t GetGemmComputeType(cudaDataType_t data_type) { switch (data_type) { case CUDA_R_32F: return CUDA_R_32F; @@ -54,7 +58,6 @@ class CublasMatmulBiasAddGradKernel final : public user_op::OpKernel, const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* w_grad = ctx->Tensor4ArgNameAndIndex("w_grad", 0); user_op::Tensor* b_grad = ctx->Tensor4ArgNameAndIndex("b_grad", 0); - const auto* matmul_grad_cache = CHECK_NOTNULL(dynamic_cast(cache)); auto* cuda_stream = ctx->stream()->As(); @@ -64,7 +67,6 @@ class CublasMatmulBiasAddGradKernel final : public user_op::OpKernel, const cudaDataType_t cuda_data_type = GetCudaDataType(data_type); size_t cublas_m = 0, cublas_n = 0, cublas_k = 0; int64_t cublas_lda = 0, cublas_ldb = 0, cublas_ldc = 0; - const double alpha = 1.0; const auto sp_alpha = GetCublasScalarParameter(alpha, cublas_compute_dtype); const double beta = 0.0; @@ -113,7 +115,6 @@ class CublasMatmulBiasAddGradKernel final : public user_op::OpKernel, ctx->stream()->device_type(), ep::primitive::MemcpyKind::kDtoD); CHECK(memcpy_primitive); memcpy_primitive->Launch(ctx->stream(), b_grad->mut_dptr(), dy->dptr(), cublas_n * sizeof(T)); - OF_CUBLAS_CHECK(cublasGemmEx( cuda_stream->cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, cublas_m, cublas_n, cublas_k, &sp_alpha, x->dptr(), cuda_data_type, cublas_lda, dy->dptr(), cuda_data_type, cublas_ldb, @@ -134,6 +135,8 @@ REGISTER_CUBLAS_MATMUL_BIAS_ADD_GRAD_KERNEL(float) REGISTER_CUBLAS_MATMUL_BIAS_ADD_GRAD_KERNEL(double) REGISTER_CUBLAS_MATMUL_BIAS_ADD_GRAD_KERNEL(half) +} // namespace + } // namespace oneflow #endif // CUDA_VERSION >= 11060 diff --git a/oneflow/user/kernels/cublas_fused_mlp_kernel.cu b/oneflow/user/kernels/cublas_fused_mlp_kernel.cu index 5a51ee57512..8755c514ebd 100644 --- a/oneflow/user/kernels/cublas_fused_mlp_kernel.cu +++ b/oneflow/user/kernels/cublas_fused_mlp_kernel.cu @@ -20,6 +20,8 @@ limitations under the License. namespace oneflow { +namespace { + template class CublasFusedMLPKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { public: @@ -131,6 +133,8 @@ REGISTER_CUBLAS_FUSED_MLP_KERNEL_GPU(float, DataType::kFloat); REGISTER_CUBLAS_FUSED_MLP_KERNEL_GPU(half, DataType::kFloat16); REGISTER_CUBLAS_FUSED_MLP_KERNEL_GPU(nv_bfloat16, DataType::kBFloat16); +} // namespace + } // namespace oneflow #endif // CUDA_VERSION >= 11060 diff --git a/oneflow/user/kernels/cum_backward_kernel.cpp b/oneflow/user/kernels/cum_backward_kernel.cpp index 51065a68c5a..6e6967d1daa 100644 --- a/oneflow/user/kernels/cum_backward_kernel.cpp +++ b/oneflow/user/kernels/cum_backward_kernel.cpp @@ -18,88 +18,70 @@ limitations under the License. namespace oneflow { namespace { -// O(n) cumprod backward, formula: cumsum(flip(dY * Y)) / X. -// Need to take care when there is at least a zero in the input. +// CumProd backward, formula: flip(cumsum(flip(dY * Y))) / X. 
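+// Illustrative example (added for clarity, not part of the original change): for a zero-free
+// input x = [2, 3, 4], y = cumprod(x) = [2, 6, 24]. With dy = [1, 1, 1],
+// dx = flip(cumsum(flip(dy * y))) / x = [32/2, 30/3, 24/4] = [16, 10, 6], which matches
+// differentiating x0 + x0*x1 + x0*x1*x2 directly. Inputs containing zeros take the separate
+// path below that tracks the first zero per lane.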
template void CumProdBackward(const T* dy_ptr, T* dx_ptr, const T* output_ptr, const T* input_ptr, const int64_t up_space, const int64_t space, const int64_t down_space, const int64_t elem_cnt) { const auto step = space * down_space; for (size_t i = 0; i < up_space; i++) { - // two-dims buffer for 0 elem index - std::vector cumsum_zeros_number(space * down_space, 0); - auto* cumsum_zeros_number_ptr = cumsum_zeros_number.data(); + const size_t base_ptr_offset = step * i; + const T* input_ptr_base = input_ptr + base_ptr_offset; + const T* output_ptr_base = output_ptr + base_ptr_offset; + const T* dy_ptr_base = dy_ptr + base_ptr_offset; + T* dx_ptr_base = dx_ptr + base_ptr_offset; + + // Use dx as tmp buffer for finding 0 element in the input. for (size_t j = 0; j < space; j++) { const size_t ptr_offset = j * down_space; - auto* tmp_input_ptr = input_ptr + ptr_offset; - auto* tmp_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr + ptr_offset; - auto* last_tmp_cumsum_zeros_number_ptr = tmp_cumsum_zeros_number_ptr - down_space; - for (auto k = 0; k < down_space; k++) { - int is_zero = tmp_input_ptr[k] == 0 ? 1 : 0; - tmp_cumsum_zeros_number_ptr[k] = - is_zero + (j == 0 ? 0 : last_tmp_cumsum_zeros_number_ptr[k]); - } - } - { - // for k < z(z is first zero index) - std::vector reverse_cumsum(down_space, 0); - for (size_t j = 0; j < space; j++) { - const size_t ptr_offset = (space - j - 1) * down_space; - auto* tmp_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr + ptr_offset; - auto* tmp_dy_ptr = dy_ptr + ptr_offset; - auto* tmp_dx_ptr = dx_ptr + ptr_offset; - auto* tmp_output_ptr = output_ptr + ptr_offset; - auto* tmp_input_ptr = input_ptr + ptr_offset; - for (auto k = 0; k < down_space; k++) { - if (tmp_cumsum_zeros_number_ptr[k] > 0) { continue; } - reverse_cumsum[k] += tmp_output_ptr[k] * tmp_dy_ptr[k]; - tmp_dx_ptr[k] = reverse_cumsum[k] / tmp_input_ptr[k]; - } + auto* cur_input_ptr = input_ptr_base + ptr_offset; + + auto* cumsum_zeros_number_ptr = dx_ptr_base + ptr_offset; + auto* last_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr - down_space; + for (size_t k = 0; k < down_space; k++) { + int is_zero = cur_input_ptr[k] == 0 ? 1 : 0; + cumsum_zeros_number_ptr[k] = is_zero + (j == 0 ? 0 : last_cumsum_zeros_number_ptr[k]); } } - { - // for k == z - std::vector first_zero(down_space, space); - for (size_t j = 0; j < space; j++) { - auto* tmp_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr + j * down_space; - for (size_t k = 0; k < down_space; k++) { - if (tmp_cumsum_zeros_number_ptr[k] == 1 && first_zero[k] == space) { first_zero[k] = j; } - } - } - // compute along row - std::vector cumsum_buffer(down_space, 0); - for (size_t k = 0; k < down_space; k++) { - auto* tmp_input_down_offset_ptr = input_ptr + k; - auto* tmp_output_down_offset_ptr = output_ptr + k; - auto* tmp_dy_down_offset_ptr = dy_ptr + k; - auto* tmp_cumsum_zero_number_down_offset_ptr = cumsum_zeros_number_ptr + k; - size_t first_zero_index = first_zero[k]; - if (first_zero_index == space) { continue; } - auto cumprod_before_first_zero = - first_zero_index == 0 - ? 
1 - : *(tmp_output_down_offset_ptr + (first_zero_index - 1) * down_space); - auto cumprod = 1; - for (size_t j = first_zero_index; j < space; j++) { - const size_t ptr_offset = j * down_space; - auto tmp_dy = *(tmp_dy_down_offset_ptr + ptr_offset); - auto tmp_input = *(tmp_input_down_offset_ptr + ptr_offset); - auto tmp_cumsum_zero_number = *(tmp_cumsum_zero_number_down_offset_ptr + ptr_offset); - if (tmp_cumsum_zero_number != 1) { continue; } - if (j != first_zero_index) { cumprod *= tmp_input; } - cumsum_buffer[k] += cumprod_before_first_zero * tmp_dy * cumprod; + for (size_t j = 0; j < down_space; j++) { + const auto* cur_output_ptr = output_ptr_base + j; + const auto* cur_input_ptr = input_ptr_base + j; + const auto* cur_dy_ptr = dy_ptr_base + j; + auto* cur_dx_ptr = dx_ptr_base + j; + const auto* cumsum_zeros_number_ptr = dx_ptr_base + j; + + size_t first_zero_index = space; + // Find index of first zero in input. + for (size_t k = 0; k < space; k++) { + if (cumsum_zeros_number_ptr[k * down_space] == 1) { + first_zero_index = k; + break; } } - for (size_t j = 0; j < down_space; j++) { - *(dx_ptr + first_zero[j] * down_space) = cumsum_buffer[j]; + // Suppose z is index of first zero element in input, + // for element which index is less than z grad is computed as below: + T reverse_cumsum = 0; + for (size_t k = 0; k < first_zero_index; k++) { + const size_t data_offset = (first_zero_index - k - 1) * down_space; + reverse_cumsum += cur_output_ptr[data_offset] * cur_dy_ptr[data_offset]; + cur_dx_ptr[data_offset] = reverse_cumsum / cur_input_ptr[data_offset]; + } + // For where index is z, its grad is computed as below: + if (first_zero_index == space) { continue; } + T cumprod = 1; + T cumsum = 0; + T cumprod_before_first_zero = + first_zero_index == 0 ? 
1 : cur_output_ptr[(first_zero_index - 1) * down_space]; + for (size_t k = first_zero_index; k < space; k++) { + const size_t data_offset = k * down_space; + // Recover dx_ptr default value + if (cur_dx_ptr[data_offset] >= 1) { cur_dx_ptr[data_offset] = 0; } + if (k != first_zero_index) { cumprod *= cur_input_ptr[data_offset]; } + cumsum += cumprod_before_first_zero * cumprod * cur_dy_ptr[data_offset]; } + cur_dx_ptr[first_zero_index * down_space] = cumsum; } - - input_ptr += step; - output_ptr += step; - dy_ptr += step; - dx_ptr += step; } } } // namespace diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index 81d50aa8b2d..efe197e4bc8 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -49,8 +49,11 @@ class DimGatherKernel final : public user_op::OpKernel { const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); - int ndim = input_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + const int& ndim = input_tensor->shape().NumAxes(); + int dim_value = 0; + if (ndim > 0) { dim_value = input_tensor->shape().At(dim); } + + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int64_t dim) -> IDX_T { return static_cast(dim); }); @@ -59,9 +62,9 @@ class DimGatherKernel final : public user_op::OpKernel { DimOpIndexNdHelper input_nd_helper(shape_vec.data(), ndim); shape2dims(index_tensor->shape()); DimOpIndexNdHelper index_nd_helper(shape_vec.data(), ndim); - DimGatherFunctor()( - ctx->stream(), input_nd_helper, index_nd_helper, ndim, index_tensor->shape().elem_cnt(), - input_tensor->shape().At(dim), dim, index, input, output); + DimGatherFunctor()(ctx->stream(), input_nd_helper, index_nd_helper, + ndim, index_tensor->shape().elem_cnt(), dim_value, + dim, index, input, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index e2af3bccaa8..c540f1f8be6 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -106,7 +106,7 @@ OF_DEVICE_FUNC void DoDimScatter(const DimOpIndexNdHelper& src_nd_helper, IDX_T coordinate[kDimGatherMaxDimCount] = {0}; idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk IDX_T idx_elem = index[idx_offset]; - if (idx_elem >= upper_bound) { + if (upper_bound != 0 && idx_elem >= upper_bound) { #if __CUDA_ARCH__ __trap(); #else diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 11ebbdc2e1c..df4721b6c3f 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -51,7 +51,7 @@ class DimScatterKernel final : public user_op::OpKernel { } const int ndim = src_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index e67322b02d2..34fab14c90c 100644 --- 
a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -49,7 +49,7 @@ class DimScatterScalarKernel final : public user_op::OpKernel { } const int ndim = out_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/dropout_kernel.cu b/oneflow/user/kernels/dropout_kernel.cu index b1ac8e577e6..6f05ec435bd 100644 --- a/oneflow/user/kernels/dropout_kernel.cu +++ b/oneflow/user/kernels/dropout_kernel.cu @@ -338,14 +338,13 @@ __global__ RETURN_VOID_IF_DOUBLE FusedDropoutAddGpu( } } -template unsigned int ComputeGridSize(ep::Stream* stream, const int32_t block_size, const int64_t elem_cnt) { auto* cuda_stream = stream->As(); const int32_t max_threads_multi_process = cuda_stream->device_properties().maxThreadsPerMultiProcessor; const int32_t multi_processor_count = cuda_stream->device_properties().multiProcessorCount; unsigned int blocks_per_sm = max_threads_multi_process / block_size; - unsigned int grid_size = ((elem_cnt + block_size - 1) / block_size); + unsigned int grid_size = std::max((int64_t)1, ((elem_cnt + block_size - 1) / block_size)); grid_size = std::min((unsigned int)multi_processor_count * blocks_per_sm, grid_size); return grid_size; } @@ -354,9 +353,9 @@ template void DispatchTail(ep::Stream* stream, uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, const int64_t elem_cnt, float rate, float scale, const T* x, bool* mask, const T* addend, T* y) { - unsigned int grid_size = ComputeGridSize<4>(stream, kBlockSize, elem_cnt); constexpr int pack_size = GetDropoutPackSize(); const int64_t pack_num = elem_cnt / pack_size; + unsigned int grid_size = ComputeGridSize(stream, kBlockSize, pack_num); const int64_t tail_offset = pack_num * pack_size; const int64_t n_tail = elem_cnt - tail_offset; const bool tail = n_tail > 0 ? true : false; diff --git a/oneflow/user/kernels/fused_cross_feature_interaction.cu b/oneflow/user/kernels/fused_cross_feature_interaction.cu new file mode 100644 index 00000000000..687724cb89e --- /dev/null +++ b/oneflow/user/kernels/fused_cross_feature_interaction.cu @@ -0,0 +1,257 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/ep/cuda/cuda_stream.h" + +namespace oneflow { + +namespace { + +enum InteractionMode { kVector = 0, kMatrix }; + +constexpr int kBlockSize = 256; + +void InferMatmulMNK(const ShapeView& a_shape, const ShapeView& b_shape, bool transpose_a, + bool transpose_b, size_t* m, size_t* n, size_t* k) { + const int64_t num_a_axes = a_shape.NumAxes(); + CHECK_GE(num_a_axes, 2); + const int64_t num_b_axes = b_shape.NumAxes(); + CHECK_GE(num_b_axes, 2); + if (!transpose_a) { + *m = a_shape.At(num_a_axes - 2); + *k = a_shape.At(num_a_axes - 1); + } else { + *m = a_shape.At(num_a_axes - 1); + *k = a_shape.At(num_a_axes - 2); + } + if (!transpose_b) { + CHECK_EQ(b_shape.At(num_b_axes - 2), *k); + *n = b_shape.At(num_b_axes - 1); + } else { + CHECK_EQ(b_shape.At(num_b_axes - 1), *k); + *n = b_shape.At(num_b_axes - 2); + } +} + +ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { + return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; +} + +std::unique_ptr NewMatmulPrimitive(DeviceType device_type, + DataType data_type, bool transpose_a, + bool transpose_b) { + const auto trans_a = GetBlasTransposeType(transpose_a); + const auto trans_b = GetBlasTransposeType(transpose_b); + return ep::primitive::NewPrimitive(device_type, data_type, trans_a, + trans_b); +} + +template +std::unique_ptr NewMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, + /*transpose_b=*/true); +} + +auto MatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const user_op::KernelRegContext& ctx) { + return NewMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +__global__ void FusedBiasAddMulAddResidualKernel(const T* in, const T* x, const T* x0, + const T* bias, T* out, const IndexType cols, + const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const IndexType col_idx = linear_index - row_idx * cols; + + const LoadPack* x0_load = reinterpret_cast(x0 + linear_index); + const LoadPack* x_load = reinterpret_cast(x + linear_index); + const LoadPack* bias_load = reinterpret_cast(bias + col_idx); + + LoadPack x0_vec = *x0_load; + LoadPack x_vec = *x_load; + LoadPack bias_vec = *bias_load; + + LoadPack out_store; + if (mode == InteractionMode::kVector) { + T in_val = in[row_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = x0_vec.elem[i] * in_val + bias_vec.elem[i] + x_vec.elem[i]; + } + } else if (mode == InteractionMode::kMatrix) { + const LoadPack* in_load = reinterpret_cast(in + linear_index); + LoadPack in_vec = *in_load; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = (in_vec.elem[i] + bias_vec.elem[i]) * x0_vec.elem[i] + x_vec.elem[i]; + } + } else { + __trap(); + } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +int GetLaunchPackSize(const int64_t cols) { + constexpr int type_pack_size = 
cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +void DispatchFusedBiasAddMulAddResidualPackSize(ep::Stream* stream, const T* in, const T* x, + const T* x0, const T* bias, T* out, + const IndexType cols, const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + cudaError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else if (pack_size == 4) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else if (pack_size == 2) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } +} + +template +void DispatchFusedBiasAddMulAddResidualIndexType(ep::Stream* stream, const T* in, const T* x, + const T* x0, const T* bias, T* out, + const int64_t cols, const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, + elem_cnt); + } else { + DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, + elem_cnt); + } +} + +template +class FusedCrossFeatureInteractionKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedCrossFeatureInteractionKernel() = default; + ~FusedCrossFeatureInteractionKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + /* + Cross Interaction v1: + 1. x matmul weight. matmul_result0 -> (B, E) matmul (1, E) -> (B, 1) + dx = dmatmul_result0 matmul weight + dw = x matmul dmatmul_result0 + + 2. matmul_result0 broadcast_mul x0. matmul_result1 -> (B, 1) broadcast_mul (B, E) -> (B, E) + dmatmul_result0 = reduce_sum(dmatmul_result1 * x0, axis=1) + dx0 = dmatmul_result1 broadcast_mul matmul_result0 + + 3. matmul_result1 broadcast_add bias. matmul_result2 -> (B, E) broadcast_add (1, E) -> (B, E) + dmatmul_result1 = dout + dbias = reduce_sum(dmatmul_result2, axis=0) + + 4. matmul_result2 add x. out -> (B, E) elementwise_add (B, E) -> (B, E) + dmatmul_result2 = dout, dx = dout. + + Cross Interaction Grad: + dw = x matmul dmatmul_result0 + dx0 = dmatmul_result1 broadcast_mul matmul_result0 + dbias = reduce_sum(dmatmul_result2, axis=0) + dx = (dmatmul_result0 matmul weight) + dout. + + Cross Interaction v2: + 1. x matmul weight. matmul_result0 -> (B, E) matmul (E, E) -> (B, E) + + 2. matmul_result0 add bias. matmul_result1 -> (B, E) bias_add (1, E) -> (B, E) + + 3. matmul_result1 multiply x0. matmul_result2 -> (B, E) elementwise_mul (B, E) -> (B, E) + + 4. matmul_result2 add x. 
out -> (B, E) elementwise_add (B, E) -> (B, E) + + */ + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const user_op::Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + const std::string interaction_mode = ctx->Attr("interaction_mode"); + + CHECK_EQ(out->shape().NumAxes(), 2); + size_t m = 0, n = 0, k = 0; + InferMatmulMNK(x->shape(), weight->shape(), /*trans_a=*/false, /*trans_b=*/true, &m, &n, &k); + const double alpha = 1.0; + double beta = 0.0; + auto matmul = NewMatmulPrimitive(ctx); + CHECK(matmul); + matmul->Launch(ctx->stream(), m, n, k, alpha, x->dptr(), weight->dptr(), beta, + matmul_result->mut_dptr()); + const int64_t elem_cnt = out->shape().elem_cnt(); + const int64_t cols = out->shape().At(1); + if (interaction_mode == "vector") { + DispatchFusedBiasAddMulAddResidualIndexType( + ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), + out->mut_dptr(), cols, elem_cnt); + } else { + DispatchFusedBiasAddMulAddResidualIndexType( + ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), + out->mut_dptr(), cols, elem_cnt); + } + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && MatmulPrimitiveExists()); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(half) + +} // namespace + +} // namespace oneflow diff --git a/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu b/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu new file mode 100644 index 00000000000..92ccdc3da01 --- /dev/null +++ b/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu @@ -0,0 +1,454 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/ep/cuda/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = 256; + +void InferMatmulMNK(const DimVector& a_shape, const DimVector& b_shape, bool transpose_a, + bool transpose_b, size_t* m, size_t* n, size_t* k) { + const int64_t num_a_axes = a_shape.size(); + CHECK_GE(num_a_axes, 2); + const int64_t num_b_axes = b_shape.size(); + CHECK_GE(num_b_axes, 2); + if (!transpose_a) { + *m = a_shape.at(num_a_axes - 2); + *k = a_shape.at(num_a_axes - 1); + } else { + *m = a_shape.at(num_a_axes - 1); + *k = a_shape.at(num_a_axes - 2); + } + if (!transpose_b) { + CHECK_EQ(b_shape.at(num_b_axes - 2), *k); + *n = b_shape.at(num_b_axes - 1); + } else { + CHECK_EQ(b_shape.at(num_b_axes - 1), *k); + *n = b_shape.at(num_b_axes - 2); + } +} + +ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { + return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; +} + +template +struct MulOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +struct AddOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +int GetLaunchPackSize(const int64_t cols) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +__global__ void BroadcastMulKernel(const T* x, const T* y, T* out, const IndexType cols, + const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const LoadPack* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec = *x_load; + LoadPack out_store; + const T y_val = y[row_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { out_store.elem[i] = x_vec.elem[i] * y_val; } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +void DispatchBroadcastMulPackSize(ep::Stream* stream, const T* x, const T* y, T* out, + const IndexType cols, const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + cudaError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else if (pack_size == 4) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else if (pack_size == 2) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } +} + +template +void DispatchBroadcastMulIndexType(ep::Stream* stream, const T* x, const T* y, T* out, + const int64_t cols, const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); + } else { + 
DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); + } +} + +template +__global__ void BroadcastAddElementwiseMulKernel(const T* x, const T* y, const T* z, T* out, + const IndexType cols, const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const IndexType col_idx = linear_index - row_idx * cols; + const LoadPack* x_load = reinterpret_cast(x + linear_index); + const LoadPack* y_load = reinterpret_cast(y + col_idx); + const LoadPack* z_load = reinterpret_cast(z + linear_index); + + LoadPack x_vec = *x_load; + LoadPack y_vec = *y_load; + LoadPack z_vec = *z_load; + LoadPack out_store; + +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = (x_vec.elem[i] + y_vec.elem[i]) * z_vec.elem[i]; + } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +void DispatchBroadcastAddElementwiseMulPackSize(ep::Stream* stream, const T* x, const T* y, + const T* z, T* out, const IndexType cols, + const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + cudaError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else if (pack_size == 4) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else if (pack_size == 2) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } +} + +template +void DispatchBroadcastAddElementwiseMulIndexType(ep::Stream* stream, const T* x, const T* y, + const T* z, T* out, const int64_t cols, + const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); + } else { + DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); + } +} + +} // namespace + +namespace user_op { + +std::unique_ptr NewMatmulPrimitive(DeviceType device_type, + DataType data_type, bool transpose_a, + bool transpose_b) { + const auto trans_a = GetBlasTransposeType(transpose_a); + const auto trans_b = GetBlasTransposeType(transpose_b); + return ep::primitive::NewPrimitive(device_type, data_type, trans_a, + trans_b); +} + +template +std::unique_ptr NewReduceMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, + /*transpose_b=*/false); +} + +auto ReduceMatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const KernelRegContext& ctx) { + return NewReduceMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +std::unique_ptr NewWeightGradMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/true, + /*transpose_b=*/false); +} + +auto WeightGradMatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const 
KernelRegContext& ctx) { + return NewWeightGradMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public CudaGraphSupport { + public: + FusedCrossFeatureInteractionGradKernel() = default; + ~FusedCrossFeatureInteractionGradKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { + const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + + const int64_t batch_size = dy->shape().At(0); + const int64_t hidden_size = dy->shape().At(1); + const int64_t out_size = weight->shape().At(0); + const int64_t dy_elem_cnt = dy->shape().elem_cnt(); + + Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); + Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); + Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); + Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + // step1: Get dbias. + const T* ones = nullptr; + auto* cuda_device = dynamic_cast(ctx->stream()->device()); + if (cuda_device != nullptr) { + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); + } + size_t m = 0, n = 0, k = 0; + DimVector dy_shape(2); + dy->shape().ToDimVector(&dy_shape); + DimVector ones_buf_shape(2); + ones_buf_shape.at(0) = 1; + ones_buf_shape.at(1) = batch_size; + InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); + auto reduce_matmul = NewReduceMatmulPrimitive(ctx); + CHECK(reduce_matmul); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, dy->dptr(), 0.0, dbias->mut_dptr()); + + // step2: Get dmatmul_result0. 
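+    // Illustrative note (added for clarity, not part of the original patch): this step computes
+    // dmatmul_result0[b] = sum_e dy[b, e] * x0[b, e], i.e. reduce_sum(dy * x0, axis=1), as an
+    // elementwise multiply followed by a (B, E) x (E, 1) matmul against a ones vector.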
+ T* dy_mul_x0 = reinterpret_cast(tmp_buffer->mut_dptr()); + T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); + OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dy_mul_x0, dy->dptr(), + x0->dptr(), + ctx->stream()->As()->cuda_stream())); + + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), hidden_size)); + DimVector dy_mul_x0_shape(2); + dy->shape().ToDimVector(&dy_mul_x0_shape); + ones_buf_shape.at(0) = hidden_size; + ones_buf_shape.at(1) = 1; + InferMatmulMNK(dy_mul_x0_shape, ones_buf_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dy_mul_x0, ones, 0.0, dmatmul_result0); + + // step3: Get dx + T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T)) + + GetCudaAlignedSize(batch_size * sizeof(T))); + DimVector dmatmul_result_shape(2); + dmatmul_result_shape.at(0) = batch_size; + dmatmul_result_shape.at(1) = 1; // todo change to hidden size + DimVector weight_shape(2); + weight->shape().ToDimVector(&weight_shape); + InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, + reinterpret_cast(dx_buf)); + OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, + dy->dptr(), + ctx->stream()->As()->cuda_stream())); + + // step4: Get dw. + DimVector x_shape(2); + x->shape().ToDimVector(&x_shape); + + InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); + auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); + CHECK(weight_grad_matmul); + weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, + dw->mut_dptr()); + + // step5: Get dx0. 
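+    // Illustrative note (added for clarity, not part of the original patch):
+    // dx0[b, e] = dy[b, e] * matmul_result[b], a per-row broadcast multiply of the (B, E)
+    // gradient by the (B, 1) forward matmul result.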
+ DispatchBroadcastMulIndexType(ctx->stream(), dy->dptr(), matmul_result->dptr(), + dx0->mut_dptr(), hidden_size, dy_elem_cnt); + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction_v1_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ + && (HobDataType("dy", 0) == GetDataType::value) \ + && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ + .SetInferTmpSizeFn([](InferContext* ctx) { \ + size_t tmp_size = 0; \ + const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ + const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ + const int64_t batch_size = dy.shape().At(0); \ + size_t dy_mul_x0_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ + size_t dmatmul_result_size = GetCudaAlignedSize(batch_size * sizeof(dtype)); \ + size_t dx_buf_size = dy_mul_x0_size; \ + tmp_size = dy_mul_x0_size + dmatmul_result_size + dx_buf_size; \ + return tmp_size; \ + }); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(half) + +template +class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public CudaGraphSupport { + public: + FusedCrossFeatureInteractionV2GradKernel() = default; + ~FusedCrossFeatureInteractionV2GradKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { + const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + + const int64_t batch_size = dy->shape().At(0); + const int64_t in_size = weight->shape().At(1); + const int64_t hidden_size = weight->shape().At(0); + const int64_t dy_elem_cnt = dy->shape().elem_cnt(); + + Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); + Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); + Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); + Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + // step1: Get dx0. + DispatchBroadcastAddElementwiseMulIndexType(ctx->stream(), matmul_result->dptr(), + bias->dptr(), dy->dptr(), + dx0->mut_dptr(), hidden_size, dy_elem_cnt); + + // step2: Get dmatmul_result0. 
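+    // Illustrative note (added for clarity, not part of the original patch): unlike v1, this
+    // intermediate keeps the full (B, E) shape; dmatmul_result0[b, e] = dy[b, e] * x0[b, e],
+    // the gradient flowing back through the elementwise multiply with x0.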
+ T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr()); + OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dmatmul_result0, dy->dptr(), + x0->dptr(), + ctx->stream()->As()->cuda_stream())); + // step3: Get dx + T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); + DimVector dmatmul_result_shape(2); + dmatmul_result_shape.at(0) = batch_size; + dmatmul_result_shape.at(1) = hidden_size; + DimVector weight_shape(2); + weight->shape().ToDimVector(&weight_shape); + size_t m = 0, n = 0, k = 0; + InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + auto reduce_matmul = NewReduceMatmulPrimitive(ctx); + CHECK(reduce_matmul); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, + reinterpret_cast(dx_buf)); + OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, + dy->dptr(), + ctx->stream()->As()->cuda_stream())); + + // step4: Get dw. + DimVector x_shape(2); + x->shape().ToDimVector(&x_shape); + + InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); + auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); + CHECK(weight_grad_matmul); + weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, + dw->mut_dptr()); + + // step5: Get dbias. + const T* ones = nullptr; + auto* cuda_device = dynamic_cast(ctx->stream()->device()); + if (cuda_device != nullptr) { + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); + } + DimVector dy_shape(2); + dy->shape().ToDimVector(&dy_shape); + DimVector ones_buf_shape(2); + ones_buf_shape.at(0) = 1; + ones_buf_shape.at(1) = batch_size; + InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, + reinterpret_cast(dmatmul_result0), 0.0, dbias->mut_dptr()); + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction_v2_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ + && (HobDataType("dy", 0) == GetDataType::value) \ + && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ + .SetInferTmpSizeFn([](InferContext* ctx) { \ + size_t tmp_size = 0; \ + const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ + const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ + size_t dmatmul_result_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ + size_t dx_buf_size = dmatmul_result_size; \ + tmp_size = dmatmul_result_size + dx_buf_size; \ + return tmp_size; \ + }); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(half) + +} // namespace user_op + +} // namespace oneflow diff --git a/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu b/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu new file mode 100644 index 00000000000..9b6f6fc431c --- /dev/null +++ b/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu @@ -0,0 +1,478 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/cuda/atomic.cuh" +#include "oneflow/user/kernels/cublas_fused_mlp_util.cuh" +#include "oneflow/user/kernels/dropout_kernel.h" +// CUBLAS_AUX_EPILOGUE only support in cuda11.4 or higher version, in cuda11.4 it need static link. +#if CUDA_VERSION >= 11060 + +namespace oneflow { + +namespace { + +constexpr int32_t kVecSize = 4; +constexpr int32_t kBlockSize = 256; +constexpr int32_t kWarpSize = 32; + +union RandPack4 { + uint4 storage; + uint32_t elem[4]; // store curand4 return val. +}; + +template +__device__ void SetCublasBitMask(const IndexType aux_ld, const IndexType row, const IndexType col, + int32_t thread_bitmask, int32_t* mask) { + IndexType linear_index = row * aux_ld + col; + IndexType mask_index = linear_index / kWarpSize; + IndexType mask_offset = linear_index - mask_index * kWarpSize; + + int32_t bitmask = thread_bitmask << mask_offset; + for (int stride = kWarpSize / (pack_size * 2); stride > 0; stride /= 2) { + bitmask |= __shfl_down_sync(__activemask(), bitmask, stride, kWarpSize); + } + if (mask_offset == 0) { mask[mask_index] = bitmask; } +} + +template +__global__ void FusedVectorizedReluDropoutKernel(uint64_t seed, + one::CUDAGeneratorState* cuda_gen_state, + uint64_t inc_offset, const IndexType elem_cnt, + const int32_t aux_ld, const IndexType cols, + const uint32_t rate, float scale, T* x, + int32_t* mask) { + IndexType global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + RandPack4 rand_uniform_pack4; + T zero_val = static_cast(0.0); + for (IndexType linear_index = global_thread_id * kVecSize, + step = gridDim.x * blockDim.x * kVecSize; + linear_index < elem_cnt; linear_index += step) { + const IndexType row = linear_index / cols; + const IndexType col = linear_index - row * cols; + int32_t thread_bitmask = 0; + + rand_uniform_pack4.storage = curand4(&state); + + LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + LoadPack out_vec; +#pragma unroll + for (int i = 0; i < kVecSize; i++) { + bool relu_mask = true; + if (relu) { + // Relu + relu_mask = x_vec.elem[i] >= zero_val; + } + // dropout + bool mask_val = rand_uniform_pack4.elem[i] > rate; + // Combined relu_mask, dropout_mask together. 
+ bool combined_mask = relu_mask && mask_val; + // Because half/bfloat16 cannot be directly converted from bool, we cast to float first + T t_combined_mask = static_cast(static_cast(combined_mask)); + thread_bitmask |= (combined_mask << i); + out_vec.elem[i] = x_vec.elem[i] * t_combined_mask * t_scale; + } + *(reinterpret_cast(x + linear_index)) = out_vec.storage; + SetCublasBitMask(aux_ld, row, col, thread_bitmask, mask); + } + + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +__global__ void FusedPaddedVectorizedReluDropoutKernel( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const IndexType aligned32_elem_cnt, const int32_t aux_ld, const IndexType aligned32_cols, + const IndexType cols, const uint32_t rate, float scale, T* x, int32_t* mask) { + IndexType global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + RandPack4 rand_uniform_pack4; + T zero_val = static_cast(0.0); + for (IndexType linear_index = global_thread_id * kVecSize, + step = gridDim.x * blockDim.x * kVecSize; + linear_index < aligned32_elem_cnt; linear_index += step) { + const IndexType row = linear_index / aligned32_cols; + const IndexType col = linear_index - row * aligned32_cols; + int32_t thread_bitmask = 0; + + if (col < cols) { + const IndexType actual_index = row * cols + col; + rand_uniform_pack4.storage = curand4(&state); + + LoadType* x_load = reinterpret_cast(x + actual_index); + LoadPack x_vec; + x_vec.storage = *x_load; + LoadPack out_vec; +#pragma unroll + for (int i = 0; i < kVecSize; i++) { + bool relu_mask = true; + if (relu) { + // Relu + relu_mask = x_vec.elem[i] >= zero_val; + } + // dropout + bool mask_val = rand_uniform_pack4.elem[i] > rate; + // Combined relu_mask, dropout_mask together. 
+ bool combined_mask = relu_mask && mask_val; + // Because half/bfloat16 cannot be directly converted from bool, we cast to float first + T t_combined_mask = static_cast(static_cast(combined_mask)); + thread_bitmask |= (combined_mask << i); + out_vec.elem[i] = x_vec.elem[i] * t_combined_mask * t_scale; + } + *(reinterpret_cast(x + actual_index)) = out_vec.storage; + } + SetCublasBitMask(aux_ld, row, col, thread_bitmask, mask); + } + + if (threadIdx.x == 0 && threadIdx.y == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +__global__ void FusedWarpReluDropoutKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, + uint64_t inc_offset, const IndexType elem_cnt, + const IndexType aux_ld, const IndexType rows, + const IndexType cols, const uint32_t rate, float scale, + T* x, int32_t* mask) { + const int32_t lane_id = threadIdx.x; + const IndexType global_warp_id = blockIdx.x * blockDim.y + threadIdx.y; + const IndexType step = gridDim.x * blockDim.y; + const IndexType global_thread_id = global_warp_id * kWarpSize + lane_id; + + curandStatePhilox4_32_10_t state; + curand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + + T t_scale = static_cast(scale); + T zero_val = static_cast(0.0); + RandPack4 rand_uniform_pack4; + + for (IndexType row = global_warp_id; row < rows; row += step) { + for (IndexType col = lane_id; col < cols; col += kWarpSize * kVecSize) { + const IndexType linear_index = row * cols + col; + rand_uniform_pack4.storage = curand4(&state); +#pragma unroll + for (int i = 0; i < kVecSize; i++) { + int32_t thread_bitmask = 0; + int32_t cur_col = col + i * kWarpSize; + int32_t cur_linear_index = linear_index + i * kWarpSize; + if (cur_col < cols) { + T x_val = x[cur_linear_index]; + const uint32_t rand_uniform_val = rand_uniform_pack4.elem[i]; + bool relu_mask = true; + if (relu) { + // relu + relu_mask = x_val >= zero_val; + } + // dropout + bool mask_val = rand_uniform_val > rate; + // Combined relu_mask, dropout_mask together. 
+ bool combined_mask = relu_mask && mask_val; + thread_bitmask = combined_mask; + // Because half/bfloat16 cannot be directly converted from bool, we cast to float type + // first + T t_combined_mask = static_cast(static_cast(combined_mask)); + T out_val = x_val * t_combined_mask * t_scale; + x[cur_linear_index] = out_val; + } + int32_t warp_mask = __ballot_sync(__activemask(), thread_bitmask); + if (lane_id == 0) { mask[(row * aux_ld + cur_col) / kWarpSize] = warp_mask; } + } + } + } + + if (threadIdx.x == 0 && threadIdx.y == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +unsigned int ComputeGridSize(ep::Stream* stream, Func func, const int64_t elem_cnt, + const int32_t block_size) { + auto* cuda_stream = stream->As(); + const int64_t pack_num = elem_cnt / kVecSize; + const int32_t num_blocks = std::max(1, (pack_num + block_size - 1) / block_size); + const int32_t multi_processor_count = cuda_stream->device_properties().multiProcessorCount; + int max_active_blocks = 0; + OF_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, block_size, + /*shared_memory*/ 0)); + return std::min(num_blocks, max_active_blocks * multi_processor_count); +} + +uint64_t RoundUp(uint64_t x, uint64_t y) { return (x + y - 1) / y * y; } + +template +cudaError_t LaunchFusedReluDropoutKernel(ep::CudaStream* stream, uint64_t seed, + one::CUDAGeneratorState* cuda_gen_state, + const int64_t elem_cnt, const int32_t aux_ld, + const int64_t rows, const int64_t cols, float rate, + float scale, T* x, int32_t* mask) { + uint64_t inc_offset = 0; + const uint32_t uint_rate = UINT_MAX * rate; + unsigned int grid_size = 0; + if (cols % 32 == 0) { + // Launch Elementwise Vectorized Kernel. + if (elem_cnt < GetMaxVal()) { + grid_size = ComputeGridSize(stream, FusedVectorizedReluDropoutKernel, + elem_cnt, kBlockSize); + inc_offset = RoundUp((elem_cnt / (kBlockSize * grid_size)), kVecSize); + FusedVectorizedReluDropoutKernel + <<cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, elem_cnt, aux_ld, cols, uint_rate, scale, x, mask); + } else { + grid_size = ComputeGridSize(stream, FusedVectorizedReluDropoutKernel, + elem_cnt, kBlockSize); + inc_offset = RoundUp((elem_cnt / (kBlockSize * grid_size)), kVecSize); + FusedVectorizedReluDropoutKernel + <<cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, elem_cnt, aux_ld, cols, uint_rate, scale, x, mask); + } + } else { + if (cols % 4 == 0) { + // Padding cols to align kWarpSize. 
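+      // Illustrative example (added for clarity, not part of the original patch): with
+      // kWarpSize = 32, cols = 20 is rounded up to align32_cols = 32, so each row is traversed
+      // in whole 32-column chunks; threads whose padded column index is >= cols skip the
+      // load/store and only contribute zero bits to the bitmask.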
+ const int64_t align32_cols = (cols + kWarpSize - 1) / kWarpSize * kWarpSize; + const int64_t align32_elem_cnt = rows * align32_cols; + if (align32_elem_cnt < GetMaxVal()) { + grid_size = + ComputeGridSize(stream, FusedPaddedVectorizedReluDropoutKernel, + align32_elem_cnt, kBlockSize); + inc_offset = RoundUp((elem_cnt / (kBlockSize * grid_size)), kVecSize); + FusedPaddedVectorizedReluDropoutKernel + <<cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, align32_elem_cnt, aux_ld, align32_cols, cols, + uint_rate, scale, x, mask); + } else { + grid_size = + ComputeGridSize(stream, FusedPaddedVectorizedReluDropoutKernel, + align32_elem_cnt, kBlockSize); + inc_offset = RoundUp((elem_cnt / (kBlockSize * grid_size)), kVecSize); + FusedPaddedVectorizedReluDropoutKernel + <<cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, align32_elem_cnt, aux_ld, align32_cols, cols, + uint_rate, scale, x, mask); + } + } else { + // Process a row by using a warp. + dim3 block_dim(kWarpSize, kBlockSize / kWarpSize); + if (elem_cnt < GetMaxVal()) { + grid_size = ComputeGridSize(stream, FusedWarpReluDropoutKernel, elem_cnt, + kBlockSize); + inc_offset = RoundUp((elem_cnt / (kBlockSize * grid_size)), kVecSize); + FusedWarpReluDropoutKernel + <<cuda_stream()>>>(seed, cuda_gen_state, inc_offset, + elem_cnt, aux_ld, rows, cols, + uint_rate, scale, x, mask); + } else { + grid_size = ComputeGridSize(stream, FusedWarpReluDropoutKernel, elem_cnt, + kBlockSize); + inc_offset = RoundUp((elem_cnt / (kBlockSize * grid_size)), kVecSize); + FusedWarpReluDropoutKernel + <<cuda_stream()>>>(seed, cuda_gen_state, inc_offset, + elem_cnt, aux_ld, rows, cols, + uint_rate, scale, x, mask); + } + } + } + return cudaPeekAtLastError(); +} + +template +class FusedMatmulBiasAddReluDropoutKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedMatmulBiasAddReluDropoutKernel() = default; + ~FusedMatmulBiasAddReluDropoutKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateCublasFusedMLPKernelCache(); + } + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const auto& generator = CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA)); + return std::make_shared(generator); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + /* + Fused DenseActivation Layer. Assume we have two layers: + A: (m, k) + B: (n, k) need transpose + C: (j, n) need transpose + tmp: A matmul B(transpose), its shape is (m, n) + out: tmp matmul C(transpose), its shape is (m, j) + */ + const int32_t weight_size = ctx->input_size("weights"); + const int32_t bias_size = ctx->input_size("biases"); + CHECK_EQ(weight_size, bias_size) << "The number of weight and bias is not equal!. 
"; + auto* cuda_stream = ctx->stream()->As(); + const auto* matmul_cache = CHECK_NOTNULL(dynamic_cast(cache)); + + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + bool skip_final_activation = ctx->Attr("skip_final_activation"); + + auto* fused_dropout_kernel_state = dynamic_cast(state); + CHECK_NOTNULL(fused_dropout_kernel_state); + const auto& generator = fused_dropout_kernel_state->generator(); + CHECK_NOTNULL(generator); + const auto device_index = ctx->stream()->device()->device_index(); + std::shared_ptr cuda_generator = + CHECK_JUST(generator->Get(device_index)); + uint64_t seed = cuda_generator->current_seed(); + const std::vector dropout_rate_list = ctx->Attr>("dropout_rate_list"); + one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); + + const DataType data_type = out->data_type(); + const cublasComputeType_t cublas_compute_dtype = GetComputeType(data_type); + const cudaDataType_t cuda_data_type = GetCudaDataType(data_type); + size_t cublas_m = 0, cublas_n = 0, cublas_k = 0; + int64_t cublas_lda = 0, cublas_ldb = 0, cublas_ldc = 0; + + const double alpha = 1.0; + const auto sp_alpha = GetCublasScalarParameter(alpha, cublas_compute_dtype); + const double beta = 0.0; + const auto sp_beta = GetCublasScalarParameter(beta, cublas_compute_dtype); + + // Currently only support 2D matmul. + DimVector in_shape(2); + x->shape().ToDimVector(&in_shape); + DimVector weight_shape(2); + + const void* in_buf_ptr = x->dptr(); + size_t offset = 0; + for (int idx = 0; idx < weight_size; idx++) { + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weights", idx); + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("biases", idx); + user_op::Tensor* cublas_aux = ctx->Tensor4ArgNameAndIndex("cublas_aux", idx); + + const int64_t batchsize = in_shape.at(0); + const int64_t out_feature = weight->shape().At(0); + weight->shape().ToDimVector(&weight_shape); + size_t matmul_out_elem_cnt = batchsize * out_feature; + + InferMatmulCublasMNK(in_shape, weight_shape, + /*transpose_a=*/ep::primitive::BlasTransposeType::N, + /*transpose_b=*/ep::primitive::BlasTransposeType::T, &cublas_m, + &cublas_n, &cublas_k, &cublas_lda, &cublas_ldb, &cublas_ldc); + + cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; + void* matmul_out_ptr; + + float rate = dropout_rate_list.at(idx); + float scale = 0.0; + const int32_t aux_ld = AlignReluAuxLd(out_feature); + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + + if (idx == weight_size - 1) { + matmul_out_ptr = ctx->Tensor4ArgNameAndIndex("out", 0)->mut_dptr(); + } else { + matmul_out_ptr = ctx->Tensor4ArgNameAndIndex("hidden", idx)->mut_dptr(); + } + SetCublasAttr(matmul_cache, cublas_compute_dtype, cuda_data_type, /*need_aux=*/false, + /*transpose_a=*/ep::primitive::BlasTransposeType::N, + /*transpose_b=*/ep::primitive::BlasTransposeType::T, epilogue, bias->dptr(), + /*aux_ptr=*/nullptr, cublas_m, cublas_n, cublas_k, cublas_lda, cublas_ldb, + cublas_ldc); + + OF_CUBLAS_CHECK(cublasLtMatmul( + cuda_stream->cublas_lt_handle(), matmul_cache->operation_desc, &sp_alpha, weight->dptr(), + matmul_cache->cublas_a_desc, in_buf_ptr, matmul_cache->cublas_b_desc, &sp_beta, + matmul_out_ptr, matmul_cache->cublas_c_desc, matmul_out_ptr, matmul_cache->cublas_c_desc, + nullptr, cuda_stream->cublas_workspace(), cuda_stream->cublas_workspace_size(), + cuda_stream->cuda_stream())); + + if (idx != weight_size - 1 || !skip_final_activation || rate != 0.0f) { + 
OF_CUDA_CHECK(cudaMemsetAsync(cublas_aux->mut_dptr(), 0, + cublas_aux->shape().elem_cnt() * sizeof(int32_t), + cuda_stream->cuda_stream())); + } + + if (idx != weight_size - 1 || !skip_final_activation) { + // If it's not last layer or it's last layer but need relu. + OF_CUDA_CHECK((LaunchFusedReluDropoutKernel( + cuda_stream, seed, cuda_gen_state, matmul_out_elem_cnt, aux_ld, batchsize, out_feature, + rate, scale, reinterpret_cast(matmul_out_ptr), + reinterpret_cast(cublas_aux->mut_dptr())))); + // Set relu_droput_out ptr as next layer's input. + in_buf_ptr = matmul_out_ptr; + // Set hidden_layer shape as next layer's input shape. + in_shape.at(1) = out_feature; + } else { + if (rate == 0.0f) { + // It's last layer and dropout_rate is 0.0f, we do not launch FusedReluDropoutKernel. + break; + } else { + // skip_final_activation but need dropout. + OF_CUDA_CHECK((LaunchFusedReluDropoutKernel( + cuda_stream, seed, cuda_gen_state, matmul_out_elem_cnt, aux_ld, batchsize, + out_feature, rate, scale, reinterpret_cast(matmul_out_ptr), + reinterpret_cast(cublas_aux->mut_dptr())))); + } + } + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_MATMUL_BIAS_ADD_RELU_DROPOUT_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("fused_matmul_bias_add_relu_dropout") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == data_type)); + +REGISTER_FUSED_MATMUL_BIAS_ADD_RELU_DROPOUT_KERNEL_GPU(float, DataType::kFloat) +REGISTER_FUSED_MATMUL_BIAS_ADD_RELU_DROPOUT_KERNEL_GPU(half, DataType::kFloat16) +#if CUDA_VERSION >= 11000 +REGISTER_FUSED_MATMUL_BIAS_ADD_RELU_DROPOUT_KERNEL_GPU(nv_bfloat16, DataType::kBFloat16) +#endif + +} // namespace + +} // namespace oneflow + +#endif // CUDA_VERSION >= 11060 diff --git a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu new file mode 100644 index 00000000000..85dc3d492df --- /dev/null +++ b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu @@ -0,0 +1,148 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/cuda/elementwise.cuh" + +namespace oneflow { + +namespace { + +constexpr int32_t kWarpSize = 32; + +template +__global__ void VectorizedReluDropoutBitmaskBackwardKernel( + const IndexType elem_cnt, const IndexType cols, const IndexType aux_ld, const float scale, + const IndexType n_tail, const IndexType tail_offset, const T* dy, const int32_t* mask, T* dx) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + using LoadStoreType = cuda::elementwise::PackType; + using LoadStorePack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + for (IndexType linear_pack_index = global_thread_id * pack_size; linear_pack_index < elem_cnt; + linear_pack_index += gridDim.x * blockDim.x * pack_size) { + const LoadStoreType* dy_load = reinterpret_cast(dy + linear_pack_index); + LoadStorePack dy_vec; + dy_vec.storage = *dy_load; + + LoadStorePack dx_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + const IndexType linear_index = (linear_pack_index + i); + const IndexType row = linear_index / cols; + const IndexType col = linear_index - row * cols; + const int32_t col_mod_warpsize = col % kWarpSize; + const IndexType aux_idx = ((row * aux_ld) + col) / kWarpSize; + bool is_positive = mask[aux_idx] & (1 << col_mod_warpsize); + dx_vec.elem[i] = + dy_vec.elem[i] * static_cast(static_cast(is_positive)) * static_cast(scale); + } + *(reinterpret_cast(dx + linear_pack_index)) = dx_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const IndexType tail_index = tail_offset + global_thread_id; + const IndexType tail_row = tail_index / cols; + const IndexType tail_col = tail_index - tail_row * cols; + const IndexType tail_col_mod_warpsize = tail_col % kWarpSize; + const IndexType tail_aux_idx = ((tail_row * aux_ld) + tail_col) / kWarpSize; + bool is_positive = mask[tail_aux_idx] & (1 << tail_col_mod_warpsize); + dx[tail_index] = + dy[tail_index] * static_cast(static_cast(is_positive)) * static_cast(scale); + } +} + +template +void LaunchVectorizedReluDropoutBackwardKernel(ep::Stream* stream, const int64_t elem_cnt, + const int64_t cols, const int64_t aux_ld, + float scale, const T* dy, const int32_t* mask, + T* dx) { + constexpr int pack_size = cuda::elementwise::PackSize(); + const int64_t pack_num = elem_cnt / pack_size; + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? 
true : false; + if (tail) { + if (elem_cnt < GetMaxVal()) { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, + mask, dx); + } else { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, + mask, dx); + } + } else { + if (elem_cnt < GetMaxVal()) { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, + dy, mask, dx); + } else { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, + dy, mask, dx); + } + } +} + +template +class FusedReluDropoutGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedReluDropoutGradKernel() = default; + ~FusedReluDropoutGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale = ctx->Attr("scale"); + + const int64_t cols = dy->shape().At(1); + const int64_t aux_ld = mask->shape().At(1) * 32; + const int64_t elem_cnt = dy->shape().elem_cnt(); + LaunchVectorizedReluDropoutBackwardKernel( + ctx->stream(), elem_cnt, cols, aux_ld, scale, reinterpret_cast(dy->dptr()), + mask->dptr(), reinterpret_cast(dx->mut_dptr())); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("fused_relu_dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == data_type)); + +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat) +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16) +#if CUDA_VERSION >= 11000 +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(nv_bfloat16, DataType::kBFloat16) +#endif + +} // namespace + +} // namespace oneflow diff --git a/oneflow/user/kernels/gather_kernel_util.cpp b/oneflow/user/kernels/gather_kernel_util.cpp index bec965bdf12..88705ca4bff 100644 --- a/oneflow/user/kernels/gather_kernel_util.cpp +++ b/oneflow/user/kernels/gather_kernel_util.cpp @@ -85,7 +85,7 @@ void GatherKernelUtilImpl::Forward(ep::Stream* stream, c const T* from = in + outer_idx * gather_dim_size * inner_dim_size + idx * inner_dim_size; std::copy(from, from + inner_dim_size, to); } else { - std::memset(reinterpret_cast(to), 0, inner_dim_size * sizeof(K)); + std::memset(reinterpret_cast(to), 0, inner_dim_size * sizeof(T)); } } } diff --git a/oneflow/user/kernels/image_preprocess_kernels.cu b/oneflow/user/kernels/image_preprocess_kernels.cu index 30fda3bd96d..2b2e287e69c 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.cu +++ b/oneflow/user/kernels/image_preprocess_kernels.cu @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/ep/cuda/cuda_stream.h" diff --git a/oneflow/user/kernels/max_pool_kernel_util.h b/oneflow/user/kernels/max_pool_kernel_util.h index c62bf3bd3cd..821aa2020e0 100644 --- a/oneflow/user/kernels/max_pool_kernel_util.h +++ b/oneflow/user/kernels/max_pool_kernel_util.h @@ -41,7 +41,7 @@ namespace oneflow { #define POOL_DATA_TYPE_CUDA_SEQ POOL_DATA_TYPE_SEQ -typedef fixed_vector FixedDimVector; +typedef small_vector FixedDimVector; template struct DeviceAdd { diff --git a/oneflow/user/kernels/slice_kernel.cpp b/oneflow/user/kernels/slice_kernel.cpp index c89e1513f2e..691ee1b810e 100644 --- a/oneflow/user/kernels/slice_kernel.cpp +++ b/oneflow/user/kernels/slice_kernel.cpp @@ -329,30 +329,6 @@ DEFINE_STATIC_SWITCH_FUNC( )); #undef MAKE_WRITE_SLICE_SWITCH_ENTRY -std::shared_ptr CreateSliceCache(user_op::KernelCacheContext* ctx, - const std::string& large_tensor_name) { - SliceContext slice_ctx; - if (ctx->parallel_ctx().parallel_num() == 1) { - // split_axis == SPLIT_AXIS_FOR_NON_SPLIT means the sbp attribute is not 'split' - CHECK_JUST(slice_ctx.PushSplitInfo(SPLIT_AXIS_FOR_NON_SPLIT, 0, 0, 0)); - } else { - const NdSbp& in_nd_sbp = ctx->NdSbp4ArgNameAndIndex(large_tensor_name, 0); - const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); - const Shape& logical_shape = - ctx->LogicalTensorDesc4ArgNameAndIndex(large_tensor_name, 0)->shape(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const TensorSliceView& slice_view = - GetTensorSliceView4ParallelId(parallel_hierarchy, in_nd_sbp, logical_shape, parallel_id); - for (int i = 0; i < logical_shape.NumAxes(); ++i) { - const Range& range = slice_view.At(i); - if (range.begin() != 0 || range.end() != logical_shape.At(i)) { - CHECK_JUST(slice_ctx.PushSplitInfo(i, range.begin(), range.end(), logical_shape.At(i))); - } - } - } - return std::make_shared>(slice_ctx); -} - template class LogicalSliceKernel final : public user_op::OpKernel { public: @@ -361,7 +337,25 @@ class LogicalSliceKernel final : public user_op::OpKernel { std::shared_ptr InitOpKernelCache( user_op::KernelCacheContext* ctx) const override { - return CreateSliceCache(ctx, "x"); + SliceContext slice_ctx; + if (ctx->parallel_ctx().parallel_num() == 1) { + // split_axis == SPLIT_AXIS_FOR_NON_SPLIT means the sbp attribute is not 'split' + CHECK_JUST(slice_ctx.PushSplitInfo(SPLIT_AXIS_FOR_NON_SPLIT, 0, 0, 0)); + } else { + const NdSbp& in_nd_sbp = ctx->NdSbp4ArgNameAndIndex("x", 0); + const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); + const Shape& logical_shape = ctx->LogicalTensorDesc4ArgNameAndIndex("x", 0)->shape(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + const TensorSliceView& slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, in_nd_sbp, logical_shape, parallel_id); + for (int i = 0; i < logical_shape.NumAxes(); ++i) { + const Range& range = slice_view.At(i); + if (range.begin() != 0 || range.end() != logical_shape.At(i)) { + CHECK_JUST(slice_ctx.PushSplitInfo(i, range.begin(), range.end(), logical_shape.At(i))); + } + } + } + return std::make_shared>(slice_ctx); } private: @@ -388,15 +382,39 @@ class LogicalSliceAssignKernel final : public user_op::OpKernel { std::shared_ptr InitOpKernelCache( user_op::KernelCacheContext* ctx) const override { - if 
(ctx->parallel_ctx().parallel_num() > 1) { - const NdSbp& value_nd_sbp = ctx->NdSbp4ArgNameAndIndex("value", 0); - CHECK(std::all_of(value_nd_sbp.sbp_parallel().begin(), value_nd_sbp.sbp_parallel().end(), - [](const SbpParallel& sbp) { - return sbp.has_partial_sum_parallel() || sbp.has_broadcast_parallel(); - })) - << "value's sbp must be broadcast or partial_sum"; + SliceContext slice_ctx; + if (ctx->parallel_ctx().parallel_num() == 1) { + // split_axis == SPLIT_AXIS_FOR_NON_SPLIT means the sbp attribute is not 'split' + CHECK_JUST(slice_ctx.PushSplitInfo(SPLIT_AXIS_FOR_NON_SPLIT, 0, 0, 0)); + } else { + const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); + NdSbp ref_nd_sbp = ctx->NdSbp4ArgNameAndIndex("ref", 0); + { + const NdSbp value_nd_sbp = ctx->NdSbp4ArgNameAndIndex("value", 0); + // If ref and value both split in the same axis(full slice), + // we can consider the physical tensor is broadcast in this axis. + for (int i = 0; i < parallel_hierarchy.NumAxes(); ++i) { + const SbpParallel& ref_sbp = ref_nd_sbp.sbp_parallel(i); + const SbpParallel& value_sbp = value_nd_sbp.sbp_parallel(i); + if (ref_sbp.has_split_parallel() && value_sbp.has_split_parallel()) { + CHECK_EQ(ref_sbp.split_parallel().axis(), value_sbp.split_parallel().axis()); + ref_nd_sbp.mutable_sbp_parallel(i)->clear_split_parallel(); + ref_nd_sbp.mutable_sbp_parallel(i)->mutable_broadcast_parallel(); + } + } + } + const Shape& logical_shape = ctx->LogicalTensorDesc4ArgNameAndIndex("ref", 0)->shape(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + const TensorSliceView& slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, ref_nd_sbp, logical_shape, parallel_id); + for (int i = 0; i < logical_shape.NumAxes(); ++i) { + const Range& range = slice_view.At(i); + if (range.begin() != 0 || range.end() != logical_shape.At(i)) { + CHECK_JUST(slice_ctx.PushSplitInfo(i, range.begin(), range.end(), logical_shape.At(i))); + } + } } - return CreateSliceCache(ctx, "ref"); + return std::make_shared>(slice_ctx); } private: diff --git a/oneflow/user/ops/arange_op.cpp b/oneflow/user/ops/arange_op.cpp index 225f3fa37cf..73585347376 100644 --- a/oneflow/user/ops/arange_op.cpp +++ b/oneflow/user/ops/arange_op.cpp @@ -84,8 +84,9 @@ namespace oneflow { const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; diff --git a/oneflow/user/ops/constant_op.cpp b/oneflow/user/ops/constant_op.cpp index 8cf14f3b22f..62d9bdcc050 100644 --- a/oneflow/user/ops/constant_op.cpp +++ b/oneflow/user/ops/constant_op.cpp @@ -29,8 +29,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); 
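Note on the repeated GetTensorSliceView4ParallelId changes (here and in the distribution, eager_nccl, empty, randperm and image_preprocess ops below): binding the returned view to a named local before calling .shape() most likely avoids keeping a const Shape& into a temporary TensorSliceView, since lifetime extension does not apply to references returned from member functions. A minimal stand-alone sketch of that pitfall, using hypothetical stand-in types rather than the real OneFlow classes:

    #include <iostream>
    #include <vector>

    struct Shape { std::vector<long> dims; };

    struct TensorSliceView {
      Shape shape_;
      const Shape& shape() const { return shape_; }  // reference into this object
    };

    TensorSliceView MakeView() { return TensorSliceView{Shape{{2, 5}}}; }

    int main() {
      // Dangling: the temporary returned by MakeView() is destroyed at the end of the
      // full expression, so the reference would point at a dead Shape.
      // const Shape& bad = MakeView().shape();

      // Safe: keep the owning view alive for as long as the reference is used.
      const auto view = MakeView();
      const Shape& good = view.shape();
      std::cout << good.dims[0] << "x" << good.dims[1] << "\n";
      return 0;
    }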
diff --git a/oneflow/user/ops/cublas_fused_mlp_op.cpp b/oneflow/user/ops/cublas_fused_mlp_op.cpp index dc806e15c09..65619b14d0c 100644 --- a/oneflow/user/ops/cublas_fused_mlp_op.cpp +++ b/oneflow/user/ops/cublas_fused_mlp_op.cpp @@ -64,8 +64,7 @@ Maybe InferTensorDesc4FusedMatmul(user_op::InferContext* ctx) { cublas_aux_ld = n; // Set Middle result shape. long cublas_aligned_aux_ld = AlignReluAuxLd(cublas_aux_ld); - int64_t aux_size = - cublas_aligned_aux_ld / GetSizeOfDataType(DataType::kInt8); // Cause we use int8_t as dtype + int64_t aux_size = cublas_aligned_aux_ld / 32; // Cause we use int32_t as dtype *ctx->OutputShape("cublas_aux", idx) = Shape({m, aux_size}); *ctx->OutputShape("hidden", idx) = Shape({m, n}); // Set for next layer. @@ -94,7 +93,7 @@ Maybe InferDataType4Matmul(user_op::InferContext* ctx) { for (int32_t i = 0; i < ctx->output_size("cublas_aux"); i++) { user_op::TensorDesc* aux_desc = ctx->OutputTensorDesc("cublas_aux", i); - *aux_desc->mut_data_type() = DataType::kInt8; + *aux_desc->mut_data_type() = DataType::kInt32; } return Maybe::Ok(); @@ -156,30 +155,22 @@ REGISTER_USER_OP_GRAD("cublas_fused_mlp") last_bias_grad = op.GetGradTensorWithOpOutput("out", 0); } - // step2: use CublasFusedMatmulBiasAddGrad to get last layer's bias grad and weight grad. - std::string last_layer_x = op.input("x", 0); - if (weight_num >= 2) { last_layer_x = op.output("hidden", weight_num - 2); } - - user_op::UserOpConfWrapperBuilder cublas_matmul_bias_add_grad_builder( - op.op_name() + "_cublas_matmul_bias_add_grad"); - user_op::UserOpConfWrapper cublas_matmul_bias_add_grad_op = - cublas_matmul_bias_add_grad_builder.Op("cublas_matmul_bias_add_grad") - .Input("dy", last_bias_grad) - .Input("x", last_layer_x) - .Output("w_grad") - .Output("b_grad") - .Build(); - AddOp(cublas_matmul_bias_add_grad_op); - + // step2: use reduce_sum to get last layer's bias grad. + // TODO: Currently Only support 2d fused_matmul. + // so here we hard encode bias reduce axis as 0. + std::vector reduce_axes_vec{0}; + user_op::UserOpConfWrapperBuilder bias_grad_builder(op.op_name() + "_bias_grad"); + user_op::UserOpConfWrapper bias_grad_op = bias_grad_builder.Op("reduce_sum") + .Input("input_tensor", last_bias_grad) + .Output("output_tensor") + .Attr("axis", reduce_axes_vec) + .Attr("keepdims", false) + .Build(); + AddOp(bias_grad_op); if (op.NeedGenGradTensor4OpInput("biases", weight_num - 1)) { - op.BindGradTensorWithOpInput(cublas_matmul_bias_add_grad_op.output("b_grad", 0), "biases", + op.BindGradTensorWithOpInput(bias_grad_op.output("output_tensor", 0), "biases", weight_num - 1); } - if (op.NeedGenGradTensor4OpInput("weights", weight_num - 1)) { - op.BindGradTensorWithOpInput(cublas_matmul_bias_add_grad_op.output("w_grad", 0), "weights", - weight_num - 1); - } - std::string cublas_dy = last_bias_grad; for (int32_t hidden_layer_idx = weight_num - 1; hidden_layer_idx > 0; hidden_layer_idx--) { user_op::UserOpConfWrapperBuilder cublas_bias_add_relu_matmul_grad_builder( @@ -189,6 +180,7 @@ REGISTER_USER_OP_GRAD("cublas_fused_mlp") .Input("dy", cublas_dy) .Input("weight", op.input("weights", hidden_layer_idx)) .Input("aux", op.output("cublas_aux", hidden_layer_idx - 1)) + .Attr("alpha", 1.0) .Output("d_grad") .Output("d_bias") .Build(); @@ -199,22 +191,19 @@ REGISTER_USER_OP_GRAD("cublas_fused_mlp") hidden_layer_idx - 1); // previous layers bias grad } - // dw, need to skip final layer, cause final layer's wgrad has used CublasMatmulBiasAddGrad - // to calculate. 
- if (op.NeedGenGradTensor4OpInput("weights", hidden_layer_idx) - && hidden_layer_idx != (weight_num - 1)) { - user_op::UserOpConfWrapperBuilder matmul_weight_grad_builder( - op.op_name() + "_matmul_a_grad_" + std::to_string(hidden_layer_idx)); - user_op::UserOpConfWrapper matmul_weight_grad_op = - matmul_weight_grad_builder.Op("matmul") - .Input("a", cublas_dy) - .Input("b", op.output("hidden", hidden_layer_idx - 1)) - .Output("out") - .Attr("transpose_a", true) - .Attr("transpose_b", false) - .Attr("alpha", 1.0) - .Build(); - AddOp(matmul_weight_grad_op); + user_op::UserOpConfWrapperBuilder matmul_weight_grad_builder( + op.op_name() + "_matmul_a_grad_" + std::to_string(hidden_layer_idx)); + user_op::UserOpConfWrapper matmul_weight_grad_op = + matmul_weight_grad_builder.Op("matmul") + .Input("a", cublas_dy) + .Input("b", op.output("hidden", hidden_layer_idx - 1)) + .Output("out") + .Attr("transpose_a", true) + .Attr("transpose_b", false) + .Attr("alpha", 1.0) + .Build(); + AddOp(matmul_weight_grad_op); + if (op.NeedGenGradTensor4OpInput("weights", hidden_layer_idx)) { op.BindGradTensorWithOpInput(matmul_weight_grad_op.output("out", 0), "weights", hidden_layer_idx); } @@ -223,7 +212,7 @@ REGISTER_USER_OP_GRAD("cublas_fused_mlp") } // For the first layer, we need to use 2 matmul to get grads. - std::string last_dy = last_bias_grad; + std::string last_dy; if (weight_num != 1) { last_dy = cublas_dy; } // dx: user_op::UserOpConfWrapperBuilder matmul_input_grad_builder(op.op_name() @@ -240,22 +229,19 @@ REGISTER_USER_OP_GRAD("cublas_fused_mlp") if (op.NeedGenGradTensor4OpInput("x", 0)) { op.BindGradTensorWithOpInput(matmul_input_grad_op.output("out", 0), "x", 0); } - - if (op.NeedGenGradTensor4OpInput("weights", 0) && weight_num >= 2) { - // dw: - user_op::UserOpConfWrapperBuilder matmul_weight_grad_builder(op.op_name() - + "_matmul_input_weight_grad"); - user_op::UserOpConfWrapper matmul_weight_grad_op = matmul_weight_grad_builder.Op("matmul") - .Input("a", last_dy) - .Input("b", op.input("x", 0)) - .Output("out") - .Attr("transpose_a", true) - .Attr("transpose_b", false) - .Attr("alpha", 1.0) - .Build(); - AddOp(matmul_weight_grad_op); - // If weight_num == 1, dw has been calculated by CublasMatmulBiasAddGrad, so we need to - // skip. 
+ // dw: + user_op::UserOpConfWrapperBuilder matmul_weight_grad_builder(op.op_name() + + "_matmul_input_weight_grad"); + user_op::UserOpConfWrapper matmul_weight_grad_op = matmul_weight_grad_builder.Op("matmul") + .Input("a", last_dy) + .Input("b", op.input("x", 0)) + .Output("out") + .Attr("transpose_a", true) + .Attr("transpose_b", false) + .Attr("alpha", 1.0) + .Build(); + AddOp(matmul_weight_grad_op); + if (op.NeedGenGradTensor4OpInput("weights", 0)) { op.BindGradTensorWithOpInput(matmul_weight_grad_op.output("out", 0), "weights", 0); } diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index fa7bd0815a0..4e9c23b663b 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -22,16 +22,18 @@ namespace oneflow { /* static */ Maybe DimGatherOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const user_op::TensorDesc& in = ctx->InputTensorDesc("input", 0); int64_t input_num_axes = in.shape().NumAxes(); - CHECK_GT_OR_RETURN(input_num_axes, 0); + // For 0-dim tensor + CHECK_GE_OR_RETURN(input_num_axes, 0); // NOLINT CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); const user_op::TensorDesc& index = ctx->InputTensorDesc("index", 0); int64_t index_num_axes = index.shape().NumAxes(); const int32_t dim = ctx->Attr("dim"); + // For 0-dim tensor CHECK_GE_OR_RETURN(dim, 0); - CHECK_LT_OR_RETURN(dim, input_num_axes); - CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); + CHECK_LE_OR_RETURN(dim, input_num_axes); // NOLINT + if (input_num_axes > 0) { CHECK_GE_OR_RETURN(input_num_axes, index_num_axes); } // NOLINT CHECK_EQ_OR_RETURN(in.is_dynamic(), index.is_dynamic()); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 99e090994e0..60ef6283774 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -34,7 +34,8 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { // check index.numaxes == src.num_axes == input/like.numaxes int64_t src_num_axes = src.shape().NumAxes(); - CHECK_GT_OR_RETURN(src_num_axes, 0); + // For 0-dim Tensor + CHECK_GE_OR_RETURN(src_num_axes, 0); // NOLINT CHECK_LE_OR_RETURN(src_num_axes, user_op::kDimGatherMaxDimCount); int64_t index_num_axes = index.shape().NumAxes(); CHECK_EQ_OR_RETURN(src_num_axes, index_num_axes); @@ -47,7 +48,14 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { } else { OF_UNIMPLEMENTED() << "Input tensor and like tensor cannot be empty simultaneously."; } - CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); + // For 0-dim Tensor + if (output_num_axes != 0 && index_num_axes != 0) { + CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); // NOLINT + } else if (output_num_axes != 0) { + CHECK_LE_OR_RETURN(output_num_axes, 1); // NOLINT + } else { + CHECK_LE_OR_RETURN(index_num_axes, 1); // NOLINT + } // check index.shape(i) <= input/like.shape(i) FOR_RANGE(int64_t, i, 0, index_num_axes) { @@ -79,7 +87,8 @@ Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { // check index.numaxes == src.num_axes == input/like.numaxes int64_t output_num_axes = input.shape().NumAxes(); int64_t index_num_axes = index.shape().NumAxes(); - CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); + // For 0-dim tensor + CHECK_GE_OR_RETURN(output_num_axes, index_num_axes); // NOLINT // check index.shape(i) <= input/like.shape(i) FOR_RANGE(int64_t, i, 0, index_num_axes) { diff --git a/oneflow/user/ops/distributions/normal_op.cpp b/oneflow/user/ops/distributions/normal_op.cpp index 5af64e0d3bb..736a70e5d0b 
100644 --- a/oneflow/user/ops/distributions/normal_op.cpp +++ b/oneflow/user/ops/distributions/normal_op.cpp @@ -32,8 +32,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/uniform_int_op.cpp b/oneflow/user/ops/distributions/uniform_int_op.cpp index 9e79e69c4e5..f01bb710f3c 100644 --- a/oneflow/user/ops/distributions/uniform_int_op.cpp +++ b/oneflow/user/ops/distributions/uniform_int_op.cpp @@ -35,8 +35,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/uniform_op.cpp b/oneflow/user/ops/distributions/uniform_op.cpp index 206a27426d8..b7d566aac49 100644 --- a/oneflow/user/ops/distributions/uniform_op.cpp +++ b/oneflow/user/ops/distributions/uniform_op.cpp @@ -35,8 +35,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/eager_nccl_ops.cpp b/oneflow/user/ops/eager_nccl_ops.cpp index 1399ea4d97a..bd4cdda1367 100644 --- a/oneflow/user/ops/eager_nccl_ops.cpp +++ b/oneflow/user/ops/eager_nccl_ops.cpp @@ -133,8 +133,9 @@ namespace oneflow { const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, in_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, in_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *out_shape = physical_shape; } else { *out_shape = in_shape; diff --git a/oneflow/user/ops/empty_op.cpp b/oneflow/user/ops/empty_op.cpp index f2060b49950..4489902d730 100644 --- a/oneflow/user/ops/empty_op.cpp +++ b/oneflow/user/ops/empty_op.cpp @@ -30,8 +30,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = 
ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; *ctx->OutputStride("out", 0) = Stride(physical_shape); diff --git a/oneflow/user/ops/fused_cross_feature_interaction_op.cpp b/oneflow/user/ops/fused_cross_feature_interaction_op.cpp new file mode 100644 index 00000000000..0dfce53893d --- /dev/null +++ b/oneflow/user/ops/fused_cross_feature_interaction_op.cpp @@ -0,0 +1,181 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/op_generated.h" + +namespace oneflow { + +/* static */ Maybe FusedCrossFeatureInteractionOp::InferLogicalTensorDesc( + user_op::InferContext* ctx) { + const Shape& x_shape = ctx->InputShape("x", 0); + const Shape& weight_shape = ctx->InputShape("weight", 0); + CHECK_EQ_OR_RETURN(x_shape.At(1), weight_shape.At(1)) << "Matmul K dims should be equal. "; + *ctx->OutputShape("matmul_result", 0) = Shape({x_shape.At(0), weight_shape.At(0)}); + const Shape& x0_shape = ctx->InputShape("x0", 0); + const Shape& bias_shape = ctx->InputShape("bias", 0); + CHECK_EQ_OR_RETURN(bias_shape.At(0), x0_shape.At(1)) << "Bias dim should be equal to X0 dim1. 
"; + *ctx->OutputShape("out", 0) = x0_shape; + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionOp::InferPhysicalTensorDesc( + user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe FusedCrossFeatureInteractionOp::GetSbp(user_op::SbpContext* ctx) { + ctx->NewBuilder() + .Split(user_op::OpArg("x", 0), 0) + .Broadcast(user_op::OpArg("weight", 0)) + .Split(user_op::OpArg("x0", 0), 0) + .Broadcast(user_op::OpArg("bias", 0)) + .Split(user_op::OpArg("matmul_result", 0), 0) + .Split(user_op::OpArg("out", 0), 0) + .Build(); + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionOp::InferDataType(user_op::InferContext* ctx) { + *ctx->OutputDType("out", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("matmul_result", 0) = ctx->InputDType("x", 0); + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionV1GradOp::InferLogicalTensorDesc( + user_op::InferContext* ctx) { + const Shape& x0_shape = ctx->InputShape("x0", 0); + const Shape& weight_shape = ctx->InputShape("weight", 0); + *ctx->OutputShape("dx0", 0) = x0_shape; + *ctx->OutputShape("dw", 0) = weight_shape; + *ctx->OutputShape("dx", 0) = x0_shape; + *ctx->OutputShape("dbias", 0) = Shape({x0_shape.At(1)}); + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionV1GradOp::InferPhysicalTensorDesc( + user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe FusedCrossFeatureInteractionV1GradOp::GetSbp(user_op::SbpContext* ctx) { + ctx->NewBuilder() + .Split(user_op::OpArg("dy", 0), 0) + .Broadcast(user_op::OpArg("weight", 0)) + .Split(user_op::OpArg("x", 0), 0) + .Split(user_op::OpArg("x0", 0), 0) + .Split(user_op::OpArg("matmul_result", 0), 0) + .Split(user_op::OpArg("dx0", 0), 0) + .PartialSum(user_op::OpArg("dw", 0)) + .Split(user_op::OpArg("dx", 0), 0) + .PartialSum(user_op::OpArg("dbias", 0)) + .Build(); + + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionV1GradOp::InferDataType( + user_op::InferContext* ctx) { + *ctx->OutputDType("dx0", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("dw", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("dx", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("dbias", 0) = ctx->InputDType("x", 0); + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionV2GradOp::InferLogicalTensorDesc( + user_op::InferContext* ctx) { + const Shape& x0_shape = ctx->InputShape("x0", 0); + const Shape& weight_shape = ctx->InputShape("weight", 0); + *ctx->OutputShape("dx0", 0) = x0_shape; + *ctx->OutputShape("dw", 0) = weight_shape; + *ctx->OutputShape("dx", 0) = x0_shape; + *ctx->OutputShape("dbias", 0) = Shape({x0_shape.At(1)}); + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionV2GradOp::InferPhysicalTensorDesc( + user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe FusedCrossFeatureInteractionV2GradOp::GetSbp(user_op::SbpContext* ctx) { + ctx->NewBuilder() + .Split(user_op::OpArg("dy", 0), 0) + .Broadcast(user_op::OpArg("weight", 0)) + .Broadcast(user_op::OpArg("bias", 0)) + .Split(user_op::OpArg("x", 0), 0) + .Split(user_op::OpArg("x0", 0), 0) + .Split(user_op::OpArg("matmul_result", 0), 0) + .Split(user_op::OpArg("dx0", 0), 0) + .PartialSum(user_op::OpArg("dw", 0)) + .Split(user_op::OpArg("dx", 0), 0) + .PartialSum(user_op::OpArg("dbias", 0)) + .Build(); + + return Maybe::Ok(); +} + +/* static */ Maybe FusedCrossFeatureInteractionV2GradOp::InferDataType( + 
user_op::InferContext* ctx) { + *ctx->OutputDType("dx0", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("dw", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("dx", 0) = ctx->InputDType("x", 0); + *ctx->OutputDType("dbias", 0) = ctx->InputDType("x", 0); + return Maybe::Ok(); +} + +REGISTER_USER_OP_GRAD("fused_cross_feature_interaction") + .SetGenBackwardOpConfFn([](const user_op::UserOpWrapper& op, + const user_op::AddOpFn& AddOp) -> Maybe { + user_op::UserOpConfWrapperBuilder builder(op.op_name() + "_grad"); + if (op.attr("interaction_mode") == "vector") { + builder.Op("fused_cross_feature_interaction_v1_grad") + .Input("dy", op.GetGradTensorWithOpOutput("out", 0)) + .Input("weight", op.input("weight", 0)) + .Input("x", op.input("x", 0)) + .Input("x0", op.input("x0", 0)) + .Input("matmul_result", op.output("matmul_result", 0)); + } else if (op.attr("interaction_mode") == "matrix") { + builder.Op("fused_cross_feature_interaction_v2_grad") + .Input("dy", op.GetGradTensorWithOpOutput("out", 0)) + .Input("weight", op.input("weight", 0)) + .Input("bias", op.input("bias", 0)) + .Input("x", op.input("x", 0)) + .Input("x0", op.input("x0", 0)) + .Input("matmul_result", op.output("matmul_result", 0)); + } else { + UNIMPLEMENTED(); + } + builder.Output("dx", 0).Output("dw", 0).Output("dx0", 0).Output("dbias", 0); + auto grad_op = builder.Build(); + AddOp(grad_op); + if (op.NeedGenGradTensor4OpInput("x", 0)) { + op.BindGradTensorWithOpInput(grad_op.output("dx", 0), "x", 0); + } + if (op.NeedGenGradTensor4OpInput("weight", 0)) { + op.BindGradTensorWithOpInput(grad_op.output("dw", 0), "weight", 0); + } + if (op.NeedGenGradTensor4OpInput("x0", 0)) { + op.BindGradTensorWithOpInput(grad_op.output("dx0", 0), "x0", 0); + } + if (op.NeedGenGradTensor4OpInput("bias", 0)) { + op.BindGradTensorWithOpInput(grad_op.output("dbias", 0), "bias", 0); + } + return Maybe::Ok(); + }); + +} // namespace oneflow diff --git a/oneflow/user/ops/fused_matmul_bias_add_relu_dropout_op.cpp b/oneflow/user/ops/fused_matmul_bias_add_relu_dropout_op.cpp new file mode 100644 index 00000000000..c473ba7ea57 --- /dev/null +++ b/oneflow/user/ops/fused_matmul_bias_add_relu_dropout_op.cpp @@ -0,0 +1,263 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/op_generated.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kAuxReluLdAlignRequirement = 128; + +long AlignReluAuxLd(long aux_ld) { + /* + ReLu bit-mask matrix leading dimension in elements. + Must be divisible by 128 and be no less than the number of rows in the output matrix. 
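+  For example, an aux_ld of 1000 is rounded up to ((1000 + 127) / 128) * 128 = 1024.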
+ */ + long old_aux_ld = aux_ld; + return ((old_aux_ld + kAuxReluLdAlignRequirement - 1) / kAuxReluLdAlignRequirement) + * kAuxReluLdAlignRequirement; +} + +Maybe InferTensorDesc4FusedMatmul(user_op::InferContext* ctx) { + const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("x", 0); + int32_t weight_size = ctx->input_size("weights"); + int32_t bias_size = ctx->input_size("biases"); + CHECK_EQ_OR_RETURN(weight_size, bias_size) << "Weight num should be equal to bias num. "; + /* + A: (m, k) + B: (n, k) need transpose + C: (m, n) + */ + int64_t m = 0, n = 0, k = 0, cublas_aux_ld = 0; + m = x_desc.shape().At(0); + k = x_desc.shape().At(1); + + for (int32_t idx = 0; idx < weight_size; idx++) { + // skip first input weight. + const user_op::TensorDesc& weight_desc = ctx->InputTensorDesc("weights", idx); + const user_op::TensorDesc& bias_desc = ctx->InputTensorDesc("biases", idx); + CHECK_EQ_OR_RETURN(weight_desc.shape().NumAxes(), 2) << "Weight's ndim should be equal to 2. "; + CHECK_EQ_OR_RETURN(bias_desc.shape().NumAxes(), 1) << "Bias's ndim should be equal to 1. "; + + n = weight_desc.shape().At(0); + CHECK_EQ_OR_RETURN(bias_desc.shape().At(0), n) + << "Bias shape should be equal to N. Assume (M, K) matmul (N, K, transpose_b=True) " + "bias_add (N, ). "; + CHECK_EQ_OR_RETURN(weight_desc.shape().At(1), k) + << "Weight shape should be equal to K. Assume (M, K) matmul (N, K, transpose_b=True) " + "bias_add (N, ). "; + + cublas_aux_ld = n; + // Set Middle result shape. + long cublas_aligned_aux_ld = AlignReluAuxLd(cublas_aux_ld); + int64_t aux_size = cublas_aligned_aux_ld / 32; // Cause we use int32_t as dtype + *ctx->OutputShape("cublas_aux", idx) = Shape({m, aux_size}); + *ctx->OutputShape("hidden", idx) = Shape({m, n}); + // Set for next layer. + k = n; + } + *ctx->OutputShape("out", 0) = {m, n}; + return Maybe::Ok(); +} + +Maybe InferDataType4Matmul(user_op::InferContext* ctx) { + const user_op::TensorDesc& first_in_desc = ctx->InputTensorDesc("x", 0); + + for (const auto& in_arg_pair : ctx->inputs()) { + const user_op::TensorDesc& in_desc = + ctx->InputTensorDesc(in_arg_pair.first, in_arg_pair.second); + CHECK_EQ_OR_RETURN(in_desc.data_type(), first_in_desc.data_type()) + << "The Input's datatype should be equal. 
"; + } + + user_op::TensorDesc* out_desc = ctx->OutputTensorDesc("out", 0); + *out_desc->mut_data_type() = first_in_desc.data_type(); + + for (int32_t i = 0; i < ctx->output_size("hidden"); i++) { + user_op::TensorDesc* hidden_desc = ctx->OutputTensorDesc("hidden", i); + *hidden_desc->mut_data_type() = first_in_desc.data_type(); + } + + for (int32_t i = 0; i < ctx->output_size("cublas_aux"); i++) { + user_op::TensorDesc* aux_desc = ctx->OutputTensorDesc("cublas_aux", i); + *aux_desc->mut_data_type() = DataType::kInt32; + } + + return Maybe::Ok(); +} + +} // namespace + +/* static */ Maybe FusedMatmulBiasAddReluDropoutOp::InferLogicalTensorDesc( + user_op::InferContext* ctx) { + return InferTensorDesc4FusedMatmul(ctx); +} + +/*static*/ Maybe FusedMatmulBiasAddReluDropoutOp::InferPhysicalTensorDesc( + user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe FusedMatmulBiasAddReluDropoutOp::GetSbp(user_op::SbpContext* ctx) { + auto builder = ctx->NewBuilder().Split(user_op::OpArg("x", 0), 0); + for (int i = 0; i < ctx->user_op_conf().input_size("weights"); ++i) { + builder.Broadcast(user_op::OpArg("weights", i)); + } + for (int i = 0; i < ctx->user_op_conf().input_size("biases"); ++i) { + builder.Broadcast(user_op::OpArg("biases", i)); + } + for (int i = 0; i < ctx->user_op_conf().output_size("cublas_aux"); ++i) { + builder.Split(user_op::OpArg("cublas_aux", i), 0); + } + for (int i = 0; i < ctx->user_op_conf().output_size("hidden"); ++i) { + builder.Split(user_op::OpArg("hidden", i), 0); + } + builder.Split(user_op::OpArg("out", 0), 0); + builder.Build(); + return Maybe::Ok(); +} + +/* static */ Maybe FusedMatmulBiasAddReluDropoutOp::InferDataType( + user_op::InferContext* ctx) { + return InferDataType4Matmul(ctx); +} + +REGISTER_USER_OP_GRAD("fused_matmul_bias_add_relu_dropout") + .SetGenBackwardOpConfFn([](const user_op::UserOpWrapper& op, + const user_op::AddOpFn& AddOp) -> Maybe { + bool skip_final_activation = op.attr("skip_final_activation"); + const std::vector dropout_rate_list = op.attr>("dropout_rate_list"); + float scale = 1.0; + float rate = 0.0; + int64_t weight_num = op.input_size("weights"); + + std::string last_bias_grad; + if (!skip_final_activation || (dropout_rate_list[weight_num - 1] != 0.0f)) { + // step1: Get last layer's relu+dropout grad. + rate = dropout_rate_list[weight_num - 1]; + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + user_op::UserOpConfWrapperBuilder relu_grad_builder(op.op_name() + + "fused_relu_dropout_grad"); + user_op::UserOpConfWrapper relu_dropout_grad_op = + relu_grad_builder.Op("fused_relu_dropout_grad") + .Input("dy", op.GetGradTensorWithOpOutput("out", 0)) + .Input("mask", op.output("cublas_aux", weight_num - 1)) + .Attr("scale", scale) + .Output("dx") + .Build(); + AddOp(relu_dropout_grad_op); + last_bias_grad = relu_dropout_grad_op.output("dx", 0); + } else { + last_bias_grad = op.GetGradTensorWithOpOutput("out", 0); + } + + // step2: Get last layer's bias grad. 
+ std::vector reduce_axes_vec{0}; + user_op::UserOpConfWrapperBuilder bias_grad_builder(op.op_name() + "_bias_grad"); + user_op::UserOpConfWrapper bias_grad_op = bias_grad_builder.Op("reduce_sum") + .Input("input_tensor", last_bias_grad) + .Output("output_tensor") + .Attr("axis", reduce_axes_vec) + .Attr("keepdims", false) + .Build(); + AddOp(bias_grad_op); + if (op.NeedGenGradTensor4OpInput("biases", weight_num - 1)) { + op.BindGradTensorWithOpInput(bias_grad_op.output("output_tensor", 0), "biases", + weight_num - 1); + } + std::string cublas_dy = last_bias_grad; + + for (int32_t hidden_layer_idx = weight_num - 1; hidden_layer_idx > 0; hidden_layer_idx--) { + rate = dropout_rate_list[hidden_layer_idx - 1]; + scale = 1.0; + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + user_op::UserOpConfWrapperBuilder cublas_bias_add_relu_matmul_grad_builder( + op.op_name() + "_cublas_bias_add_relu_matmul_grad_" + std::to_string(hidden_layer_idx)); + user_op::UserOpConfWrapper cublas_bias_add_relu_matmul_grad_op = + cublas_bias_add_relu_matmul_grad_builder.Op("cublas_bias_add_relu_matmul_grad") + .Input("dy", cublas_dy) + .Input("weight", op.input("weights", hidden_layer_idx)) + .Input("aux", op.output("cublas_aux", hidden_layer_idx - 1)) + .Attr("alpha", scale) + .Output("d_grad") + .Output("d_bias") + .Build(); + AddOp(cublas_bias_add_relu_matmul_grad_op); + if (op.NeedGenGradTensor4OpInput("biases", hidden_layer_idx - 1)) { + op.BindGradTensorWithOpInput(cublas_bias_add_relu_matmul_grad_op.output("d_bias", 0), + "biases", + hidden_layer_idx - 1); // previous layers bias grad + } + + user_op::UserOpConfWrapperBuilder matmul_weight_grad_builder( + op.op_name() + "_matmul_a_grad_" + std::to_string(hidden_layer_idx)); + user_op::UserOpConfWrapper matmul_weight_grad_op = + matmul_weight_grad_builder.Op("matmul") + .Input("a", cublas_dy) + .Input("b", op.output("hidden", hidden_layer_idx - 1)) + .Output("out") + .Attr("transpose_a", true) + .Attr("transpose_b", false) + .Attr("alpha", 1.0) + .Build(); + AddOp(matmul_weight_grad_op); + if (op.NeedGenGradTensor4OpInput("weights", hidden_layer_idx)) { + op.BindGradTensorWithOpInput(matmul_weight_grad_op.output("out", 0), "weights", + hidden_layer_idx); + } + // update dgrad + cublas_dy = cublas_bias_add_relu_matmul_grad_op.output("d_grad", 0); + } + + // For the first layer, we need to use 2 matmul to get grads. 
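+        // Shape sketch (illustrative dims only): with x of shape (m, k) and weights[0]
+        // of shape (n, k), last_dy below is (m, n), so
+        //   dx = last_dy matmul weights[0]   : (m, n) x (n, k) -> (m, k)   (no transpose)
+        //   dw = last_dy^T matmul x          : (n, m) x (m, k) -> (n, k)   (transpose_a)
+        // matching the transpose_a / transpose_b attrs on the two matmul ops below.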
+ std::string last_dy = last_bias_grad; + if (weight_num != 1) { last_dy = cublas_dy; } + // dx: + user_op::UserOpConfWrapperBuilder matmul_input_grad_builder(op.op_name() + + "_matmul_input_grad"); + user_op::UserOpConfWrapper matmul_input_grad_op = matmul_input_grad_builder.Op("matmul") + .Input("a", last_dy) + .Input("b", op.input("weights", 0)) + .Output("out") + .Attr("transpose_a", false) + .Attr("transpose_b", false) + .Attr("alpha", 1.0) + .Build(); + AddOp(matmul_input_grad_op); + if (op.NeedGenGradTensor4OpInput("x", 0)) { + op.BindGradTensorWithOpInput(matmul_input_grad_op.output("out", 0), "x", 0); + } + // dw: + user_op::UserOpConfWrapperBuilder matmul_weight_grad_builder(op.op_name() + + "_matmul_input_weight_grad"); + user_op::UserOpConfWrapper matmul_weight_grad_op = matmul_weight_grad_builder.Op("matmul") + .Input("a", last_dy) + .Input("b", op.input("x", 0)) + .Output("out") + .Attr("transpose_a", true) + .Attr("transpose_b", false) + .Attr("alpha", 1.0) + .Build(); + AddOp(matmul_weight_grad_op); + if (op.NeedGenGradTensor4OpInput("weights", 0)) { + op.BindGradTensorWithOpInput(matmul_weight_grad_op.output("out", 0), "weights", 0); + } + + return Maybe::Ok(); + }); + +} // namespace oneflow diff --git a/oneflow/user/ops/fused_relu_dropout_grad_op.cpp b/oneflow/user/ops/fused_relu_dropout_grad_op.cpp new file mode 100644 index 00000000000..14101dd16c5 --- /dev/null +++ b/oneflow/user/ops/fused_relu_dropout_grad_op.cpp @@ -0,0 +1,61 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/data_type.pb.h" +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/maybe.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/infer_util.h" +#include "oneflow/core/framework/op_generated.h" + +namespace oneflow { + +namespace { + +Maybe InferTensorDesc4FusedReluDropoutGrad(user_op::InferContext* ctx) { + *ctx->OutputShape("dx", 0) = ctx->InputShape("dy", 0); + return Maybe::Ok(); +} + +Maybe InferDataType4FusedReluDropoutGrad(user_op::InferContext* ctx) { + *ctx->OutputDType("dx", 0) = ctx->InputDType("dy", 0); + return Maybe::Ok(); +} + +} // namespace + +/* static */ Maybe FusedReluDropoutGradOp::InferLogicalTensorDesc( + user_op::InferContext* ctx) { + return InferTensorDesc4FusedReluDropoutGrad(ctx); +} + +/*static*/ Maybe FusedReluDropoutGradOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe FusedReluDropoutGradOp::GetSbp(user_op::SbpContext* ctx) { + ctx->NewBuilder() + .Split(user_op::OpArg("dy", 0), 0) + .Split(user_op::OpArg("mask", 0), 0) + .Split(user_op::OpArg("dx", 0), 0) + .Build(); + return Maybe::Ok(); +} + +/* static */ Maybe FusedReluDropoutGradOp::InferDataType(user_op::InferContext* ctx) { + return InferDataType4FusedReluDropoutGrad(ctx); +} + +} // namespace oneflow diff --git a/oneflow/user/ops/gather_op.cpp b/oneflow/user/ops/gather_op.cpp index 87ded29ab9c..34fd62b74d5 100644 --- a/oneflow/user/ops/gather_op.cpp +++ b/oneflow/user/ops/gather_op.cpp @@ -23,7 +23,8 @@ namespace oneflow { CHECK_GT_OR_RETURN(in.shape().NumAxes(), 0); const int64_t axis = ctx->Attr("axis"); const user_op::TensorDesc& indices = ctx->InputTensorDesc("indices", 0); - CHECK_GT_OR_RETURN(indices.shape().NumAxes(), 0); + // For 0-dim Tensor + CHECK_GE_OR_RETURN(indices.shape().NumAxes(), 0); // NOLINT user_op::TensorDesc* out = ctx->OutputTensorDesc("out", 0); DimVector dim_vec; diff --git a/oneflow/user/ops/image_preprocess_ops.cpp b/oneflow/user/ops/image_preprocess_ops.cpp index 6e72ccda1a5..00c6d419c8b 100644 --- a/oneflow/user/ops/image_preprocess_ops.cpp +++ b/oneflow/user/ops/image_preprocess_ops.cpp @@ -156,8 +156,9 @@ namespace oneflow { const Shape logical_shape = Shape({batch_size}); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); } diff --git a/oneflow/user/ops/matmul_op.cpp b/oneflow/user/ops/matmul_op.cpp index 4e73d7e3e35..9c10d7538fe 100644 --- a/oneflow/user/ops/matmul_op.cpp +++ b/oneflow/user/ops/matmul_op.cpp @@ -222,9 +222,53 @@ void GenBackwardOpConf4Matmul(const std::string& op_type_name, const user_op::Us if (ctx->user_op_conf().has_input("_add_to_output", 0)) { out_and_add_to_output_args.emplace_back("_add_to_output", 0); } - FOR_RANGE(int64_t, i, 0, a_tensor.shape().NumAxes() - 2) { + int32_t num_axes = a_tensor.shape().NumAxes(); + FOR_RANGE(int64_t, i, 0, num_axes - 2) { ctx->NewBuilder().Split(ctx->inputs(), i).Split(out_and_add_to_output_args, i).Build(); } + int32_t m_axis = -1; + int32_t k_a_axis = -1; + int32_t k_b_axis = -1; + int32_t n_axis = -1; + if (ctx->Attr("transpose_a")) { + m_axis = num_axes - 1; + k_a_axis = 
num_axes - 2; + } else { + m_axis = num_axes - 2; + k_a_axis = num_axes - 1; + } + if (ctx->Attr("transpose_b")) { + k_b_axis = num_axes - 1; + n_axis = num_axes - 2; + } else { + k_b_axis = num_axes - 2; + n_axis = num_axes - 1; + } + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), m_axis) + .Broadcast(user_op::OpArg("b", 0)) + .Split(out_and_add_to_output_args, num_axes - 2) + .Build(); + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .Split(user_op::OpArg("b", 0), n_axis) + .Split(out_and_add_to_output_args, num_axes - 1) + .Build(); + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), k_a_axis) + .Split(user_op::OpArg("b", 0), k_b_axis) + .PartialSum(out_and_add_to_output_args) + .Build(); + ctx->NewBuilder() + .PartialSum(user_op::OpArg("a", 0)) + .Broadcast(user_op::OpArg("b", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .PartialSum(user_op::OpArg("b", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); return Maybe::Ok(); } diff --git a/oneflow/user/ops/randperm_op.cpp b/oneflow/user/ops/randperm_op.cpp index c7e83402b86..aa6103a2f0d 100644 --- a/oneflow/user/ops/randperm_op.cpp +++ b/oneflow/user/ops/randperm_op.cpp @@ -39,8 +39,9 @@ namespace oneflow { int32_t n = ctx->Attr("n"); const Shape& logical_shape = Shape({n}); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; diff --git a/oneflow/user/ops/slice_op.cpp b/oneflow/user/ops/slice_op.cpp index fac6d0ed57c..482118b253d 100644 --- a/oneflow/user/ops/slice_op.cpp +++ b/oneflow/user/ops/slice_op.cpp @@ -154,13 +154,21 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { } /*static*/ Maybe LogicalSliceAssignOp::GetSbp(user_op::SbpContext* ctx) { - const user_op::TensorDesc& ref_desc = ctx->LogicalTensorDesc4InputArgNameAndIndex("ref", 0); - FOR_RANGE(int64_t, axis, 0, ref_desc.shape().NumAxes()) { + const Shape& x_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("ref", 0).shape(); + const int64_t ndim = x_shape.NumAxes(); + const auto& start_vec = ctx->Attr>("start"); + const auto& stop_vec = ctx->Attr>("stop"); + const auto& step_vec = ctx->Attr>("step"); + FOR_RANGE(int64_t, axis, 0, ndim) { ctx->NewBuilder() .Split(user_op::OpArg("ref", 0), axis) .Broadcast(user_op::OpArg("value", 0)) .Split(user_op::OpArg("y", 0), axis) .Build(); + // FullSlice support S+S->S + if (IsFullSlice(start_vec[axis], stop_vec[axis], step_vec[axis], x_shape.At(axis))) { + ctx->NewBuilder().Split(ctx->inputs(), axis).Split(ctx->outputs(), axis).Build(); + } } ctx->NewBuilder() .PartialSum(user_op::OpArg("ref", 0)) @@ -260,6 +268,7 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { ctx->NewBuilder().PartialSum(ctx->inputs()).PartialSum(ctx->outputs()).Build(); return Maybe::Ok(); } + /*static*/ Maybe SliceUpdateOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const auto& x_desc = ctx->InputTensorDesc("x", 0); const int64_t ndim = x_desc.shape().NumAxes(); diff --git a/oneflow/user/ops/stack_op.cpp b/oneflow/user/ops/stack_op.cpp index 254cbcd1743..1dd129081bd 100644 --- a/oneflow/user/ops/stack_op.cpp +++ b/oneflow/user/ops/stack_op.cpp 
@@ -144,8 +144,6 @@ Maybe GenGradOp(const user_op::UserOpWrapper& op, const user_op::AddOpFn& /*static*/ Maybe StackGradOp::GetSbp(user_op::SbpContext* ctx) { const auto axis = ctx->Attr("axis"); - const int64_t in_num_axes = - ctx->LogicalTensorDesc4InputArgNameAndIndex("in", 0).shape().NumAxes(); const int64_t like_num_axes = ctx->LogicalTensorDesc4InputArgNameAndIndex("like", 0).shape().NumAxes(); FOR_RANGE(int64_t, i, 0, like_num_axes) { diff --git a/oneflow/user/utils/pool_util.h b/oneflow/user/utils/pool_util.h index 9a21f8a9129..4deed023f1f 100644 --- a/oneflow/user/utils/pool_util.h +++ b/oneflow/user/utils/pool_util.h @@ -21,8 +21,8 @@ limitations under the License. namespace oneflow { -typedef fixed_vector FixedDimVector; -typedef fixed_vector FixedVector; +typedef small_vector FixedDimVector; +typedef small_vector FixedVector; class Params3D { public: diff --git a/python/oneflow/autoprof/__main__.py b/python/oneflow/autoprof/__main__.py index 13ddf9d7dc1..0a247ae5cac 100644 --- a/python/oneflow/autoprof/__main__.py +++ b/python/oneflow/autoprof/__main__.py @@ -74,6 +74,19 @@ def get_oneflow_gpu_kernel_time(prof) -> Union[str, float]: return round(kernel_gpu_time, 1) +def get_oneflow_gpu_kernel_bandwidth(prof) -> str: + gpu_kernel_items = list( + filter( + lambda x: x.event_type == 1 and x.bandwidth_is_recorded, prof.key_averages() + ) + ) + if len(gpu_kernel_items) == 0: + return "-" + if len(gpu_kernel_items) == 1: + return f"{round(gpu_kernel_items[0].bandwidth, 1)}" + return ", ".join([f"{x.name}: {round(x.bandwidth, 1)}" for x in gpu_kernel_items]) + + def get_pytorch_cpu_end_to_end_time(prof) -> float: total = get_sole_value( filter(lambda x: x.key == auto_profiler.END_TO_END, prof.key_averages()) @@ -92,7 +105,9 @@ def get_oneflow_cpu_end_to_end_time(prof) -> float: def print_summary_from_csv() -> None: print("----------------------------------------------------------------------") - print('Summary ("KT" means "Kernel Time", "ET" means "End-to-end Time"):') + print( + 'Summary ("KT" means "Kernel Time", "ET" means "End-to-end Time", in microseconds; "BW" means "Bandwidth" in GB/s):' + ) with open(csv_filename, "r") as f: table: PrettyTable = prettytable.from_csv(f) table.field_names = [ @@ -100,15 +115,17 @@ def print_summary_from_csv() -> None: "Args", "Lib", "KT(GPU)", + "BW(GPU)", "KT(1 CPU)", "ET(1 CPU)", "KT(32 CPU)", "ET(32 CPU)", "Desc", ] + table.del_column("Desc") for row in table.rows: row[2] = {"PyTorch": "PT", "OneFlow": "OF"}[row[2]] - table.del_column("Desc") + print(table) @@ -129,6 +146,7 @@ def print_summary_from_csv() -> None: "Args", "Library", "Kernel Time (us, GPU)", + "Kernel Bandwidth (GB/s, GPU)", "Kernel Time (us, 1 CPU)", "End-to-end Time (us, 1 CPU)", "Kernel Time (us, 32 CPUs)", @@ -153,6 +171,7 @@ def add_row(profs): args_description, "OneFlow", get_oneflow_gpu_kernel_time(profs[0]), + get_oneflow_gpu_kernel_bandwidth(profs[0]), get_oneflow_cpu_kernel_time(profs[1]), get_oneflow_cpu_end_to_end_time(profs[1]), get_oneflow_cpu_kernel_time(profs[2]), @@ -166,6 +185,7 @@ def add_row(profs): args_description, "PyTorch", get_pytorch_gpu_kernel_time(profs[3]), + "-", get_pytorch_cpu_kernel_time(profs[4]), get_pytorch_cpu_end_to_end_time(profs[4]), get_pytorch_cpu_kernel_time(profs[5]), diff --git a/python/oneflow/framework/multi_client_session.py b/python/oneflow/framework/multi_client_session.py index 72c6e093779..64a82c12b27 100644 --- a/python/oneflow/framework/multi_client_session.py +++ b/python/oneflow/framework/multi_client_session.py @@ -124,4 
+124,7 @@ def update_resource_eagerly(self, resource_config): self._session_ctx.update_resource(config_proto_str) def __del__(self): + if self._env.is_shutting_down(): + # After python shutting down, it's not safe to call oneflow + return self._TryClose() diff --git a/python/oneflow/framework/sysconfig.py b/python/oneflow/framework/sysconfig.py index ab20c63bc62..b9fa1c7cbd8 100644 --- a/python/oneflow/framework/sysconfig.py +++ b/python/oneflow/framework/sysconfig.py @@ -55,7 +55,6 @@ def get_liboneflow_link_flags() -> List[str]: return [ f"-L{oneflow_python_libs_path}", f"-l:oneflow", - f"-l:of_pyext_obj", f"-l:of_protoobj", ] diff --git a/python/oneflow/framework/tensor.py b/python/oneflow/framework/tensor.py index a94c0d1d629..8c97c973596 100755 --- a/python/oneflow/framework/tensor.py +++ b/python/oneflow/framework/tensor.py @@ -1139,6 +1139,10 @@ def _cumprod(self, dim, dtype=None): def RegisterMethods(): Tensor.ndim = property(_ndim) Tensor.numpy = _numpy + Tensor.add = _add + Tensor.add_ = _add_inplace + Tensor.sub = _sub + Tensor.sub_ = _sub_inplace Tensor.backward = _backward Tensor.__setitem__ = _setitem Tensor.__str__ = _str @@ -1164,61 +1168,24 @@ def RegisterMethods(): Tensor._meta_repr = _meta_repr Tensor.argsort = _argsort Tensor.argwhere = _argwhere - Tensor.add = _add - Tensor.add_ = _add_inplace - Tensor.clamp = _clamp - Tensor.clamp_ = _clamp_ - Tensor.clip = _clip - Tensor.clip_ = _clip_ - Tensor.cpu = _cpu - Tensor.cuda = _cuda Tensor.expand = _expand Tensor.expand_as = _expand_as - Tensor.flatten = _flatten Tensor.flip = _flip - Tensor.in_top_k = _in_top_k - Tensor.index_select = _index_select - Tensor.minimum = _minimum - Tensor.maximum = _maximum Tensor.new_empty = _new_empty Tensor.new_ones = _new_ones Tensor.new_zeros = _new_zeros - Tensor.pow = _pow - Tensor.var = _var - Tensor.std = _std - Tensor.softplus = _softplus - Tensor.tril = _tril - Tensor.triu = _triu Tensor.where = _where Tensor.norm = _norm Tensor.local_to_global = _local_to_global Tensor.global_to_global = _global_to_global Tensor.to_global = _to_global - Tensor.relu = _relu - Tensor.relu_ = _relu_inplace - Tensor.softmax = _softmax - Tensor.log_softmax = _log_softmax - Tensor.roll = _roll - Tensor.chunk = _chunk Tensor.repeat = _repeat Tensor.repeat_interleave = _repeat_interleave Tensor.tile = _tile Tensor.split = _split - Tensor.unbind = _unbind - Tensor.squeeze = _squeeze - Tensor.swapaxes = _swapaxes - Tensor.amax = _amax - Tensor.swapdims = _swapdims - Tensor.unfold = _unfold - Tensor.narrow = _narrow - Tensor.unsqueeze = _unsqueeze Tensor.to = _to - Tensor.half = _half Tensor.gather = _gather - Tensor.all = _all - Tensor.any = _any Tensor.T = property(_T) - Tensor.masked_fill = _masked_fill Tensor.masked_select = _masked_select Tensor.eq = _eq Tensor.item = _item @@ -1230,11 +1197,6 @@ def RegisterMethods(): Tensor.topk = _topk Tensor.nms = _nms Tensor.nonzero = _nonzero - Tensor.max = _max - Tensor.min = _min - Tensor.median = _median - Tensor.sum = _sum - Tensor.mean = _mean Tensor.prod = _prod Tensor.is_consistent = _is_consistent Tensor.to_consistent = _to_consistent diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py index 3c9fba9e34e..119a121058e 100644 --- a/python/oneflow/nn/graph/graph.py +++ b/python/oneflow/nn/graph/graph.py @@ -525,7 +525,7 @@ def _shallow_repr(self): return shallow_repr def _ops_repr(self): - r"""Generate this graph's operators' string representation + r"""Generate this graph's operators' string representation """ if 
self._is_compiled: conf = self._graph_proto.module_name2module_conf[ @@ -898,10 +898,9 @@ def __build_graph(self, *args, **kwargs): ) enable_mlir_inference_opt = False del os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] - if enable_mlir_inference_opt: - oneflow._oneflow_internal.FillVariableTensorMgr( - state_op_names, self._state_tensor_tuple - ) + oneflow._oneflow_internal.FillVariableTensorMgr( + state_op_names, self._state_tensor_tuple + ) # Complete the graph job proto oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete() # Save full graph job proto after job Complete for find real output blob shape and build it. @@ -941,12 +940,11 @@ def __build_graph(self, *args, **kwargs): self._c_nn_graph.register_output_op_names_and_tensors( output_op_names, self._outputs_tensor_tuple ) - if enable_mlir_inference_opt: - ( - state_op_names, - state_tensors, - ) = oneflow._oneflow_internal.DumpVariableTensorMgr() - self._state_tensor_tuple = convert_to_tensor_tuple(state_tensors) + ( + state_op_names, + state_tensors, + ) = oneflow._oneflow_internal.DumpVariableTensorMgr() + self._state_tensor_tuple = convert_to_tensor_tuple(state_tensors) self._c_nn_graph.register_variable_op_names_and_tensors( state_op_names, self._state_tensor_tuple @@ -1354,6 +1352,7 @@ def __del__(self): # So it's safe to skip sync here. return oneflow._oneflow_internal.eager.Sync() + oneflow._oneflow_internal.ClearVariableTensorMgr() def __ensure_input_tensors_contiguous(self, *args, **kwargs): args_tree = ArgsTree((args, kwargs), False) diff --git a/python/oneflow/nn/graph/graph_config.py b/python/oneflow/nn/graph/graph_config.py index dfb3795b3ac..ea48ad8d957 100644 --- a/python/oneflow/nn/graph/graph_config.py +++ b/python/oneflow/nn/graph/graph_config.py @@ -17,6 +17,7 @@ from collections import OrderedDict +import oneflow.boxing.nccl as nccl_config from oneflow.nn.graph.optimizer import OptDict import oneflow.core.job.job_conf_pb2 as job_conf_pb @@ -45,24 +46,51 @@ def training(self): return False raise NotImplementedError - def set_outputs_buffer_size(self, value: int = 2): - r"""Set the outputs buffer size of ``nn.Graph``. + def enable_amp(self, mode: bool = True): + r"""If set to true, then graph will use mixed precision mode, it means use both float16 and float32 during model training. - When graph's outputs buffer size is greater than 2, multiple call on the graph can work like a pipeline. This makes multiple call takes less time. + For example: - The default outputs buffer size is 2. + .. code-block:: python - # TODO (lixiang): Explain the meaning of the size of buffer size and add sample code. - # The size of the buffer size indicates the maximum number of iterations that the output of the Graph and the Graph actually executed asynchronously can overlap. - # If the buffer size is 1, there is no pipeline. A size of 2 means that it can execute 1 iter ahead of time. A size of 3 means that two iters can be executed ahead of time. + import oneflow as flow + + class Graph(flow.nn.Graph): + def __init__(self): + super().__init__() + self.linear = flow.nn.Linear(3, 8, False) + self.config.enable_amp(True) # Use mixed precision mode. + def build(self, x): + return self.linear(x) + + graph = Graph() Args: - value (int): graph ouputs buffer size. + mode (bool, optional): The default vaule is True. 
+ """ - self._outputs_buffer_size = value + assert type(mode) is bool + self.proto.enable_auto_mixed_precision = mode - def enable_amp(self, mode: bool = True): - r"""If set to true, then graph will use mixed precision mode, it means use both float16 and float32 during model training. + def set_zero_redundancy_optimizer_mode(self, mode: str = "distributed_split"): + raise RuntimeError( + "`set_zero_redundancy_optimizer_mode` has been changed to `enable_zero`, please use `enable_zero(True)` to activate ZeRO optimization." + ) + + def enable_zero( + self, + mode: bool = True, + *, + stage: int = 2, + shard_min_size: int = 1024, + shard_restore_level: int = 1, + ): + r"""Enable ZeRO redundancy optimizer. + + This optimzation will reduce optimizer states memory consumption as described + by ZeRO https://arxiv.org/abs/1910.02054 . + + The default zero stage is 2. For example: @@ -74,17 +102,36 @@ class Graph(flow.nn.Graph): def __init__(self): super().__init__() self.linear = flow.nn.Linear(3, 8, False) - self.config.enable_amp(True) # Use mixed precision mode. + self.config.enable_zero() def build(self, x): return self.linear(x) graph = Graph() Args: - mode (bool, optional): The default vaule is True. + mode (bool): if set to true, optimizer states of Data Parallel will be sharded across devices. + stage (int): optimization stage, range from 1 to 3. + shard_min_size (int): min size of a shard of an optimizer state. + shard_restore_level (int): level to restore sharded parameter to whole parameter for consumer operators, level 0 is no restore, level 1 is soft restore, level 2 is hard restore. Note that this paremeter is at pre-alpha stage. """ - assert type(mode) is bool - self.proto.enable_auto_mixed_precision = mode + if not mode: + self.proto.optimizer_placement_optimization_mode = "none" + return + assert stage >= 1 and stage <= 3, "ZeRO stage must range form 1 to 3." + assert ( + shard_min_size > 0 + ), "ZeRO min size of a sharded optimizer state must > 0." + assert stage >= 1 and stage <= 3, "ZeRO stage must range form 1 to 3." + if stage >= 1: + self.proto.optimizer_placement_optimization_mode = "distributed_split" + self.proto.optimizer_placement_optimization_threshold = shard_min_size + self.proto.optimizer_placement_optimization_shard_restore_level = ( + shard_restore_level + ) + if stage >= 2: + nccl_config.enable_use_compute_stream(True) + if stage >= 3: + nccl_config.disable_group_boxing_by_dst_parallel(True) def allow_fuse_model_update_ops(self, mode: bool = True): r"""If set to true, try to fuse cast + scale + l1_l2_regularize_gradient + model_update to one op to improve performance. @@ -188,61 +235,23 @@ def build(self, x): """ self.proto.num_gradient_accumulation_steps = value - def set_zero_redundancy_optimizer_mode(self, mode: str = "distributed_split"): - r"""Set mode to remove redundancy of optimizer states. - This optimzation will reduce optimizer states memory consumption as described - by ZeRO https://arxiv.org/abs/1910.02054 . - - For example: - - .. code-block:: python - - import oneflow as flow - - class Graph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.linear = flow.nn.Linear(3, 8, False) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - def build(self, x): - return self.linear(x) - - graph = Graph() - - Args: - mode (str): "distributed_split" or "non_distributed". "distributed_split" mode - will shard each optimizer state across devices. "non_distributed" mode - will place each optimizer state to only one device. 
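# The enable_zero() implementation above amounts to a small cumulative mapping
# from ZeRO stage to configuration. The sketch below only restates that mapping
# for reference; it reuses the proto fields and oneflow.boxing.nccl helpers
# shown in this patch, while the standalone apply_zero_stage() helper itself is
# hypothetical and not part of the API.
import oneflow.boxing.nccl as nccl_config

def apply_zero_stage(proto, stage=2, shard_min_size=1024, shard_restore_level=1):
    # Stage 1 (and above): shard optimizer states across data-parallel ranks.
    proto.optimizer_placement_optimization_mode = "distributed_split"
    proto.optimizer_placement_optimization_threshold = shard_min_size
    proto.optimizer_placement_optimization_shard_restore_level = shard_restore_level
    if stage >= 2:
        # Stage 2 (and above): also run the extra collectives on the compute stream.
        nccl_config.enable_use_compute_stream(True)
    if stage >= 3:
        # Stage 3: additionally disable group boxing by dst parallel.
        nccl_config.disable_group_boxing_by_dst_parallel(True)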
- """ - assert mode in ("distributed_split", "non_distributed") - self.proto.optimizer_placement_optimization_mode = mode - - def set_zero_redundancy_optimizer_min_size_after_split(self, value): - r"""Set the min size of optimizer state/grad/parameter after split. - - For example: - - .. code-block:: python + def set_outputs_buffer_size(self, value: int = 2): + r"""Set the outputs buffer size of ``nn.Graph``. - import oneflow as flow + When graph's outputs buffer size is greater than 2, multiple call on the graph can work like a pipeline. This makes multiple call takes less time. - class Graph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.linear = flow.nn.Linear(3, 8, False) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - def build(self, x): - return self.linear(x) + The default outputs buffer size is 2. - graph = Graph() + # TODO (lixiang): Explain the meaning of the size of buffer size and add sample code. + # The size of the buffer size indicates the maximum number of iterations that the output of the Graph and the Graph actually executed asynchronously can overlap. + # If the buffer size is 1, there is no pipeline. A size of 2 means that it can execute 1 iter ahead of time. A size of 3 means that two iters can be executed ahead of time. Args: - value (int): min size value. + value (int): graph ouputs buffer size. """ assert isinstance(value, int) assert value >= 1 - self.proto.optimizer_placement_optimization_threshold = value + self._outputs_buffer_size = value def enable_cudnn_conv_heuristic_search_algo(self, mode: bool = True): r""" Whether enable cudnn conv operatioin to use heuristic search algorithm. diff --git a/python/oneflow/nn/modules/fused_mlp.py b/python/oneflow/nn/modules/fused_mlp.py index 34ace37aab7..fb117cffcbb 100644 --- a/python/oneflow/nn/modules/fused_mlp.py +++ b/python/oneflow/nn/modules/fused_mlp.py @@ -32,6 +32,10 @@ class FusedMLP(Module): out_features: The final Linear layer hidden size + hidden_dropout_rate: A tuple of each hidden layer's dropout rate + + out_dropout_rate: The final Linear layer's dropout rate + Shape: - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of additional dimensions and :math:`H_{in} = {in\\_features}` @@ -63,6 +67,8 @@ def __init__( in_features: int, hidden_features: Tuple[int], out_features: int, + hidden_dropout_rate: Tuple[float] = None, + out_dropout_rate: float = 0.0, skip_final_activation=False, ) -> None: super().__init__() @@ -72,9 +78,21 @@ def __init__( # TODO(zzk): Add more activation support. self.skip_final_activation = skip_final_activation self.hidden_layer_num = len(hidden_features) - + self.dropout_rate_list = ( + hidden_dropout_rate + if hidden_dropout_rate + else [0.0] * (self.hidden_layer_num) + ) + self.dropout_rate_list += [out_dropout_rate] self.add_parameters() self.reset_parameters() + self.use_dropout = False + for i in range(self.hidden_layer_num + 1): + if self.dropout_rate_list[i] != 0.0: + self.use_dropout = True + break + if not self.training: + self.use_dropout = False def add_parameters(self) -> None: """Register parameter in FusedMLP module. 
@@ -166,10 +184,18 @@ def reset_parameters(self) -> None: flow.nn.init.uniform_(self.bias(layer_idx), -bound, bound) def forward(self, x): - res = flow._C.fused_mlp( - x, self.weights(), self.biases(), self.skip_final_activation - ) - return res + if self.use_dropout: + return flow._C.fused_matmul_bias_add_relu_dropout( + x, + self.weights(), + self.biases(), + self.skip_final_activation, + self.dropout_rate_list, + ) + else: + return flow._C.fused_mlp( + x, self.weights(), self.biases(), self.skip_final_activation + ) def extra_repr(self) -> str: return "in_features={}, hidden_features={}, out_features={}, skip_final_activation={}".format( diff --git a/python/oneflow/nn/modules/sparse.py b/python/oneflow/nn/modules/sparse.py index 0731f7b21b9..b8eb4d50b9a 100644 --- a/python/oneflow/nn/modules/sparse.py +++ b/python/oneflow/nn/modules/sparse.py @@ -160,9 +160,12 @@ def forward(self, indices): flow._C.embedding_renorm_( self.weight, indices, self.max_norm, self.norm_type ) - return flow._C.embedding( - self.weight, indices, self.padding_idx, self.scale_grad_by_freq - ) + if self.padding_idx is None and not self.scale_grad_by_freq: + return flow._C.gather(self.weight, indices, axis=0) + else: + return flow._C.embedding( + self.weight, indices, self.padding_idx, self.scale_grad_by_freq + ) def embedding( @@ -232,7 +235,10 @@ def embedding( with flow.no_grad(): weight = flow._C.embedding_renorm_(weight, input, max_norm, norm_type) - return flow._C.embedding(weight, input, padding_idx, scale_grad_by_freq) + if padding_idx is None and not scale_grad_by_freq: + return flow._C.gather(weight, input, axis=0) + else: + return flow._C.embedding(weight, input, padding_idx, scale_grad_by_freq) if __name__ == "__main__": diff --git a/python/oneflow/test/exceptions/test_array_functor.py b/python/oneflow/test/exceptions/test_array_functor.py index 991b8e90d31..595768957b1 100644 --- a/python/oneflow/test/exceptions/test_array_functor.py +++ b/python/oneflow/test/exceptions/test_array_functor.py @@ -31,7 +31,7 @@ def test_broadcast_like_runtime_error(test_case): like = flow.ones((2, 2, 2), dtype=flow.float32, requires_grad=True) y = flow.broadcast_like(x, like) test_case.assertTrue( - "doesn't match the broadcast shape" in str(context.exception) + "The expanded size of the tensor" in str(context.exception) ) def test_concat_index_error(test_case): diff --git a/python/oneflow/test/exceptions/test_nn_functor.py b/python/oneflow/test/exceptions/test_nn_functor.py new file mode 100644 index 00000000000..33f7bdf0142 --- /dev/null +++ b/python/oneflow/test/exceptions/test_nn_functor.py @@ -0,0 +1,386 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
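# The sparse.py change above routes the common case (padding_idx is None and
# scale_grad_by_freq is False) through a plain gather along axis 0. A minimal
# numerical check of that equivalence, assuming the functional calls used in
# this patch (flow._C.gather and flow._C.embedding):
import numpy as np
import oneflow as flow

weight = flow.tensor(np.arange(12, dtype=np.float32).reshape(4, 3))
indices = flow.tensor([0, 2, 2, 3], dtype=flow.int64)

via_gather = flow._C.gather(weight, indices, axis=0)
via_embedding = flow._C.embedding(weight, indices, None, False)
assert np.array_equal(via_gather.numpy(), via_embedding.numpy())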
+""" +import re +import unittest + +import oneflow as flow +import oneflow.unittest + +from oneflow.test_utils.automated_test_util import * + + +class TestBiasAddError(flow.unittest.TestCase): + def test_bias_add_dimension_match_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + bias = flow.ones((5,), dtype=flow.float32) + out = flow._C.bias_add(x, bias, axis=1) + + test_case.assertTrue( + "The size of tensor x (4,4) must match the size of tensor b (5,) at dimension 1" + in str(ctx.exception) + ) + + def test_bias_add_index_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + bias = flow.ones((5,), dtype=flow.float32) + out = flow._C.bias_add(x, bias, axis=3) + + test_case.assertTrue( + "Dimension out of range (expected to be in range of [-2,1], but got 3)" + in str(ctx.exception) + ) + + +class TestCrossEntropyError(flow.unittest.TestCase): + def test_cross_entropy_reduction_type_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + target = flow.ones((4, 4), dtype=flow.float32) + out = flow._C.cross_entropy(x, target, None, 0, "just_test") + + test_case.assertTrue( + "Reduction should be none, sum or mean." in str(ctx.exception) + ) + + +class TestCTCLossError(flow.unittest.TestCase): + def test_ctcloss_reduction_type_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((5, 2, 3), dtype=flow.float32) + targets = flow.tensor([[1, 2, 2], [1, 2, 2]], dtype=flow.int32) + input_lengths = flow.tensor([5, 5], dtype=flow.int32) + target_lengths = flow.tensor([3, 3], dtype=flow.int32) + max_target_length = 0 + if targets.ndim == 1: + max_target_length = target_lengths.max().item() + elif targets.ndim == 2: + max_target_length = targets.shape[1] + loss = flow._C.ctc_loss( + x, + targets, + input_lengths, + target_lengths, + max_target_length, + blank=0, + zero_infinity=False, + reduction="just_test", + ) + test_case.assertTrue( + "Reduction should be none, sum or mean." in str(ctx.exception) + ) + + +class TestPadError(flow.unittest.TestCase): + def test_pad_size_attribute_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 1), dtype=flow.float32) + out = flow._C.pad(x, (1, 1, 1, 1, 1)) + test_case.assertTrue( + "Pad size should less than or equal to input axes * 2." + in str(ctx.exception) + ) + + def test_pad_size_mod2_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 1), dtype=flow.float32) + out = flow._C.pad(x, (1, 1, 1,)) + + test_case.assertTrue( + "Length of pad must be even but instead it equals 3" in str(ctx.exception) + ) + + def test_reflect_pad_size_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 1, 2, 2), dtype=flow.float32) + out = flow._C.pad(x, (4, 4, 4, 4), mode="reflect") + + test_case.assertTrue( + "padding size should be less than the corresponding input dimension!" + in str(ctx.exception) + ) + + def test_pad_mode_error(test_case): + with test_case.assertRaises(NotImplementedError) as ctx: + x = flow.ones((1, 1, 2, 2), dtype=flow.float32) + out = flow._C.pad(x, (4, 4, 4, 4), mode="test") + + test_case.assertTrue( + "Pad mode is test, but only constant, reflect and replicate are valid." 
+ in str(ctx.exception) + ) + + +class TestFusedMLPError(flow.unittest.TestCase): + def test_fuse_mlp_weight_size_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + bias = flow.ones((4,), dtype=flow.float32) + out = flow._C.fused_mlp(x, [], [bias], False) + + test_case.assertTrue( + "The number of weights should be greater equal than 1" in str(ctx.exception) + ) + + def test_fuse_mlp_weight_bias_size_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + w1 = flow.ones((4, 4), dtype=flow.float32) + w2 = flow.ones((4, 4), dtype=flow.float32) + bias1 = flow.ones((4,), dtype=flow.float32) + out = flow._C.fused_mlp(x, [w1, w2], [bias1], False) + + test_case.assertTrue( + "The number of weights should be equal to biases" in str(ctx.exception) + ) + + def test_fuse_mlp_weight_numaxes_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + w1 = flow.ones((4,), dtype=flow.float32) + bias1 = flow.ones((4,), dtype=flow.float32) + out = flow._C.fused_mlp(x, [w1,], [bias1,], False) + test_case.assertTrue("Weight's dim size should == 2" in str(ctx.exception)) + + def test_fuse_mlp_bias_numaxes_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + w1 = flow.ones((4, 4), dtype=flow.float32) + bias1 = flow.ones((4, 4), dtype=flow.float32) + out = flow._C.fused_mlp(x, [w1,], [bias1,], False) + test_case.assertTrue("Bias's dim size should == 1" in str(ctx.exception)) + + def test_fuse_mlp_bias_first_dim_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + w1 = flow.ones((6, 4), dtype=flow.float32) + bias1 = flow.ones((5), dtype=flow.float32) + out = flow._C.fused_mlp(x, [w1,], [bias1,], False) + + test_case.assertTrue( + "Bias's dim is not equal to weight's first dim." in str(ctx.exception) + ) + + def test_fuse_mlp_weight_second_dim_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((2, 4), dtype=flow.float32) + w1 = flow.ones((3, 6), dtype=flow.float32) + bias1 = flow.ones((3), dtype=flow.float32) + out = flow._C.fused_mlp(x, [w1,], [bias1,], False) + + test_case.assertTrue( + "weight's second dim should be equal to input's second dim." + in str(ctx.exception) + ) + + +class TestL2NormalizeError(flow.unittest.TestCase): + def test_l2normalize_axis_error1(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((3, 3), dtype=flow.float32) + out = flow._C.normalize(x, dim=3, use_l2_norm_kernel=True) + test_case.assertTrue("Axis should < 2 but axis is 3 now." in str(ctx.exception)) + + def test_l2normalize_axis_error2(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((3, 3), dtype=flow.float32) + out = flow._C.normalize(x, dim=-3, use_l2_norm_kernel=True) + test_case.assertTrue( + "Axis should >=0 but axis is -1 now." in str(ctx.exception) + ) + + +class TestLossBaseFunctorError(flow.unittest.TestCase): + def test_loss_base_reduction_type_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + target = flow.ones((4, 4), dtype=flow.float32) + out = flow._C.mse_loss(x, target, "just_test") + + test_case.assertTrue( + "Reduction should be none, sum or mean." 
in str(ctx.exception) + ) + + +class TestMatmulError(flow.unittest.TestCase): + def test_matmul_dimension_error1(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4,), dtype=flow.float32) + w = flow.ones((4, 4), dtype=flow.float32) + out = flow._C.matmul(x, w, False, False, 1.0) + test_case.assertTrue("Tensor a's dim should >= 2" in str(ctx.exception)) + + def test_matmul_dimension_error2(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 4), dtype=flow.float32) + w = flow.ones((4,), dtype=flow.float32) + out = flow._C.matmul(x, w, False, False, 1.0) + test_case.assertTrue("Tensor b's dim should >= 2" in str(ctx.exception)) + + def test_matmul_dimension_error3(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((4, 1, 2, 1), dtype=flow.float32) + w = flow.ones((4, 4, 4), dtype=flow.float32) + out = flow._C.matmul(x, w, False, False, 1.0) + + test_case.assertTrue( + "Not support number of dimensions of a being less than number of dimensions of b!" + in str(ctx.exception) + ) + + +class TestPixelShuffleError(flow.unittest.TestCase): + def test_pixel_shuffle_4D_input_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 8, 4, 4, 1), dtype=flow.float32) + out = flow._C.pixel_shuffle(x, 2, 2) + + test_case.assertTrue("Only Accept 4D Tensor" in str(ctx.exception)) + + def test_pixel_shuffle_channel_divisble_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 8, 4, 4), dtype=flow.float32) + out = flow._C.pixel_shuffle(x, 2, 3) + + test_case.assertTrue( + "The channels of input tensor must be divisible by (upscale_factor * upscale_factor) or (h_upscale_factor * w_upscale_factor)" + in str(ctx.exception) + ) + + +class TestTripletMarginLossError(flow.unittest.TestCase): + def test_triplet_margin_loss_reduce_type_error(test_case): + with test_case.assertRaises(Exception) as ctx: + anchor = flow.ones((3, 3), dtype=flow.float32) + positive = flow.ones((3, 3), dtype=flow.float32) + negative = flow.ones((3, 3), dtype=flow.float32) + + triplet_loss = flow._C.triplet_margin_loss( + anchor, + positive, + negative, + margin=0.001, + p=2, + eps=1e-5, + swap=False, + reduction="just_test", + ) + + test_case.assertTrue( + "Reduction should be none, sum or mean." in str(ctx.exception) + ) + + +class TestNormalError(flow.unittest.TestCase): + def test_normal_data_type_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow._C.normal(mean=0.0, std=1.0, size=(3, 3), dtype=flow.int32) + + test_case.assertTrue( + "Only support float and double in normal()." 
in str(ctx.exception) + ) + + def test_normal_out_tensor_data_type_error(test_case): + with test_case.assertRaises(RuntimeError) as ctx: + out = flow.zeros((3, 3), dtype=flow.float64) + x = flow._C.normal( + mean=0.0, std=1.0, size=(3, 3), dtype=flow.float32, out=out + ) + + test_case.assertTrue( + "data type oneflow.float32 does not match data type of out parameter oneflow.float64" + in str(ctx.exception) + ) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_normal_out_tensor_device_type_error(test_case): + with test_case.assertRaises(RuntimeError) as ctx: + out = flow.zeros((3, 3), dtype=flow.float32, device="cuda") + x = flow._C.normal( + mean=0.0, + std=1.0, + size=(3, 3), + dtype=flow.float32, + out=out, + device="cpu", + ) + + test_case.assertTrue( + "does not match device type of out parameter" in str(ctx.exception) + ) + + +class TestNormalizationError(flow.unittest.TestCase): + def test_normalization_moving_mean_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 4, 2, 2), dtype=flow.float32) + moving_mean = flow.ones((4,), dtype=flow.float32) + weight = flow.ones((4,), dtype=flow.float32) + bias = flow.ones((4,), dtype=flow.float32) + + out = flow._C.normalization( + x, moving_mean, None, weight, bias, 1, 1e-5, 0.9, False + ) + + test_case.assertTrue( + "Both moving_mean and moving_variance should be None or Tensor." + in str(ctx.exception) + ) + + def test_normalization_x_input_axes_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1,), dtype=flow.float32) + weight = flow.ones((4,), dtype=flow.float32) + bias = flow.ones((4,), dtype=flow.float32) + + out = flow._C.normalization( + x, None, None, weight, bias, 1, 1e-5, 0.9, False + ) + + test_case.assertTrue( + "NumAxes of x should be greater or equal than 2." in str(ctx.exception) + ) + + def test_normalization_eval_need_moving_statistic_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((1, 2,), dtype=flow.float32) + weight = flow.ones((2,), dtype=flow.float32) + bias = flow.ones((2,), dtype=flow.float32) + + out = flow._C.normalization( + x, None, None, weight, bias, 1, 1e-5, 0.9, False + ) + + test_case.assertTrue( + "Must have moving_mean and moving_variance in eval mode." + in str(ctx.exception) + ) + + +class TestOnehotError(flow.unittest.TestCase): + def test_onehot_error(test_case): + with test_case.assertRaises(Exception) as ctx: + x = flow.ones((3, 3), dtype=flow.float32) + out = flow._C.one_hot(x, 3, 0.9, 0) + + test_case.assertTrue( + "one_hot is only applicable to index tensor." 
in str(ctx.exception) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/graph/test_graph_zero.py b/python/oneflow/test/graph/test_graph_zero.py index 20fc7366bab..51fa38a8657 100644 --- a/python/oneflow/test/graph/test_graph_zero.py +++ b/python/oneflow/test/graph/test_graph_zero.py @@ -26,40 +26,42 @@ def train_with_graph(iter_num=1): P = flow.placement("cuda", ranks=[0, 1]) B = flow.sbp.broadcast S0 = flow.sbp.split(0) - linear = flow.nn.Linear(8, 4) - linear = linear.to_global(placement=P, sbp=B) - flow.nn.init.constant_(linear.weight, 2.068758) - flow.nn.init.constant_(linear.bias, 0.23) - of_sgd = flow.optim.SGD(linear.parameters(), lr=0.001, momentum=0.9) + + linear_dp = flow.nn.Linear(800, 400, bias=False) + linear_dp = linear_dp.to_global(placement=P, sbp=B) + flow.nn.init.constant_(linear_dp.weight, 2.068758) + + linear_mp = flow.nn.Linear(400, 500, bias=False) + linear_mp = linear_mp.to_global(placement=P, sbp=S0) + flow.nn.init.constant_(linear_mp.weight, 2.068758) + + of_sgd = flow.optim.SGD( + [{"params": linear_dp.parameters()}, {"params": linear_mp.parameters()}], + lr=0.001, + momentum=0.9, + ) grad_scaler = flow.amp.StaticGradScaler(200) - x = flow.randint(1, 100, (4, 8), dtype=flow.float32, placement=P, sbp=S0) + x = flow.randint(1, 100, (6, 800), dtype=flow.float32, placement=P, sbp=S0) class LinearTrainGraphWithZeRO(flow.nn.Graph): def __init__(self): super().__init__() - self.linear = linear + self.linear_dp = linear_dp + self.linear_mp = linear_mp self.add_optimizer(of_sgd) self.config.enable_amp(True) self.set_grad_scaler(grad_scaler) - if zero_stage == 1: - print("zero stage 1 optimization") - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - if zero_stage == 2: - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - flow.boxing.nccl.enable_use_compute_stream(True) - if zero_stage == 3: - print("zero stage 3 optimization") - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - flow.boxing.nccl.enable_use_compute_stream(True) - flow.boxing.nccl.disable_group_boxing_by_dst_parallel(True) + self.config.enable_zero( + True, stage=zero_stage, shard_min_size=1, shard_restore_level=0, + ) + self.debug(2) def build(self, x): - out = self.linear(x) + out = self.linear_dp(x) + out = out.to_global(placement=P, sbp=B) + out = self.linear_mp(out) loss = out.sum() loss.backward() return out @@ -67,19 +69,26 @@ def build(self, x): class LinearEvalGraphWithZeRO(flow.nn.Graph): def __init__(self): super().__init__() - self.linear = linear + self.linear_dp = linear_dp + self.linear_mp = linear_mp self.config.enable_amp(True) def build(self, x): - out = self.linear(x) + out = self.linear_dp(x) + out = out.to_global(placement=P, sbp=B) + out = self.linear_mp(out) return out linear_t_g = LinearTrainGraphWithZeRO() + linear_t_g.debug(1) linear_e_g = LinearEvalGraphWithZeRO() + linear_e_g.debug(1) def one_train_iter(): out = linear_t_g(x) + if flow.env.get_rank() == 0: + print(linear_t_g) def one_eval_iter(): out = linear_e_g(x) @@ -89,8 +98,116 @@ def one_eval_iter(): # After pass rewrite in training graph, parameters' sbp has been # changed from flow.sbp.broadcast to flow.sbp.split(0) - test_case.assertEqual(linear.weight.sbp[0], S0) - test_case.assertEqual(linear.bias.sbp[0], S0) + 
test_case.assertEqual(linear_dp.weight.sbp[0], S0) + test_case.assertEqual(linear_mp.weight.sbp[0], S0) + + # In evaluation graph, paramters's sbp are flow.sbp.split(0). + # But their consumer will consum them as flow.sbp.broadcast. + one_eval_iter() + + iter_num = 1 + graph_check_list = train_with_graph(iter_num) + + +def _test_linear_train_graph_2d_with_zero(test_case, zero_stage=1): + def train_with_graph(iter_num=1): + P = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) + B = flow.sbp.broadcast + S0 = flow.sbp.split(0) + S1 = flow.sbp.split(1) + + def get_mixed_linear(): + linear_dp_mp = flow.nn.Linear(800, 400, bias=False) + linear_dp_mp = linear_dp_mp.to_global(placement=P, sbp=[B, S0]) + flow.nn.init.constant_(linear_dp_mp.weight, 1.068758) + + linear_mp_dp = flow.nn.Linear(800, 400, bias=False) + linear_mp_dp = linear_mp_dp.to_global(placement=P, sbp=[S0, B]) + flow.nn.init.constant_(linear_mp_dp.weight, 1.068758) + + class MixedLinear(flow.nn.Module): + def __init__(self): + super().__init__() + self.dp_mp = linear_dp_mp + self.mp_dp = linear_mp_dp + + def forward(self, x): + x = self.dp_mp(x) + x = flow.relu(x) + x = self.mp_dp(x) + x = flow.relu(x) + return x + + return MixedLinear() + + mixed_linear0 = get_mixed_linear() + mixed_linear1 = get_mixed_linear() + + of_sgd = flow.optim.SGD( + [ + {"params": mixed_linear0.parameters()}, + {"params": mixed_linear1.parameters()}, + ], + lr=0.001, + momentum=0.9, + ) + grad_scaler = flow.amp.StaticGradScaler(200) + + x = flow.rand((2, 800), dtype=flow.float32, placement=P, sbp=[S0, B]) + + class LinearTrainGraph2DWithZeRO(flow.nn.Graph): + def __init__(self): + super().__init__() + self.mixed_linear0 = mixed_linear0 + self.mixed_linear0.config.activation_checkpointing = True + self.mixed_linear1 = mixed_linear1 + self.mixed_linear1.config.activation_checkpointing = True + self.add_optimizer(of_sgd) + + self.config.enable_amp(True) + self.set_grad_scaler(grad_scaler) + self.config.enable_zero( + True, stage=zero_stage, shard_min_size=1, shard_restore_level=1, + ) + + def build(self, x): + out = self.mixed_linear0(x) + out = self.mixed_linear1(out) + loss = out.mean() + loss.backward() + return loss + + class LinearEvalGraph2DWithZeRO(flow.nn.Graph): + def __init__(self): + super().__init__() + self.mixed_linear0 = mixed_linear0 + self.mixed_linear1 = mixed_linear1 + + self.config.enable_amp(True) + + def build(self, x): + out = self.mixed_linear0(x) + out = self.mixed_linear1(out) + return out + + linear_t_g = LinearTrainGraph2DWithZeRO() + linear_e_g = LinearEvalGraph2DWithZeRO() + + def one_train_iter(): + out = linear_t_g(x) + # if flow.env.get_rank() == 0: + # print(linear_t_g) + + def one_eval_iter(): + out = linear_e_g(x) + + for i in range(iter_num): + one_train_iter() + + for state in linear_t_g._state(): + test_case.assertEqual( + state.origin.sbp, (oneflow.sbp.split(axis=0), oneflow.sbp.split(axis=0)) + ) # In evaluation graph, paramters's sbp are flow.sbp.split(0). # But their consumer will consum them as flow.sbp.broadcast. 
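# The 2-D test above uses a 2x2 placement hierarchy (ranks=[[0, 1], [2, 3]]),
# so every nd-sbp carries one entry per hierarchy dimension. For the (2, 800)
# input with sbp [split(0), broadcast], dim 0 is split across the first
# hierarchy axis and replicated across the second, so each of the four ranks
# holds a (1, 800) local piece. A sketch of that, assuming a 4-GPU, 4-rank
# launch:
import oneflow as flow

P = flow.placement("cuda", ranks=[[0, 1], [2, 3]])
S0 = flow.sbp.split(0)
B = flow.sbp.broadcast
x = flow.rand((2, 800), dtype=flow.float32, placement=P, sbp=[S0, B])
print(x.shape)             # global shape: (2, 800)
print(x.to_local().shape)  # local shape on every rank: (1, 800)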
@@ -113,5 +230,18 @@ def test_linear_train_graph_with_zero_3(test_case): _test_linear_train_graph_with_zero(test_case, 3) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +@flow.unittest.skip_unless_1n4d() +class TestLinearTrainGraph2DWithZeRO(oneflow.unittest.TestCase): + def test_linear_train_graph_2d_with_zero_3(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 3) + + def test_linear_train_graph_2d_with_zero_2(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 2) + + def test_linear_train_graph_2d_with_zero_1(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 1) + + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/graph/test_optimization_conf.py b/python/oneflow/test/graph/test_optimization_conf.py index da6348b7033..a60d339be8b 100644 --- a/python/oneflow/test/graph/test_optimization_conf.py +++ b/python/oneflow/test/graph/test_optimization_conf.py @@ -66,7 +66,7 @@ def __init__(self): self.config.allow_fuse_add_to_output(True) self.config.allow_fuse_cast_scale(True) self.config.set_gradient_accumulation_steps(100) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") + self.config.enable_zero(True) self.config.enable_cudnn_conv_heuristic_search_algo(False) def build(self, x): diff --git a/python/oneflow/test/modules/test_consistent_diagonal.py b/python/oneflow/test/modules/test_consistent_diagonal.py index 1344eac515a..c93abe6272f 100644 --- a/python/oneflow/test/modules/test_consistent_diagonal.py +++ b/python/oneflow/test/modules/test_consistent_diagonal.py @@ -38,6 +38,7 @@ def _test_diagonal_impl(test_case, placement, sbp): return z +@unittest.skip("TODO: fix this test") class TestDiagonalConsistent(flow.unittest.TestCase): @globaltest def test_diagonal(test_case): diff --git a/python/oneflow/test/modules/test_consistent_slice.py b/python/oneflow/test/modules/test_consistent_slice.py index cc39410d200..d3dd5f7092a 100644 --- a/python/oneflow/test/modules/test_consistent_slice.py +++ b/python/oneflow/test/modules/test_consistent_slice.py @@ -117,18 +117,18 @@ def _test_logical_slice_with_bool(test_case, placement, sbp): def _test_logical_slice_with_grad(test_case, placement, sbp): - x = random_tensor(2, 4, 4, requires_grad=True).oneflow + x = random_tensor(2, 8, 16, requires_grad=True).oneflow x_numpy = x.detach().cpu().numpy() class LogicalSliceWithGrad(flow.nn.Module): def __init__(self): super().__init__() - self.input_grad = flow.nn.Parameter(flow.zeros(4, 4)) + self.input_grad = flow.nn.Parameter(flow.zeros(8, 16)) def forward(self, input): x = input + self.input_grad x = x.to_global(placement, sbp) - return x[:, :2] + return x[:, :8] logical_slice_with_grad = LogicalSliceWithGrad().to_global( placement, [flow.sbp.broadcast,] * len(sbp) @@ -154,10 +154,10 @@ def build(self, x): y = graph(input) # output - test_case.assertTrue(np.array_equal(y.numpy(), x_numpy[:, :2])) + test_case.assertTrue(np.array_equal(y.numpy(), x_numpy[:, :8])) # input_grad - x_grad_np = np.zeros((4, 4)) - x_grad_np[:, :2] = 1 + x_grad_np = np.zeros((8, 16)) + x_grad_np[:, :8] = 1 test_case.assertTrue( np.array_equal(-graph.module.input_grad.origin.numpy(), x_grad_np) ) diff --git a/python/oneflow/test/modules/test_consistent_slice_assign.py b/python/oneflow/test/modules/test_consistent_slice_assign.py index b2088b6bdd2..410b199ac53 100644 --- a/python/oneflow/test/modules/test_consistent_slice_assign.py +++ b/python/oneflow/test/modules/test_consistent_slice_assign.py @@ -23,39 +23,49 @@ def 
_test_logical_slice_assign(test_case, placement, sbp): - input = random_tensor(2, 4, 4, requires_grad=True).oneflow - x_numpy = input.detach().cpu().numpy() - + input = random_tensor(2, 8, 16, requires_grad=True).oneflow + value = random_tensor(2, 8, 8, requires_grad=True).oneflow x = (input + 0).to_global( placement=placement, sbp=sbp ) # add 0 to change to non-leaf tensor - x[:, :2] = 3 + y = value.to_global(placement, sbp=sbp) + x[:, :8] = y + + ref_np = input.detach().cpu().numpy() + value_np = value.detach().cpu().numpy() # forward - x_numpy[:, :2] = 3 + ref_np[:, :8] = value_np test_case.assertTrue(x.sbp == sbp) - test_case.assertTrue(np.array_equal(x.numpy(), x_numpy)) + test_case.assertTrue(np.array_equal(x.numpy(), ref_np)) # backward x.sum().backward() - input_grad_np = np.ones((4, 4)) - input_grad_np[:, :2] = 0 - test_case.assertTrue(np.array_equal(input.grad.numpy(), input_grad_np)) + # ref grad + ref_grad_np = np.ones((8, 16)) + ref_grad_np[:, :8] = 0 + test_case.assertTrue(np.array_equal(input.grad.numpy(), ref_grad_np)) + # value grad + value_grad_np = np.ones((8, 8)) + test_case.assertTrue(np.array_equal(value.grad.numpy(), value_grad_np)) def _test_graph_logical_slice_assign(test_case, placement, sbp): - x = random_tensor(2, 4, 4, requires_grad=True).oneflow - x_numpy = x.detach().cpu().numpy() + ref = random_tensor(2, 8, 16, requires_grad=True).oneflow + value = random_tensor(2, 8, 8, requires_grad=True).oneflow class LogicalSliceAssignWithGrad(flow.nn.Module): def __init__(self): super().__init__() - self.input_grad = flow.nn.Parameter(flow.zeros(4, 4)) + self.ref_grad = flow.nn.Parameter(flow.zeros(8, 16)) + self.value_grad = flow.nn.Parameter(flow.zeros(8, 8)) - def forward(self, input): - x = input + self.input_grad + def forward(self, ref, value): + x = ref + self.ref_grad + y = value + self.value_grad x = x.to_global(placement, sbp) - x[:, :2] = 3 + y = y.to_global(placement, sbp) + x[:, :8] = y return x logical_slice_assign_with_grad = LogicalSliceAssignWithGrad().to_global( @@ -72,27 +82,38 @@ def __init__(self): self.module = logical_slice_assign_with_grad self.add_optimizer(of_sgd) - def build(self, x): - out = self.module(x) + def build(self, x, y): + out = self.module(x, y) z = out.sum() z.backward() return out graph = LogicalSliceAssignTrainGraph() - input = x.to_global(placement=placement, sbp=sbp) - y = graph(input) + x = ref.to_global(placement=placement, sbp=sbp) + y = value.to_global(placement=placement, sbp=sbp) + z = graph(x, y) + + test_case.assertTrue(z.sbp == sbp) + + ref_np = ref.detach().cpu().numpy() + value_np = value.detach().cpu().numpy() - test_case.assertTrue(y.sbp == sbp) + # forward + ref_np[:, :8] = value_np + test_case.assertTrue(np.array_equal(z.numpy(), ref_np)) - # output - x_numpy[:, :2] = 3 - test_case.assertTrue(np.array_equal(y.numpy(), x_numpy)) - # input_grad - x_grad_np = np.ones((4, 4)) - x_grad_np[:, :2] = 0 + # backward + # ref grad + ref_grad = np.ones((8, 16)) + ref_grad[:, :8] = 0 + test_case.assertTrue( + np.array_equal(-graph.module.ref_grad.origin.numpy(), ref_grad) + ) + # value grad + value_grad = np.ones((8, 8)) test_case.assertTrue( - np.array_equal(-graph.module.input_grad.origin.numpy(), x_grad_np) + np.array_equal(-graph.module.value_grad.origin.numpy(), value_grad) ) diff --git a/python/oneflow/test/modules/test_conv2d.py b/python/oneflow/test/modules/test_conv2d.py index 0ba10de49af..9e8d62c9395 100644 --- a/python/oneflow/test/modules/test_conv2d.py +++ b/python/oneflow/test/modules/test_conv2d.py @@ -16,6 
+16,7 @@ import unittest from collections import OrderedDict +import os import numpy as np @@ -1894,6 +1895,107 @@ def test_conv2d_group_with_random_data(test_case): y = m(x) return y + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_conv2d_NHWC_with_random_data(test_case): + in_channels = np.random.randint(6, 33) + out_channels = np.random.randint(32, 66) + kernel_size = np.random.randint(1, 5) + stride = np.random.randint(1, 2) + padding = np.random.randint(1, 3) + dilation = np.random.randint(1, 3) + spatial = np.random.randint(6, 64) + + np_x = np.random.randn(4, in_channels, spatial, spatial).astype(np.float32) + np_weight = np.random.randn( + out_channels, in_channels, kernel_size, kernel_size + ).astype(np.float32) + np_bias = np.random.randn(out_channels).astype(np.float32) + + flow_nchw_input = flow.tensor( + np_x, device="cuda", dtype=flow.float32, requires_grad=True + ) + flow_nchw_weights = flow.nn.Parameter( + flow.tensor( + np_weight, device="cuda", dtype=flow.float32, requires_grad=True + ) + ) + flow_nchw_bias = flow.nn.Parameter( + flow.tensor(np_bias, device="cuda", dtype=flow.float32, requires_grad=True) + ) + + flow_nchw_conv = flow.nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + ).to("cuda") + flow_nchw_conv.weight = flow_nchw_weights + flow_nchw_conv.bias = flow_nchw_bias + + flow_nchw_out = flow_nchw_conv(flow_nchw_input) + + os.environ["ONEFLOW_ENABLE_NHWC"] = "1" + flow_nhwc_input = flow.tensor( + np_x, device="cuda", dtype=flow.float32, requires_grad=True + ) + flow_nhwc_permuted_input = flow.permute(flow_nhwc_input, (0, 2, 3, 1)) + flow_nhwc_weights = flow.tensor( + np_weight, device="cuda", dtype=flow.float32, requires_grad=True + ) + flow_nhwc_permuted_weights = flow.nn.Parameter( + flow.permute(flow_nhwc_weights, (0, 2, 3, 1)) + ) + flow_nhwc_bias = flow.nn.Parameter( + flow.tensor(np_bias, device="cuda", dtype=flow.float32, requires_grad=True) + ) + + flow_nhwc_conv = flow.nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + ).to("cuda") + flow_nhwc_conv.weight = flow_nhwc_permuted_weights + flow_nhwc_conv.bias = flow_nhwc_bias + + flow_nhwc_out = flow_nhwc_conv(flow_nhwc_permuted_input) + flow_nhwc_permuted_out = flow.permute(flow_nhwc_out, (0, 3, 1, 2)) + + test_case.assertTrue( + np.allclose( + flow_nchw_out.numpy(), + flow_nhwc_permuted_out.numpy(), + rtol=1e-4, + atol=1e-4, + ) + ) + + total_out = flow_nchw_out + flow_nhwc_permuted_out + + total_out = total_out.sum() + total_out.backward() + test_case.assertTrue( + np.allclose( + flow_nchw_weights.grad.numpy(), + np.transpose(flow_nhwc_permuted_weights.grad.numpy(), (0, 3, 1, 2)), + rtol=1e-4, + atol=1e-4, + ) + ) + test_case.assertTrue( + np.allclose( + flow_nchw_input.grad.numpy(), + flow_nhwc_input.grad.numpy(), + rtol=1e-4, + atol=1e-4, + ) + ) + os.environ["ONEFLOW_ENABLE_NHWC"] = "0" + @profile(torch.nn.functional.conv2d) def profile_conv2d(test_case): input = torch.ones(8, 128, 28, 28) diff --git a/python/oneflow/test/modules/test_cublas_fused_mlp.py b/python/oneflow/test/modules/test_cublas_fused_mlp.py index 147a1321152..0ee5d337613 100644 --- a/python/oneflow/test/modules/test_cublas_fused_mlp.py +++ b/python/oneflow/test/modules/test_cublas_fused_mlp.py @@ -21,9 +21,9 @@ import oneflow as flow -def _matmul_bias_relu(x, weight, bias, skip_activate): +def 
_matmul_bias_relu(x, weight, bias, skip_activation): out = flow._C.bias_add(flow._C.matmul(x, weight, transpose_b=True), bias, axis=1) - if not skip_activate: + if not skip_activation: out = flow._C.relu(out) return out @@ -176,7 +176,7 @@ def test_fused_matmul_op(test_case): args_dict["batchsize"] = [1, 2, 4] args_dict["in_feature"] = [96, 128] args_dict["hidden_size_list"] = [[256, 512], [256], [96, 144], []] - args_dict["out_feature"] = [512, 1024, 288] + args_dict["out_feature"] = [512, 1024, 288, 1] args_dict["skip_final_activation"] = [True, False] args_dict["dtype"] = [flow.float32, flow.float64] args_dict["device"] = ["cuda", "cpu"] diff --git a/python/oneflow/test/modules/test_cum_ops.py b/python/oneflow/test/modules/test_cum_ops.py index 6f366e37259..2088440a292 100644 --- a/python/oneflow/test/modules/test_cum_ops.py +++ b/python/oneflow/test/modules/test_cum_ops.py @@ -15,9 +15,11 @@ """ import unittest from collections import OrderedDict +import numpy as np import oneflow as flow import oneflow.unittest +import torch as ori_torch from oneflow.test_utils.automated_test_util import * @@ -64,6 +66,29 @@ def test_cumprod_with_user_dy(test_case): z = y * 2 return z + def test_cumprod_with_zero(test_case): + np_arr = np.ones((5, 5)) + np_arr_grad = np_arr + np_arr[2][3] = 0 + np_arr[4][3] = 0 + of_tensor = flow.tensor(np_arr, dtype=flow.float, requires_grad=True) + of_res = of_tensor.cumprod(dim=0) + of_res.backward(flow.tensor(np_arr_grad, dtype=flow.float)) + + torch_tensor = ori_torch.tensor( + np_arr, dtype=ori_torch.float, requires_grad=True + ) + torch_res = torch_tensor.cumprod(dim=0) + torch_res.backward(ori_torch.tensor(np_arr_grad, dtype=ori_torch.float)) + test_case.assertTrue( + np.allclose( + of_tensor.grad.numpy(), + torch_tensor.grad.numpy(), + rtol=0.0001, + atol=1e-05, + ) + ) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_fused_cross_interaction.py b/python/oneflow/test/modules/test_fused_cross_interaction.py new file mode 100644 index 00000000000..60140350946 --- /dev/null +++ b/python/oneflow/test/modules/test_fused_cross_interaction.py @@ -0,0 +1,154 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
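# The cumprod test above targets the backward pass when some inputs are zero:
# with y_k = x_0 * ... * x_k, the gradient is
#   dL/dx_i = sum_{k >= i} g_k * prod_{j <= k, j != i} x_j,
# and the usual shortcut prod_{j <= k, j != i} x_j = y_k / x_i is undefined
# once x_i == 0, which is exactly the case the test exercises. A tiny numpy
# reference of the well-defined (if O(n^2)) form, as a hypothetical helper:
import numpy as np

def cumprod_grad_reference(x, grad):
    dx = np.zeros_like(x)
    for i in range(len(x)):
        for k in range(i, len(x)):
            prod = 1.0
            for j in range(k + 1):
                if j != i:
                    prod *= x[j]
            dx[i] += grad[k] * prod
    return dx

print(cumprod_grad_reference(np.array([2.0, 0.0, 3.0]), np.ones(3)))  # [1. 8. 0.]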
+""" +import unittest +from collections import OrderedDict +import os +import numpy as np +from oneflow.test_utils.test_util import GenArgList + +import oneflow as flow + + +def _test_fused_cross_feature_interaction_v1( + test_case, batchsize, in_feature, dtype, device, +): + x = np.random.uniform(low=-1, high=1, size=(batchsize, in_feature)) + weight = np.random.uniform(low=-1, high=1, size=(1, in_feature)) + bias = np.random.uniform(low=-1, high=1, size=(in_feature)) + x0 = np.random.uniform(low=-1, high=1, size=(batchsize, in_feature)) + + fused_x = flow.tensor(x, dtype=dtype, device=device, requires_grad=True) + naive_x = flow.tensor(x, dtype=dtype, device=device, requires_grad=True) + fused_weight = flow.tensor(weight, dtype=dtype, device=device, requires_grad=True) + naive_weight = flow.tensor(weight, dtype=dtype, device=device, requires_grad=True) + fused_bias = flow.tensor(bias, dtype=dtype, device=device, requires_grad=True) + naive_bias = flow.tensor(bias, dtype=dtype, device=device, requires_grad=True) + fused_x0 = flow.tensor(x0, dtype=dtype, device=device, requires_grad=True) + naive_x0 = flow.tensor(x0, dtype=dtype, device=device, requires_grad=True) + + fused_out = flow._C.fused_cross_feature_interaction( + fused_x, fused_weight, fused_x0, fused_bias, "vector" + ) + + naive_out = ( + flow._C.matmul(naive_x, naive_weight, transpose_b=True) * naive_x0 + naive_bias + ) + naive_x + + total_out = fused_out.sum() + naive_out.sum() + total_out.backward() + + test_case.assertTrue( + np.allclose(fused_out.numpy(), naive_out.numpy(), atol=1e-4, rtol=1e-4) + ) + test_case.assertTrue( + np.allclose(fused_x.grad.numpy(), naive_x.grad.numpy(), atol=1e-4, rtol=1e-4,) + ) + test_case.assertTrue( + np.allclose( + fused_weight.grad.numpy(), naive_weight.grad.numpy(), atol=1e-4, rtol=1e-4, + ) + ) + test_case.assertTrue( + np.allclose(fused_x0.grad.numpy(), naive_x0.grad.numpy(), atol=1e-4, rtol=1e-4,) + ) + test_case.assertTrue( + np.allclose( + fused_bias.grad.numpy(), naive_bias.grad.numpy(), atol=1e-4, rtol=1e-4, + ) + ) + + +def _test_fused_cross_feature_interaction_v2( + test_case, batchsize, in_feature, dtype, device, +): + x = np.random.uniform(low=-1, high=1, size=(batchsize, in_feature)) + weight = np.random.uniform(low=-1, high=1, size=(in_feature, in_feature)) + bias = np.random.uniform(low=-1, high=1, size=(in_feature)) + x0 = np.random.uniform(low=-1, high=1, size=(batchsize, in_feature)) + + fused_x = flow.tensor(x, dtype=dtype, device=device, requires_grad=True) + naive_x = flow.tensor(x, dtype=dtype, device=device, requires_grad=True) + fused_weight = flow.tensor(weight, dtype=dtype, device=device, requires_grad=True) + naive_weight = flow.tensor(weight, dtype=dtype, device=device, requires_grad=True) + fused_bias = flow.tensor(bias, dtype=dtype, device=device, requires_grad=True) + naive_bias = flow.tensor(bias, dtype=dtype, device=device, requires_grad=True) + fused_x0 = flow.tensor(x0, dtype=dtype, device=device, requires_grad=True) + naive_x0 = flow.tensor(x0, dtype=dtype, device=device, requires_grad=True) + + fused_out = flow._C.fused_cross_feature_interaction( + fused_x, fused_weight, fused_x0, fused_bias, "matrix" + ) + + naive_out = ( + flow._C.bias_add( + flow._C.matmul(naive_x, naive_weight, transpose_b=True), naive_bias, axis=1 + ) + * naive_x0 + + naive_x + ) + + total_out = fused_out.sum() + naive_out.sum() + total_out.backward() + + test_case.assertTrue( + np.allclose(fused_out.numpy(), naive_out.numpy(), atol=1e-4, rtol=1e-4) + ) + test_case.assertTrue( + 
np.allclose(fused_x.grad.numpy(), naive_x.grad.numpy(), atol=1e-4, rtol=1e-4,) + ) + test_case.assertTrue( + np.allclose( + fused_weight.grad.numpy(), naive_weight.grad.numpy(), atol=1e-4, rtol=1e-4, + ) + ) + test_case.assertTrue( + np.allclose(fused_x0.grad.numpy(), naive_x0.grad.numpy(), atol=1e-4, rtol=1e-4,) + ) + test_case.assertTrue( + np.allclose( + fused_bias.grad.numpy(), naive_bias.grad.numpy(), atol=1e-4, rtol=1e-4, + ) + ) + + +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +@flow.unittest.skip_unless_1n1d() +class TestFusedCrossFeatureInteraction(flow.unittest.TestCase): + def test_fused_cross_feature_interaction_v1(test_case): + args_dict = OrderedDict() + args_dict["test_fun"] = [_test_fused_cross_feature_interaction_v1] + args_dict["batchsize"] = [1, 2, 4] + args_dict["in_feature"] = [32, 64, 96, 128] + args_dict["dtype"] = [flow.float32] + args_dict["device"] = ["cuda"] + + for arg in GenArgList(args_dict): + arg[0](test_case, *arg[1:]) + + def test_fused_cross_feature_interaction_v2(test_case): + args_dict = OrderedDict() + args_dict["test_fun"] = [_test_fused_cross_feature_interaction_v2] + args_dict["batchsize"] = [1, 2, 4] + args_dict["in_feature"] = [32, 64, 96, 128] + args_dict["dtype"] = [flow.float32] + args_dict["device"] = ["cuda"] + + for arg in GenArgList(args_dict): + arg[0](test_case, *arg[1:]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_fused_matmul_bias_add_relu_dropout.py b/python/oneflow/test/modules/test_fused_matmul_bias_add_relu_dropout.py new file mode 100644 index 00000000000..2945121ae2e --- /dev/null +++ b/python/oneflow/test/modules/test_fused_matmul_bias_add_relu_dropout.py @@ -0,0 +1,192 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import unittest +from collections import OrderedDict +import numpy as np +from oneflow.test_utils.test_util import GenArgList + +import oneflow as flow + + +def _matmul_bias_relu(x, weight, bias, skip_activate): + # We do not add dropout in unittest, cause its result is random. 
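# The two fused_cross_feature_interaction variants exercised above reduce to
# short elementwise formulas, which the "naive" branches of the tests spell
# out. A numpy restatement, assuming the shapes used in the tests (weight is
# (1, d) for the "vector" mode and (d, d) for the "matrix" mode):
import numpy as np

def cross_interaction_vector(x, weight, x0, bias):
    # "vector" mode: out = (x @ weight.T) * x0 + bias + x
    return np.matmul(x, weight.T) * x0 + bias + x

def cross_interaction_matrix(x, weight, x0, bias):
    # "matrix" mode: out = (x @ weight.T + bias) * x0 + x
    return (np.matmul(x, weight.T) + bias) * x0 + x

x = np.random.rand(4, 8)
x0 = np.random.rand(4, 8)
bias = np.random.rand(8)
assert cross_interaction_vector(x, np.random.rand(1, 8), x0, bias).shape == (4, 8)
assert cross_interaction_matrix(x, np.random.rand(8, 8), x0, bias).shape == (4, 8)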
+ out = flow._C.bias_add(flow._C.matmul(x, weight, transpose_b=True), bias, axis=1) + if not skip_activate: + out = flow._C.relu(out) + return out + + +def _test_fused_matmul_bias_add_relu_dropout( + test_case, + batchsize, + in_feature, + hidden_size_list, + out_feature, + skip_final_activation, + dtype, + device, +): + x = np.random.uniform(low=-1, high=1, size=(batchsize, in_feature)) + + fused_x = flow.tensor(x, dtype=dtype, device=device, requires_grad=True) + naive_x = flow.tensor(x, dtype=dtype, device=device, requires_grad=True) + + fused_weight_list = [] + naive_weight_list = [] + fused_bias_list = [] + naive_bias_list = [] + + hidden_num = len(hidden_size_list) + + if hidden_num != 0: + np_first_weight = np.random.uniform( + low=-1, high=1, size=(hidden_size_list[0], in_feature) + ) + np_first_bias = np.random.uniform(low=-1, high=1, size=hidden_size_list[0]) + + fused_weight_list.append( + flow.tensor(np_first_weight, dtype=dtype, device=device, requires_grad=True) + ) + fused_bias_list.append( + flow.tensor(np_first_bias, dtype=dtype, device=device, requires_grad=True) + ) + naive_weight_list.append( + flow.tensor(np_first_weight, dtype=dtype, device=device, requires_grad=True) + ) + naive_bias_list.append( + flow.tensor(np_first_bias, dtype=dtype, device=device, requires_grad=True) + ) + + for idx in range(1, hidden_num): + np_weight = np.random.uniform( + low=-1, high=1, size=(hidden_size_list[idx], hidden_size_list[idx - 1]) + ) + np_bias = np.random.uniform(low=-1, high=1, size=hidden_size_list[idx]) + + fused_weight_list.append( + flow.tensor(np_weight, dtype=dtype, device=device, requires_grad=True) + ) + fused_bias_list.append( + flow.tensor(np_bias, dtype=dtype, device=device, requires_grad=True) + ) + naive_weight_list.append( + flow.tensor(np_weight, dtype=dtype, device=device, requires_grad=True) + ) + naive_bias_list.append( + flow.tensor(np_bias, dtype=dtype, device=device, requires_grad=True) + ) + + np_final_weight = np.random.uniform(low=-1, high=1, size=(out_feature, in_feature)) + + if hidden_num != 0: + np_final_weight = np.random.uniform( + low=-1, high=1, size=(out_feature, hidden_size_list[-1]) + ) + + np_final_bias = np.random.uniform(low=-1, high=1, size=(out_feature)) + + fused_weight_list.append( + flow.tensor(np_final_weight, dtype=dtype, device=device, requires_grad=True) + ) + fused_bias_list.append( + flow.tensor(np_final_bias, dtype=dtype, device=device, requires_grad=True) + ) + naive_weight_list.append( + flow.tensor(np_final_weight, dtype=dtype, device=device, requires_grad=True) + ) + naive_bias_list.append( + flow.tensor(np_final_bias, dtype=dtype, device=device, requires_grad=True) + ) + + fused_out = flow._C.fused_matmul_bias_add_relu_dropout( + fused_x, + fused_weight_list, + fused_bias_list, + # We do not add dropout in unittest, cause its result is random. 
+ dropout_rate_list=[0.0] * len(fused_weight_list), + skip_final_activation=skip_final_activation, + ) + + naive_out = _matmul_bias_relu( + naive_x, + naive_weight_list[0], + naive_bias_list[0], + False if hidden_num != 0 else skip_final_activation, + ) + + for idx in range(1, hidden_num + 1): + if idx == hidden_num: + naive_out = _matmul_bias_relu( + naive_out, + naive_weight_list[idx], + naive_bias_list[idx], + skip_final_activation, + ) + else: + naive_out = _matmul_bias_relu( + naive_out, naive_weight_list[idx], naive_bias_list[idx], False + ) + + total_out = fused_out.sum() + naive_out.sum() + total_out.backward() + + test_case.assertTrue( + np.allclose(fused_out.numpy(), naive_out.numpy(), atol=1e-4, rtol=1e-4) + ) + + # Test weight grad equality + for idx in range(hidden_num + 1): + test_case.assertTrue( + np.allclose( + fused_weight_list[idx].grad.numpy(), + naive_weight_list[idx].grad.numpy(), + atol=1e-4, + rtol=1e-4, + ) + ) + test_case.assertTrue( + np.allclose( + fused_bias_list[idx].grad.numpy(), + naive_bias_list[idx].grad.numpy(), + atol=1e-4, + rtol=1e-4, + ) + ) + # Test dx equality + test_case.assertTrue( + np.allclose(fused_x.grad.numpy(), naive_x.grad.numpy(), atol=1e-4, rtol=1e-4) + ) + + +@flow.unittest.skip_unless_1n1d() +class TestFusedMatmulBiasAddReluDropout(flow.unittest.TestCase): + def test_fused_matmul_bias_add_relu_dropout(test_case): + args_dict = OrderedDict() + args_dict["test_func"] = [_test_fused_matmul_bias_add_relu_dropout] + args_dict["batchsize"] = [1, 2, 4] + args_dict["in_feature"] = [96, 128, 64] + args_dict["hidden_size_list"] = [[256, 512], [400, 400, 400, 400], [17, 33, 79]] + args_dict["out_feature"] = [512, 400, 1024, 1] + args_dict["skip_final_activation"] = [False] + args_dict["dtype"] = [flow.float32] + args_dict["device"] = ["cuda"] + + for arg in GenArgList(args_dict): + arg[0](test_case, *arg[1:]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_gather.py b/python/oneflow/test/modules/test_gather.py index b7925274e7c..8765242adaa 100644 --- a/python/oneflow/test/modules/test_gather.py +++ b/python/oneflow/test/modules/test_gather.py @@ -103,6 +103,36 @@ def _test_gather_backward(test_case, device): test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad)) +def _test_gather_index_0dim_tensor(test_case, device): + input = flow.ones(1).to(device) + input.requires_grad = True + index = flow.tensor(0).to(device) + output = flow.gather(input, 0, index) + test_case.assertTrue(np.array_equal(output.numpy(), 1.0)) + output.sum().backward() + test_case.assertTrue(np.array_equal(input.grad.numpy(), [1.0])) + + +def _test_gather_input_index_0dim_tensor(test_case, device): + input = flow.tensor(1.0).to(device) + input.requires_grad = True + index = flow.tensor(0).to(device) + output = flow.gather(input, 0, index) + test_case.assertTrue(np.array_equal(output.numpy(), 1.0)) + output.sum().backward() + test_case.assertTrue(np.array_equal(input.grad.numpy(), 1.0)) + + +def _test_gather_input_0dim_tensor(test_case, device): + input = flow.tensor(1.0).to(device) + input.requires_grad = True + index = flow.tensor([0]).to(device) + output = flow.gather(input, 0, index) + test_case.assertTrue(np.array_equal(output.numpy(), [1.0])) + output.sum().backward() + test_case.assertTrue(np.array_equal(input.grad.numpy(), 1.0)) + + @flow.unittest.skip_unless_1n1d() class TestGather(flow.unittest.TestCase): def test_gather(test_case): @@ -112,6 +142,9 @@ def test_gather(test_case): 
_test_gather_tensor_function, _test_gather_random_array, _test_gather_backward, + _test_gather_index_0dim_tensor, + _test_gather_input_index_0dim_tensor, + _test_gather_input_0dim_tensor, ] arg_dict["device"] = ["cpu", "cuda"] for arg in GenArgList(arg_dict): diff --git a/python/oneflow/test/modules/test_max.py b/python/oneflow/test/modules/test_max.py index 546a22a5ddd..919eabec6bc 100644 --- a/python/oneflow/test/modules/test_max.py +++ b/python/oneflow/test/modules/test_max.py @@ -98,6 +98,14 @@ def test_max_broadcast_dtype_promotion(test_case): y = random_tensor(ndim, *b_dims, dtype=int).to(device) return torch.max(x, y) + @autotest(n=3, auto_backward=True, check_graph=True) + def test_max_with_diff_size(test_case): + x = flow.rand(1, 1, 4, requires_grad=True) + y = flow.rand(1, 4, requires_grad=True) + x = random_tensor(3, 1, 1, 4) + y = random_tensor(2, 1, 4) + return torch.max(x, y) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_nccl_send_recv_boxing.py b/python/oneflow/test/modules/test_nccl_send_recv_boxing.py new file mode 100644 index 00000000000..20c8d09f4ed --- /dev/null +++ b/python/oneflow/test/modules/test_nccl_send_recv_boxing.py @@ -0,0 +1,103 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import unittest +from collections import OrderedDict +import oneflow +import numpy as np +import oneflow as flow +import oneflow.unittest +from oneflow.test_utils.test_util import GenArgList + +import time +import os + +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "1" + + +def _test_nccl_send_recv_boxing( + test_case, src_nd_sbp, dst_nd_sbp, src_ranks, dst_ranks +): + # can not process p in dst + if flow.sbp.partial_sum() in dst_nd_sbp: + return + # skip src == dst + if src_nd_sbp == dst_nd_sbp: + return + # in this case, use intra group boxing + if src_nd_sbp[0] == dst_nd_sbp[0]: + return + # in this case, use inter group boxing + if ( + src_nd_sbp[1] == dst_nd_sbp[1] + and src_nd_sbp[0] != src_nd_sbp[1] + and src_nd_sbp[0] != src_nd_sbp[1] + ): + return + # in this case, use 1d boxing + if src_nd_sbp[0] == src_nd_sbp[1] and dst_nd_sbp[0] == dst_nd_sbp[1]: + return + src_placement = flow.placement("cuda", ranks=src_ranks) + dst_placement = flow.placement("cuda", ranks=dst_ranks) + + class TestGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + + def build(self, x): + y = x.to_global(sbp=dst_nd_sbp, placement=dst_placement) + return y + + x = flow.tensor( + np.arange(12 * 16 * 16).reshape(12, 16, 16), + sbp=src_nd_sbp, + placement=src_placement, + ) + graph = TestGraph() + y = graph(x) + test_case.assertTrue(np.array_equal(y.numpy(), x.numpy())) + + +def gen_nd_sbp(): + sbp_list = [ + flow.sbp.partial_sum(), + flow.sbp.broadcast(), + flow.sbp.split(0), + flow.sbp.split(1), + flow.sbp.split(2), + ] + nd_sbp_list = [] + for sbp0 in sbp_list: + for sbp1 in sbp_list: + nd_sbp_list.append([sbp0, sbp1]) + return nd_sbp_list + + +@flow.unittest.skip_unless_1n4d() +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +class TestNcclSendRecvBoxing(flow.unittest.TestCase): + def test_nccl_send_recv_boxing(test_case): + arg_dict = OrderedDict() + arg_dict["src_nd_sbp"] = gen_nd_sbp() + arg_dict["dst_nd_sbp"] = gen_nd_sbp() + arg_dict["src_ranks"] = [[[0, 1], [2, 3]], [[0, 1]]] + arg_dict["dst_ranks"] = [[[0, 1], [2, 3]], [[2, 3]]] + for arg in GenArgList(arg_dict): + _test_nccl_send_recv_boxing(test_case, *arg) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_sparse.py b/python/oneflow/test/modules/test_sparse.py index 2a1ed9812db..01df8da23bb 100644 --- a/python/oneflow/test/modules/test_sparse.py +++ b/python/oneflow/test/modules/test_sparse.py @@ -184,7 +184,7 @@ def test_embedding_functional(test_case): # NOTE(Yao Zihang): Set check_graph=False temporarily # Graph mode do not support inplace op with flow.no_grad() # See this issue: https://github.com/Oneflow-Inc/OneTeam/issues/1382 - @autotest(n=5, check_graph="ValidatedFlase") + @autotest(n=5, rtol=1e-03, atol=1e-03, check_graph="ValidatedFlase") def test_embedding_renorm(test_case): device = random_device() emb_size = random(low=2) * 16 diff --git a/python/oneflow/test/modules/test_tensor_ops.py b/python/oneflow/test/modules/test_tensor_ops.py index 2894c5373ea..07d3252a614 100644 --- a/python/oneflow/test/modules/test_tensor_ops.py +++ b/python/oneflow/test/modules/test_tensor_ops.py @@ -168,6 +168,20 @@ def test_int_0dim(test_case): y = x.int() return y + @autotest(n=20, auto_backward=False, rtol=1e-4, atol=1e-4, check_graph=True) + def test_half(test_case): + device = random_device() + x = random_tensor(dtype=int).to(device) + y = x.half() + return y + + @autotest(n=20, auto_backward=False, rtol=1e-4, atol=1e-4, check_graph=True) + 
def test_half_0dim(test_case): + device = random_device() + x = random_tensor(ndim=0, dtype=int).to(device) + y = x.half() + return y + @autotest(n=20, auto_backward=False, rtol=1e-4, atol=1e-4, check_graph=True) def test_float(test_case): device = random_device() diff --git a/python/oneflow/test/tensor/test_tensor_part_1.py b/python/oneflow/test/tensor/test_tensor_part_1.py index d335ddfeb33..55da4a4a373 100644 --- a/python/oneflow/test/tensor/test_tensor_part_1.py +++ b/python/oneflow/test/tensor/test_tensor_part_1.py @@ -581,6 +581,30 @@ def test_broadcast_div_inplace_tensor(test_case): y.div_(x) return y + @flow.unittest.skip_unless_1n1d() + @autotest(check_graph=True) + def test_add_inplace_tensor(test_case): + device = random_device() + rand_tensor = random_tensor( + low=-2, high=2, ndim=4, dim0=6, dim1=9, dim2=14, dim3=17 + ).to(device) + y = rand_tensor + 1 + x = random_tensor(low=-2, high=2, ndim=4, dim0=6, dim1=9, dim2=14, dim3=17).to( + device + ) + y.add_(x) + return y + + @flow.unittest.skip_unless_1n1d() + @autotest(check_graph=True) + def test_broadcast_add_inplace_tensor(test_case): + device = random_device() + rand_tensor = random_tensor(ndim=3, dim0=5, dim1=9, dim2=23).to(device) + y = rand_tensor + 1 + x = random_tensor(ndim=2, dim0=9, dim1=23).to(device) + y.add_(x) + return y + @flow.unittest.skip_unless_1n1d() @autotest(check_graph=True) def test_sub_inplace_tensor(test_case): From 2a1810cbaa10cc0192e7a4842cd2f097c661d31f Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 23 Jun 2022 14:56:25 +0800 Subject: [PATCH 10/45] Support different hierarchy --- oneflow/core/framework/sbp_infer_util.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 095c2027d97..96f49089006 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -422,9 +422,6 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel reduced_in_nd_sbp.sbp_parallel(0), reduced_out_nd_sbp.sbp_parallel(0), logical_blob_desc, reduced_in_parallel_desc, reduced_out_parallel_desc)); } - // Not supporting different hierarchy - // TODO: Support it in the future - if (in_hierarchy->elem_cnt() != out_hierarchy->elem_cnt()) { return kUnsupportedBoxing; } double logical_blob_size = logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); @@ -439,6 +436,9 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel } #endif // WITH_CUDA + // Not supporting different hierarchy without general basic communication + if (in_hierarchy->elem_cnt() != out_hierarchy->elem_cnt()) { return kUnsupportedBoxing; } + bool on_same_devices = reduced_in_parallel_desc.EqualsIgnoringHierarchy(reduced_out_parallel_desc); From f46efa139f7d63f36d4d602a5553ae58fd517928 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 23 Jun 2022 18:58:04 +0800 Subject: [PATCH 11/45] Merge branch 'master' into feat-general_basic_communication (#8477) * Add distributed optional run (#8372) * Add * change deps * add install * add skip * autoprof supports bandwidth (#8367) * autoprof supports bandwidth Signed-off-by: daquexian * print bandwidth Signed-off-by: daquexian * auto format by CI Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * remove tmp buffer of cumprod cpu backward kernel (#8369) * remove tmp buffer of cumprod cpu backward kernel * refine * refine Co-authored-by: 
mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Move tensor api to cpython part3 (#8342) * add tensor_functions * concat py methods * add hash, restore tensor.py * check replacement * refine code, remove commented tensor.py * refine code * move some api * add cpu and cuda api * add triu tril norm and etc. * remove tensor_functions.h * move more api * move more api, refine size * fix typo * format code, remove useless include * refine code * refine code, fix typo * align .cuda to python * refine code * split some api to part3 for review * remove positional only arguments of argmax and argmin * remove arguments parse * modify arguments name in matmul and floor_divide * rename BINARY_FUNC to DIRECT_PASS_FUNC, modify some functions * refine code, format code * add inplace /=, add comments * remove name in macros * remove python api * remove redundant include * remove cout * format code * refactor tensor.size by directly call shape.at, refactor tensor.sub_ by calling nb_sub_ * remove redundant code * auto format by CI * fix typo, fix wrong call * modify idx datatype from int32 to int64 in tensor.size * add some DIRECT_PASS_FUNC * add cpu cuda var pow and etc. * add masked_fill any all * make REDUCE_FUNC macro, add reduce_* functions * add 0dim check in ReduceSumWhole, refine yaml * fix bug * restore add add_ sub sub_ * add unittest for tensor.half tensor.add tensor.add_ * refine code * refine code * fix typo * fix bug of tensor.std() * refactor var std and cuda, using c++ functional api * add beta and threshold in softplus * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add nn_functor Check (#7910) * add bias_add_check * add bias_add error test * fix conv2d nhwc bias_add error * add nhwc conv test * add bias_add_error test * Add bias add error check * Rename * add batch matmul error check * add matmul check error msg * remove annotation * add fused mlp error msg check * Add pixel shuffle check test * add more test until normalization add relu functor * refine error message * finish all nnfunctor check msg * handle type error * remove useless symbol * modify back to TypeError * fix all comment * Remove redundant code * Remove pad ndim check * fix bias add space * fix check logic cause ci gpu not always gpu:0 Co-authored-by: hjchen2 Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add FusedMatmulBiasAddReluDropout [OneEmbedding] (#8222) * previous version for fused_matmul_bias_add_relu_dropout * add op infer * fix detail * finish forward * support dropout rate list * add forward test * fix bug for output buffer * Configurable alpha params * try to add bit mask logic * Add bitmask first version! 
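(Editor's aside) The bitmask bullets above refer to storing each relu/dropout keep-decision as a single bit so the backward pass can replay exactly the same mask without keeping a full element-wise mask tensor. A minimal NumPy sketch of that general idea, purely illustrative: the dropout rate, sizes, and packbits layout are arbitrary and are not the CUDA kernel's actual bit layout.

    import numpy as np

    rate = 0.1
    rng = np.random.default_rng(0)
    x = rng.standard_normal(16).astype(np.float32)

    # Forward: combine the relu mask and the dropout keep mask into one boolean,
    # then pack it into bits (16 decisions -> 2 bytes of storage).
    keep = (x > 0) & (rng.random(16) >= rate)
    bitmask = np.packbits(keep.astype(np.uint8))
    y = np.where(keep, x / (1.0 - rate), 0.0)

    # Backward: unpack the stored bits and reuse them for the gradient.
    keep_again = np.unpackbits(bitmask)[: x.size].astype(bool)
    dy = np.ones_like(x)
    dx = np.where(keep_again, dy / (1.0 - rate), 0.0)
    assert np.array_equal(keep, keep_again)
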
* Add row col bitmask logic * support not align4 reludropout * simplify relu dropout ld logic * Add naive relu dropout grad kernel * add simple relu dropout grad kernel * Rename * support relu_dropout bitmask backward * add vectorized optimization * fix tmp buffer * add to amp list * add lazy backward logic * Refine kernel * add indextype dispatch * simplify functor logic * fix cublas fused mlp aux_ld shape bug * Add more relu dropout kernel * add full unittest * fix bug in skip final activation * refine * Remove dump func * fix format * Remove cmake * remove redundant divide * add padded version * fix dropout * oneflow curand * refine * remove redundant kernel * add unroll logic * add unroll and ballot sync * refine format * Remove fast curand * Refine python interface * Add if branch for memset * fix python logic * just for debug * not use matmul bias add grad * add launch 1 block limit * fix unittest * Refine * fix graph backward bug * limit to 11060 * change to use int32_t dtype for cublas aux * Fix jc comment * fix comment * fix convert * fix static_analysis * fix at * fix userops td * fix userops td * fix const ref * fix compile error for bfloat16 * limit to 11060 * fix bug Co-authored-by: Juncheng Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix gather 0-dim tensor bug (#8376) * fix 0-dim tensor bug * refine * support input 0-dim tensor for gather * refine * refine * refine dim_scatter_kernel check * refine * refine check * fix clang_tidy error Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add api to apply external job pass (#8370) * Add condition to find-test-cache-distributed (#8387) * add condition to find-test-cache-distributed * fix * warp dim util (#8382) * warp dim util * format * use more maybe_wrap_dim * refine array functor * add more * refine math_functor * fix_bug_in_broadcast_min_max_grad_and_broadcast_like (#8379) * fix_bug_in_broadcast_min_max_grad_and_broadcast_like * refine * fix static check error * fix bug about index (#8388) * fix bug about index * add test case Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * LogicalSliceAssign support full slice sbp (#8344) * feat(SliceOp): slice ops support 2d sbp * fix(SliceOp): fix [B, P] 2d sbp bug * refine error message * fix bug in parallel_num == 1 * add comment * add warning and format * add NOLINT for boxing check * feat(LogicalSliceOps): support all nd_sbp * feat(LogicalSlice): support nd_sbp * add error message * fix(AutoTest): fix auto_test bug in module.parameter pass * auto format by CI * fix(LogicalSliceAssign): skip test when 1n1d * fix SliceParams memset error * remove memset * add CHECK_JUST * fix(*): make sure split_axis >= 0 or equal to SPLIT_AXIS_FOR_NON_SPLIT * remove memset * fix spilit_info.axis bug * feat(LogicalSliceOps): support grad * add logical_slice gradient_funcs * feat(LogicalSliceAssign): LogicalSliceAssign support full slice sbp * auto format by CI * test(LogicalSlice): fix logical_slice dims Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Houjiang Chen Co-authored-by: oneflow-ci-bot * fix_tensor_from_numpy_mem_leak_bug (#8391) * fix_tensor_from_numpy_mem_leak_bug * add note * refine note * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Make of_pyext_obj static only to make sure only a python ext so has python symbols (#8393) * make of_pyext_obj static only * refine note Co-authored-by: mergify[bot] 
<37929162+mergify[bot]@users.noreply.github.com> * Adjust tolerance setting in embedding_renorm unit test (#8394) * support front end compile for job to iree (#8249) * support frontend dev version * polish name * add tosa-to-elf.mlir * tosa to elf by llvm * conv2d partial * an enhanced frontend runner * support numpy as input * enable multiple using nn graph with different input(jobname make it it cd /home/yuhao/frontend/oneflow ; /usr/bin/env /usr/bin/python3 /home/yuhao/.vscode-server/extensions/ms-python.python-2022.6.2/pythonFiles/lib/python/debugpy/launcher 40873 -- /home/yuhao/frontend/oneflow/oneflow/ir/test/Frontend/runner.py ) * enable multiple input * enable cpu and cuda * change full_name to _full_name * support exchange cuda with cpu seamlessly * remove pip * lit config * polish * trim * auto format by CI * modify * auto format by CI * last line polish * use unittest * auto format by CI * use allclose * auto format by CI * pulish * optimize convert oneflow to tosa * conv2d * conv2d enhanced && conv2d examples add * add road map * add add_n2Op and boardcast_addOp conversion * add matmulOp conversion * support converting normailzation op to tosa(partically) * update roadmap * support i64 tensor to dense elem attr * support 100% resnet op conversion * add test mlir * add test iree resnet python script * auto format by CI * done * enhance iree resnet test script * auto format by CI * rebuild code * auto format by CI * rebuild test script * update * auto format by CI * pub * trim test scripts * move * move * input and output add block arg judgement * emit error in variable conversion * error handle for ci * modify err info * auto format by CI * merge * auto format by CI * output not block * flow ones * rm const * trim maybe * trim maybe with header file * const auto * solve clangd error Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Feat/zero mix with mp (#8036) * add zero limit * add debug * add mix zero test * refactor zero api * zero test with mp * add 2d test * add zero nd * add nd zero * add sbp cast * test passed soft limit consumer * refine size api * zero use stage 2 * add limit consumer api * add new api * refine zero s select * fix index out of range * rm zero limit on device type * zero test with activation checkpointing * add indentity when dp sequence len is 1 * move to base with master * fix * fix * fix * add test * debug bad case * refine test for eager and graph boxing * test case ready * simplify * refine test * fix buff size * fix conflict * refine zero nd * refine * add full test * revert change * refine split check * fix typo * rm log * spit long func * restore test * Update optimizer_placement_optimization_pass.cpp * auto format by CI * auto format by CI * fix static check * add tips for zero api change * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Revert embedding normal path and fix amp list (#8374) * revert embedding normal path, fix amp list * fix amp * fix memset bug in gather cpu kernel Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * replace fixed_vector with small_vector and make Shape inherit from it (#8365) * Replace fixed_vector with llvm::SmallVector Signed-off-by: daquexian * Shape inherited from llvm::SmallVector Signed-off-by: daquexian * refine cmake Signed-off-by: daquexian * rename fixed_vector to small_vector Signed-off-by: daquexian * fix reviews Signed-off-by: 
daquexian * auto format by CI * update Shape constructor Signed-off-by: daquexian * add 'PUBLIC' keyword to all target_link_libraries Signed-off-by: daquexian * auto format by CI * update cmake Signed-off-by: daquexian * auto format by CI * update cmake Signed-off-by: daquexian * update cmake Signed-off-by: daquexian * auto format by CI * set is_initialized_ default to true Signed-off-by: daquexian * override some methods to set is_initialized_ Signed-off-by: daquexian * auto format by CI Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * Light plan for debug (#8396) * Light plan for debug * fix note * disable terminfo to fix missing terminfo symbols (#8400) * disable terminfo to fix missing terminfo symbols Signed-off-by: daquexian * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix bug of ZeRO MP in complex case (#8404) * Remove redundant output_lbns in ir (#8409) * mv case * remove redundant info * Dev FusedCrossInteraction[OneEmbedding] (#8335) * add simple fused cross interaction forward * add packed fused * Add cross interaction grad * simplify code * fix bug * support crossnet v2 * support cross interaction v2 * add lazy backward * Rename and add test * fix jc comment * fix comment * fix bug * fix userops td elem_cnt for FUSED Group * fix header file * fix clang static analysis * fix unittest Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add exe graph physical shape check msg (#8002) * fix index select op in graph * add exe graph physical shape check msg * improve the debug information for the python stack trace 1. add a parameter 'max_stack_depth' to specify the max depth for the stack trace 2. refactor other debug related classes. * remove parens * update * resolve PR comments * update * update graph debug test file. * restore self._debug in class Graph and class ModuleBlock * Do not shorten the stack frame string if it is in debug mode * delete TODOs * disable conv3d test (#7969) Signed-off-by: daquexian * skip layernorm random_data_warp test (#7941) * skip layernorm random_data_warp test * warp/block/uncached case only test gpu Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Lock click version (#7967) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add global avgpool unittest (#7585) * fix (#7978) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Support negative dim in scatter op (#7934) * support negative dim in scatter op * refine scatter test * refine scatter test again Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * run barrier callback in BarrierPhyInstrOperand::~BarrierPhyInstrOperand (#7702) * run barrier callback in BarrierPhyInstrOperand::~BarrierPhyInstrOperand * lock gil in vm Callback thread * more comments for VirtualMachineEngine::Callback() * the Env is never destroyed. * export Env into python * more unittests * wait shared_ptr.use_count() == 0 * export unittest.TestCase in framework/unittest.py * SwitchToShuttingDownPhase * optional is_normal_exit * VirtualMachine::CloseVMThreads * Delete env_api.h env_api.h is deleted by master * reshape_only_one_dim_infered * address pr comments * fix a ref-cnt bug in TryRunBarrierInstruction. 
* rollback flow.env.all_device_placement * no distributed running test_shutting_down.py * auto format by CI * expand lifetime of module oneflow in test_shutting_down.py * refine del depend on of * capture oneflow._oneflow_internal.eager when calling sync in __del__ * add try in flaky test Co-authored-by: Luyang Co-authored-by: chengtbf <472491134@qq.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot Co-authored-by: Xiaoyu Xu * Fix one hot scalar tensor bug (#7975) * fix reduce_sum scalar check bug * fix one_hot scalar tensor bug * fix clang tidy error Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * support ctor np array from of tensor (#7970) * support ctor np array from of tensor * add test case constructing np array from tensor * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * add_manual_seed_all_api (#7957) * add_manual_seed_all_api * Update conf.py * refine * add test case * auto format by CI * Update random_generator.cpp * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * one_embedding add doc string (#7902) * add doc string * add example * add * fix doc * refine * address review * mb to MB * add make_table_option * option to options * refine * add forward Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Support numpy scalar parameters (#7935) * feat(functional): support numpy scalar parameters * rename inferface * feat(*): TensorIndex support numpy scalar * feat(TensorIndex): support advance indexing * add unittest and int32 support for branch feat-param_support_np_scalar (#7939) * add unittest * refactor unittest * add todo for int16 advanced indexing * add int32 supporting for advance indexing * auto format by CI Co-authored-by: Wang Yi <53533850+marigoold@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * fix tensor_scatter_nd_update (#7953) * fix tensor_scatter_nd_update * auto backward Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix one_embedding adam (#7974) * fix one_embedding adam * fix tidy * fix normal Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * speed test with score (#7990) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Feat/graph del by ref (#7857) * remove IsMultiClient() and single client logic Signed-off-by: daquexian * rename eager.multi_client to eager Signed-off-by: daquexian * auto format by CI * add py ref * refine new session * clean code * make scope api inner use * use session with ref cnt * run barrier callback in BarrierPhyInstrOperand::~BarrierPhyInstrOperand * test pass * lock gil in vm Callback thread * more comments for VirtualMachineEngine::Callback() * merge * merge rm single client * rm initenv * merge and fix master * refactor env c api * add debug code * fix and serving test pass * test passed * rm useless * rm useless code * format * rm useless include * rm sync in py * the Env is never destroyed. 
* export Env into python * more unittests * fix and pass tests * revert virtual_machine.cpp * revert core/vm * remove outdated python class oneflow.unittest.TestCase * graph test passed * wait shared_ptr.use_count() == 0 * export unittest.TestCase in framework/unittest.py * SwitchToShuttingDownPhase * optional is_normal_exit * VirtualMachine::CloseVMThreads * Delete env_api.h env_api.h is deleted by master * address pr comments * rm is env init * Clear empty thread when graph destroy (#7633) * Revert "Clear empty thread when graph destroy (#7633)" (#7860) This reverts commit 3e8585e5fa20b97229d6b0be46a7ff814dc8cd83. * fix a ref-cnt bug in TryRunBarrierInstruction. * rm env_api * fix clang-tidy error * fix clang-tidy in env_imp * refine env api * format * refine graph del and sync at shuttingdown * fix typo * add comment * rm useless * rm useless Co-authored-by: daquexian Co-authored-by: oneflow-ci-bot Co-authored-by: lixinqi Co-authored-by: Li Xinqi Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Luyang Co-authored-by: cheng cheng <472491134@qq.com> * [PersistentTable] Fix num blocks (#7986) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add auto benchmark for flowvision (#7806) * update yml * update workflow * add resnet50 * [PersistentTable] Async write (#7946) * [PersistentTable] Async write * fix Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * save log in separate dir by default (#7825) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix index select op in graph * add exe graph physical shape check msg * improve the debug information for the python stack trace 1. add a parameter 'max_stack_depth' to specify the max depth for the stack trace 2. refactor other debug related classes. * remove parens * update * resolve PR comments * update * update graph debug test file. * restore self._debug in class Graph and class ModuleBlock * Do not shorten the stack frame string if it is in debug mode * delete TODOs * Revert "Merge branch 'master' into fea/graph_check_msg" This reverts commit 28833b73a8041463e5e3d130784be386ee248bd8, reversing changes made to baadf6045f2fce69c090e442a755229c1c949773. * Revert "Revert "Merge branch 'master' into fea/graph_check_msg"" This reverts commit 1d5e196d8530ffd2b9bf781abcf168b94ff9ca41. 
* update * resolve conflicts * resolve conflicts Co-authored-by: Cijie Xia Co-authored-by: daquexian Co-authored-by: guo ran <360112263@qq.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Shenghang Tsai Co-authored-by: Houjiang Chen Co-authored-by: Peihong Liu Co-authored-by: Li Xinqi Co-authored-by: Luyang Co-authored-by: chengtbf <472491134@qq.com> Co-authored-by: oneflow-ci-bot Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Co-authored-by: liufengwei0103 <2472937968@qq.com> Co-authored-by: binbinHan Co-authored-by: Yinggang Wang Co-authored-by: Wang Yi <53533850+marigoold@users.noreply.github.com> Co-authored-by: Shijie <821898965@qq.com> Co-authored-by: lixinqi Co-authored-by: Juncheng * add batch_matmul sbp (#8385) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * suppress gcc11 false positive warning (#8401) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix variable op conversion to tosa error in ninja c1 (#8412) * pub * move test iree resnet python script to oneflow_iree repo * add bracket * rename const_val to const_val_ and restore resnet.py test script Co-authored-by: Shenghang Tsai * Fix eval error in FusedMLP (#8413) Fix eval error * Init NCCL communicator in graph mode unifiedly (#8263) * centralized comm init * address review * revert * rename * ref nccl logical send recv * fix cpu only Co-authored-by: cheng cheng <472491134@qq.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix dim_scatter 0-dim tensor bug (#8418) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * target based external libraries (#8421) Signed-off-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Refine hardcoded attr setting/getting in ir (#8420) * use names in trait static func * more changes on op name attr * use wrapped func * Replace cu115 with cu116 in nightly (#8423) update workflows Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * fix repeat interleave 0-size tensor bug (#8414) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Autotest support print input in ci (#8383) * support print tensor value in autotest to provide more details in ci * revert * refine * auto format by CI * control precision to 1e-5 when record * fix bug * auto format by CI * relax tensor_size_mb * fix bug * fix bug * refine * releax * refinew * refine * fix bug * relax * refine * restruct * auto format by CI Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Modify sbp.split()'s karg: axis to dim (#8411) * Modify sbp.split()'s axis karg to dim * Refine * Refine * Refine * Refine * Feat/graph logical op debug repr (#8131) * add zero limit * add debug * add mix zero test * refactor zero api * zero test with mp * add 2d test * add zero nd * add nd zero * add sbp cast * test passed soft limit consumer * refine size api * add module config * save nn.Module info in job.proto for better debugging * add new line * add ModuleBlock.ops_proto() API * zero use stage 2 * print operators' info when print ModuleBlock * handle VariableOpConf * update * update * fix * move operators repr method to graph util * add limit consumer api * add new api * refine zero s select * add module block * fix * refact for rm op in module conf * fix * add sbp debug * add sbp 
repr * add shape * refine * add sys op in repr * add full op debug * fix index out of range * rm zero limit on device type * add no scope op to graph * zero test with activation checkpointing * fix order * add indentity when dp sequence len is 1 * add debug repr * refine repr of op * refine and fix * rm useless log * move to base with master * fix * fix * fix * fix proto * refine test * fix type * add test * debug bad case * refine test for eager and graph boxing * test case ready * simplify * refine test * fix buff size * fix conflict * refine zero nd * refine * add full test * revert change * refine split check * fix typo * rm log * spit long func * refine * restore test * refine pass and mem debug * merge master * repr dtype * add placement * Update optimizer_placement_optimization_pass.cpp * auto format by CI * auto format by CI * fix static check * add tips for zero api change * auto format by CI * fix merge * auto format by CI * auto format by CI * refine get job api * refine graph util import order * auto format by CI * fix static check * auto format by CI * fix special case * refine level print and add full dtype repr * rm useless Co-authored-by: Cijie Xia Co-authored-by: Cijie Xia Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * rm some test case in test_fused_dot_feature_interaction_pooling_sum (#8425) rm some case in test * Remove unused linkages (#8426) remove unused linkages * refactor stride (#8402) * Stride inherits DimVector Signed-off-by: daquexian * auto format by CI * fix argument type of OFStrideToNumpyStride Signed-off-by: daquexian Co-authored-by: oneflow-ci-bot * Move Tensor.__setitem__ and global related api to Python/C api (#8375) * add local_to_global, global_to_global, to_global. 
global_to_global still have bugs * fix bug of global_to_global * remove python api * add setitem * remove local_to_global sbp pack, format code * format code * remove redundant code * add error msg, refine check of to_global * fix bug of check * add error msg * fix clang static check error * remove useless api in tensor.py, remove redundant code, remove useless CHECK * add to_local * fix wrong exception type in unittest for to_local exception message * cuda add default error msg (#8427) default error Co-authored-by: Shenghang Tsai * Refactor ShapeView (#8422) * update Signed-off-by: daquexian * update and add docs Signed-off-by: daquexian * turn on view slice (#8302) * turn_on_view_slice * inplace scalar math hnandle non-contiguous input * fix clang check * add docs * refactor * auto format by CI Co-authored-by: oneflow-ci-bot * Add flow env init rdma api (#8415) * add_flow_env_init_rdma_api * adjust persistent_workers logic for RDMA support * adjust persistent_workers logic for RDMA support * add rmda_inited api * minro fix * add docs * Update python/oneflow/utils/data/dataloader.py Co-authored-by: daquexian * fix typo * refine * fix RDMAIsInitialized * minor fix * refine * rename InitRdma to InitRDMA * refine Co-authored-by: Flowingsun007 Co-authored-by: daquexian * add 1d send recv in nccl logical (#8355) * add 1d send recv in nccl logical * Update insert_nccl_logical_op_pass.cpp * auto format by CI Co-authored-by: cheng cheng <472491134@qq.com> Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Support iree ci (#8419) * create mlir cpu and modify build gcc 7 shell script * fix the bug of test_iree_resnet.py cuda test in cpu version error * fix constant folding tests * suport oneflow_test_cpu_only * pub * build script add flag * modify test yml * add python3 into \PATH * don't use pretrain model * install flowvision Co-authored-by: mosout Co-authored-by: jackalcooper * Feat straighten task nodes (#8347) * Add a fast topological traversal * Add an initial implementation of straighen nodes * Add the straighen nodes algorithm * Change algorithm structure * Remove some debug information * Finalize the straighten algorithm after deciding the parameters by experiments * Notify the usage of straighten algorithm * Of format * Update oneflow/core/graph/straighten_nodes.cpp Of format Co-authored-by: daquexian * Of format * Stop using visual string before we find a better key * Remove magic numbers and Of format * Remove starts * Of format * Fix a bug of using GetMaxVal() as an initial number for comparing * Refactor add straighten algo interface (#8435) * feat(*): export straighten nodes algorithm inferface * export documentation * Update python/oneflow/nn/graph/graph_config.py Co-authored-by: Yipeng Li Co-authored-by: Yipeng Li * Use TopoForEachNodeFast as default. (#8436) * Use TopoForEachNodeFast as default. 
Rename the original one as TopoForEachNodeDynamic * Speed up TopoForEachNodeFast when traversing a subgraph * Rename the switch and code clean up * Hide the class TopoStruct * Hide all the other functions * Grammar * Of format Co-authored-by: daquexian Co-authored-by: Yinggang Wang * Refactor NLLLoss to support split class dim (#8380) * refactor * RuntimeError * avoid atomic add * test * fixes * update test * update test * update test * fix kernel * improve backward * update test * out_weight to be required * address static analysis errer * fix static analysis error * fix static analysis error Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Strict ordering in memory reuse algorithm (#8441) * Support broadcast in fused_softmax kernel (#8321) * support broadcast * refine * Remove shape check * fix sbp when broadcast * rollback softmax grad threshold * increase threshold of test conv bn folding * tol to 1e-2 * check error msg of fuse softmax ops * add more dispatch * remove double datatype test and add broadcast test Co-authored-by: cheng cheng <472491134@qq.com> * Merge slice and logical slice (#8416) * remove Slice, SliceUpdate, SliceGrad op * rename logical_slice to slice and logical_slice_assign to slice_update * move gradient_func logical_slice.cpp to slice.cpp * fix some bug and refine local test * feat(SliceUpdate): support 0size tensor * test(Slice): refine consistent slice test * test(SliceUpdate): refine consistent slice_update test * not export slice_update's inplace parameter * auto format by CI * recovery slice_grad_op * fix slice_view bug * add error message and attr judgement * modified old test * auto format by CI * update test README * update tensor_string code * fix test bug * auto format by CI * fix(hsplit): hsplit functor bug * fix vsplit doc test bug * refine * fix test * fix pin_memory bug Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Graph block.config.set_stage() for recommended Pipeline api. (#8442) * Graph block.config.set_stage() for recommended Pipeline api. 
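(Editor's aside) A minimal sketch of how the recommended block.config.set_stage() pipeline API might be wired up inside an nn.Graph; the two-stage split, the placements P0/P1, and the exact set_stage keyword names are assumptions for illustration and are not taken from this patch.

    import oneflow as flow

    P0 = flow.placement("cuda", ranks=[0])
    P1 = flow.placement("cuda", ranks=[1])


    class PipelineGraph(flow.nn.Graph):
        def __init__(self, stage0_module, stage1_module):
            super().__init__()
            self.stage0 = stage0_module
            self.stage1 = stage1_module
            # Assumed signature: set_stage(stage_id, placement).
            self.stage0.config.set_stage(stage_id=0, placement=P0)
            self.stage1.config.set_stage(stage_id=1, placement=P1)

        def build(self, x):
            return self.stage1(self.stage0(x))
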
* revert diff * refine api doc Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Update PolynomialLR's doc and paramater (#8430) * update PolynomialLR doc, current_batch = min(decay_batch, current_batch) * * update PolynomialLR doc, current_batch = min(decay_batch, current_batch) * rename the steps to decay_batch in parameters * update PolynomialLR test case Co-authored-by: Yinggang Wang Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Add mv op (#8445) * add mv op with bug that Int is incompatible * add test * update test_mv.py * fix based on comments * fix based on comments Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * enable oneflow_iree(python package) and corresponding test works in ci (#8431) * update test.yml * add pytest for oneflow_iree examples * add oneflow frontend test * Dev tensor is pinned api (#8447) * support tensor.is_pinned * add test case * add docs * auto format by CI * refine * auto format by CI * refine * auto format by CI * refine * refine * refine Co-authored-by: oneflow-ci-bot * Nd sbp tensor str (#8458) * nd sbp tensor str * add nd sbp tensor str test * bigger input size * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Patch sbp cost (#8378) * Add a slight cost for B->S and B->P in 2d sbp * Add penalty for P in consumer * Add the slight penalty for eager * Consider B -> (B, B) for a scalar * Do not consider parallel description in priority ratio * Of format * Fix a bug in the old version group boxing with 2D SBP (#8448) * Update group boxing to deal with hierarchy [1, 2] * Use a uniform sbp while grouping consumers * Steal "ParallelDimReduce" from "hierarchical_sub_task_graph_builder_impl" to "sbp_infer_util" * Fix bugs of patch-sbp_cost (#8456) * Update group boxing to deal with hierarchy [1, 2] * Use a uniform sbp while grouping consumers * Steal "ParallelDimReduce" from "hierarchical_sub_task_graph_builder_impl" to "sbp_infer_util" * Reduce to uniform B for 1 device. Use the actual parallel description for each tensor * Fix a bug of fix-group_boxing-bug * Group boxing reduce [2, 2]: (S0, S0) to [4]: S0, then we might infer a 1D SBP from a 2D SBP hint Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: cheng cheng <472491134@qq.com> * Decouple stream and instruction (#7607) * remove deprecated python api * backup code * backup code * fix compiler complaints * fix typo in refactoring * kMockDevice * add unit test test_mock.py * revert mock kernels * vert DEVICE_TYPE_SEQ * mock placement * address pr comments * register device kCriticalSectionDevice and kLazyJobLauncher * kControlDevice * Stream::vm_stream_ * fix compiler complaints * backup code * rename StreamIsTransport to IsCommNetStream * decouple vm::StreamType and vm::InstructionType * fix compiler complaints * remove 'gpu' related code * address static analyzer complaints * address static analyzer complaints * remove unused module in test_mock.py * the Env is never destroyed. 
* export Env into python * more unittests * export unittest.TestCase in framework/unittest.py * SwitchToShuttingDownPhase * optional is_normal_exit * VirtualMachine::CloseVMThreads * Delete env_api.h env_api.h is deleted by master * reshape_only_one_dim_infered * address pr comments * rollback flow.env.all_device_placement * no distributed running test_shutting_down.py * auto format by CI * expand lifetime of module oneflow in test_shutting_down.py * refine del depend on of * fix oneflow.placement.__str__ * revert GlobalSync * init_producer_stream in oneflow.from_numpy * debug code for vm * init disable_vm_threads_ in VirtualMachine::VirtualMachine * Update oneflow/core/vm/virtual_machine.h Co-authored-by: daquexian * create stream in forked subprocesses. * refactor StreamRoleSwitch to StreamRoleVisistor * ThreadLocalGuard * auto format by CI * fix compiler complaints * fix static analyzer complaints * VirtualMachine::GetVmStream * fix static analyzer complaints * reimplement AddAndReadVector by std::deque * reimplement AddAndReadVector * merge master * increase atol for test_consistent_rnn_cell.py * StreamRole::AsyncLaunchedCommNet is bound to EventRecordedCudaStreamType * auto format by CI * remove StreamRoleVisitor::VisitInvalid * no copy in AddAndReadVector * fix bug of AddAndReadVector::size_ * disable terminfo to fix missing terminfo symbols Signed-off-by: daquexian * auto format by CI * fix AddAndReadVector::GetGranularity * remove bad unittest * auto format by CI * rename CallInstructionType to OpCallInstructionType * static variable GlobalSingletonPtr is a unique_ptr * replace ++atomic_cnt with atomic_cnt.fetch_add(1, std::memory_order_relaxed) * AddAndReadVector::operator[] * change comments 'lock free' to 'thread safe' * rename StatefulLocalOpKernel to StatefulOpKernel * rename VirtualMachine::vm_ to VirtualMachine::engine_ * mark VirtualMachine::NoMoreErasedInstructions private * mark VirtualMachine::FindOrCreateScheduleLocalDepObject private * remove unused version of VirtualMachineEngine::Receive * rename argname for VirtualMachineEngine::Receive * rename unused PendingInstructionList * rename AddAndReadVector to SteadyVector * optimize SteadyVector::operator[] by __builtin_clzll * refactor SteadyVector::granularity2vector_ to SteadyVector::granularity2data_ * reduce usage of steady_vector::size_ * rename unused anounymous namespace * greater atol for test_consistent_tensordot.py * fix BarrierInstructionType::ComputeInFuseMode * revert container_util.h * run AccessBlobByCallback in default stream of tensor->device * reslove static check * reslove static check * SteadyVector::MutableOrAdd Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: chengtbf <472491134@qq.com> Co-authored-by: oneflow-ci-bot Co-authored-by: Xiaoyu Xu Co-authored-by: daquexian Co-authored-by: binbinHan * fix_tensor_numpy_to_avoid_gpu_mem_increase (#8449) * fix_tensor_numpy_to_avoid_gpu_mem_increase * Update tensor.py * auto format by CI Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot * Rename user op tensor shape to shape view (#8433) * ThreadLocalGuard * rename user_op::Tensor::shape to user_op::Tensor::shape_view * auto format by CI * fix static analyzer complaints * more verbose code for HobDataType * larger timeout * larger timeout Co-authored-by: oneflow-ci-bot Co-authored-by: mergify[bot] 
<37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: jackalcooper Co-authored-by: binbinHan * speedup global test (#8468) * speedup global test * Test refine slice ops test (#8471) * refine consistent_slice test from 112s -> 30s in 4 device * test(SliceUpdate): refine test from 119s -> 28s in 4 device * delete useless code * auto format by CI Co-authored-by: Yinggang Wang Co-authored-by: wyg1997 Co-authored-by: oneflow-ci-bot * Set the minimum mtu value for IB communication connection (#8451) * Set the minimum mtu value for IB communication connection * refine * refine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> * Merge branch 'master' into feat-general_basic_communication Co-authored-by: Shenghang Tsai Co-authored-by: daquexian Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: oneflow-ci-bot Co-authored-by: liufengwei0103 <2472937968@qq.com> Co-authored-by: Wang Yi <53533850+marigoold@users.noreply.github.com> Co-authored-by: ZZK <359521840@qq.com> Co-authored-by: hjchen2 Co-authored-by: Juncheng Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Co-authored-by: Luyang Co-authored-by: binbinHan Co-authored-by: Yinggang Wang Co-authored-by: Yao Zihang <1162526220@qq.com> Co-authored-by: yuhao <72971170+howin98@users.noreply.github.com> Co-authored-by: Xiaoyu Xu Co-authored-by: cheng cheng <472491134@qq.com> Co-authored-by: Cijie Xia Co-authored-by: guo ran <360112263@qq.com> Co-authored-by: Peihong Liu Co-authored-by: Li Xinqi Co-authored-by: Shijie <821898965@qq.com> Co-authored-by: lixinqi Co-authored-by: leaves-zwx Co-authored-by: Li Xiang <54010254+lixiang007666@users.noreply.github.com> Co-authored-by: Cijie Xia Co-authored-by: Jia Co-authored-by: Shanshan Zhong <62104945+zhongshsh@users.noreply.github.com> Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com> Co-authored-by: wyg1997 Co-authored-by: Yu OuYang --- .github/workflows/canary.yml | 2 +- .github/workflows/on_merge.yml | 2 +- .github/workflows/release.yml | 8 +- .github/workflows/simple.yml | 4 +- .github/workflows/test.yml | 64 ++- ci/manylinux/build-gcc7.sh | 5 + ci/manylinux/build.sh | 5 + cmake/caches/cn/fast/mlir-cpu.cmake | 24 + cmake/oneflow.cmake | 18 +- cmake/util.cmake | 10 + docs/source/env.rst | 2 + docs/source/graph.rst | 2 + docs/source/oneflow.rst | 3 +- docs/source/tensor.rst | 2 + external/CMakeLists.txt | 8 +- external/onetbb/CMakeLists.txt | 1 - external/robin-hood-hashing/CMakeLists.txt | 13 +- oneflow/api/common/sbp.h | 2 +- oneflow/api/python/env/env.cpp | 2 + oneflow/api/python/framework/dtype.cpp | 5 +- oneflow/api/python/framework/nn_graph.cpp | 8 +- oneflow/api/python/framework/tensor.cpp | 26 +- .../api/python/framework/tensor_functions.cpp | 168 ++++++ oneflow/api/python/functional/tensor_api.cpp | 27 +- .../api/python/symbol/placement_symbol.cpp | 22 + oneflow/api/python/utils/tensor_utils.h | 4 +- oneflow/api/python/vm/id_generator.cpp | 41 -- .../autograd/gradient_funcs/logical_slice.cpp | 150 ------ oneflow/core/autograd/gradient_funcs/nll.cpp | 70 ++- .../core/autograd/gradient_funcs/slice.cpp | 58 +- .../core/boxing/nd_sbp_dim_reduce_boxing.cpp | 2 +- oneflow/core/boxing/slice_boxing_util.h | 1 + .../core/boxing/symmetric_b_to_s_boxing.cpp | 4 +- .../ibverbs/ibverbs_comm_network.cpp | 2 +- .../core/comm_network/ibverbs/ibverbs_qp.cpp | 7 +- .../core/comm_network/ibverbs/ibverbs_qp.h | 4 +- .../array_ref.h} | 19 +- oneflow/core/common/device_type.proto | 2 
+- oneflow/core/common/shape.cpp | 171 +++--- oneflow/core/common/shape.h | 117 ++-- oneflow/core/common/shape_view.cpp | 78 +-- oneflow/core/common/shape_view.h | 76 +-- .../singleton_ptr.h} | 32 +- oneflow/core/common/steady_vector.h | 102 ++++ .../steady_vector_test.cpp} | 28 +- oneflow/core/common/stream_role.h | 60 +-- oneflow/core/common/stride.cpp | 36 +- oneflow/core/common/stride.h | 25 +- oneflow/core/common/tensor_buffer.h | 2 + oneflow/core/cuda/softmax.cuh | 2 +- oneflow/core/device/cuda_util.cpp | 6 +- oneflow/core/eager/blob_instruction_type.cpp | 2 +- oneflow/core/eager/blob_instruction_type.h | 92 +++- .../eager/cpu_opkernel_instruction_type.cpp | 36 -- ...pp => critical_section_instruction_type.h} | 20 +- .../critical_section_phy_instr_operand.cpp | 17 +- .../critical_section_phy_instr_operand.h | 37 +- .../core/eager/cuda_blob_instruction_type.cpp | 59 -- .../eager/cuda_opkernel_instruction_type.cpp | 74 --- oneflow/core/eager/eager_blob_object.h | 20 +- ...n_type.cpp => lazy_job_instruction_type.h} | 17 +- .../core/eager/lazy_job_phy_instr_operand.cpp | 27 +- ..._type.cpp => op_call_instruction_type.cpp} | 70 ++- ...tion_type.h => op_call_instruction_type.h} | 17 +- ...rand.cpp => op_call_phy_instr_operand.cpp} | 25 +- ..._operand.h => op_call_phy_instr_operand.h} | 41 +- .../release_tensor_arg_phy_instr_operand.h | 5 +- ....cpp => release_tensor_instruction_type.h} | 79 ++- oneflow/core/framework/dtype.cpp | 9 + .../core/framework/instructions_builder.cpp | 221 ++++---- oneflow/core/framework/instructions_builder.h | 42 +- oneflow/core/framework/op_expr.cpp | 8 +- oneflow/core/framework/op_expr.h | 6 +- oneflow/core/framework/op_interpreter.h | 3 + .../eager_consistent_op_interpreter.cpp | 10 +- .../eager_mirrored_op_interpreter.cpp | 7 +- .../op_interpreter/op_interpreter.cpp | 4 + oneflow/core/framework/placement_sbp_util.cpp | 23 +- oneflow/core/framework/sbp_infer_util.cpp | 108 +++- oneflow/core/framework/sbp_infer_util.h | 12 +- oneflow/core/framework/stream.cpp | 50 +- oneflow/core/framework/stream.h | 29 +- .../stream_get_call_instruction_name.h | 99 ---- .../stream_get_release_instruction_name.h | 99 ---- .../framework/stream_get_stream_role_name.h | 40 ++ .../framework/stream_is_comm_net_stream.h | 19 +- oneflow/core/framework/stream_mgr.cpp | 61 +++ oneflow/core/framework/stream_mgr.h | 48 ++ .../core/framework/stream_need_soft_sync.h | 25 +- .../framework/stream_on_independent_thread.h | 37 ++ oneflow/core/framework/tensor.cpp | 2 +- oneflow/core/framework/tensor.h | 7 + .../core/framework/tensor_consistent_id.cpp | 1 + oneflow/core/framework/tensor_impl.cpp | 10 +- oneflow/core/framework/tensor_impl.h | 3 + oneflow/core/framework/tensor_meta.cpp | 2 +- oneflow/core/framework/tensor_methods.cpp | 79 ++- oneflow/core/framework/user_op_hob.h | 1 + oneflow/core/framework/user_op_tensor.h | 4 +- oneflow/core/functional/functional_api.yaml | 33 +- .../core/functional/impl/array_functor.cpp | 162 ++---- oneflow/core/functional/impl/common.cpp | 17 +- .../core/functional/impl/consistent_cast.cpp | 4 +- oneflow/core/functional/impl/math_functor.cpp | 15 +- oneflow/core/functional/impl/nn_functor.cpp | 146 +++-- .../core/functional/impl/nn_grad_functor.cpp | 30 +- oneflow/core/functional/tensor_index.cpp | 6 +- ...erarchical_sub_task_graph_builder_impl.cpp | 95 +--- ...hierarchical_sub_task_graph_builder_impl.h | 6 - oneflow/core/graph/graph.h | 150 +++++- oneflow/core/graph/op_graph.cpp | 3 +- oneflow/core/graph/straighten_nodes.cpp | 485 +++++++++++++++++ 
.../graph/straighten_nodes.h} | 14 +- oneflow/core/graph/task_graph.cpp | 9 +- oneflow/core/graph/task_graph.h | 2 +- oneflow/core/job/compiler.cpp | 3 +- oneflow/core/job/eager_nccl_comm_manager.cpp | 71 +++ oneflow/core/job/eager_nccl_comm_manager.h | 39 ++ oneflow/core/job/env_global_objects_scope.cpp | 58 +- oneflow/core/job/env_global_objects_scope.h | 4 + .../core/job/intra_job_mem_sharing_util.cpp | 27 +- oneflow/core/job/job_build_and_infer_ctx.cpp | 7 +- oneflow/core/job/job_builder.cpp | 44 ++ oneflow/core/job/job_builder.h | 1 + oneflow/core/job/job_conf.proto | 2 + oneflow/core/job/module_conf.proto | 4 +- oneflow/core/job/nd_sbp_util.cpp | 4 +- oneflow/core/job/plan_util.cpp | 45 +- oneflow/core/job/runtime.cpp | 4 + .../group_boxing_by_dst_parallel.cpp | 32 +- .../insert_nccl_logical_op_pass.cpp | 18 +- ...t_sparse_softmax_cross_entropy_op_pass.cpp | 6 +- oneflow/core/kernel/blob_tensor_view.cpp | 4 +- oneflow/core/kernel/blob_tensor_view.h | 4 +- oneflow/core/kernel/user_kernel.cpp | 4 +- .../ndarray/cpu_concat_var_ndarray_test.cpp | 37 +- .../ndarray/cpu_slice_var_ndarray_test.cpp | 40 +- oneflow/core/ndarray/cpu_var_ndarray_test.cpp | 8 +- oneflow/core/operator/operator.cpp | 6 - oneflow/core/vm/barrier_instruction_type.h | 66 +++ oneflow/core/vm/control_stream_type.cpp | 13 +- oneflow/core/vm/control_stream_type.h | 4 - oneflow/core/vm/cpu_stream_type.cpp | 16 +- oneflow/core/vm/cpu_stream_type.h | 4 - .../critical_section_status_querier.h | 6 +- .../critical_section_stream_type.cpp | 18 +- .../critical_section_stream_type.h | 10 +- oneflow/core/vm/cuda_copy_d2h_stream_type.cpp | 19 +- oneflow/core/vm/cuda_copy_d2h_stream_type.h | 4 - oneflow/core/vm/cuda_copy_h2d_stream_type.cpp | 18 +- oneflow/core/vm/cuda_copy_h2d_stream_type.h | 4 - oneflow/core/vm/cuda_stream_type.cpp | 18 +- oneflow/core/vm/cuda_stream_type.h | 4 - ...pp => event_recorded_cuda_stream_type.cpp} | 36 +- ...pe.h => event_recorded_cuda_stream_type.h} | 16 +- ...ction_type.cpp => fuse_instruction_type.h} | 32 +- oneflow/core/vm/fuse_phy_instr_operand.h | 9 +- oneflow/core/vm/id_generator.cpp | 44 -- oneflow/core/vm/id_generator.h | 60 --- oneflow/core/vm/id_util.cpp | 91 ---- oneflow/core/vm/id_util.h | 64 --- oneflow/core/vm/instr_type_id.h | 81 --- oneflow/core/vm/instruction.cpp | 59 +- oneflow/core/vm/instruction.h | 75 +-- oneflow/core/vm/instruction.proto | 49 -- oneflow/core/vm/instruction_type.cpp | 28 - oneflow/core/vm/instruction_type.h | 27 +- .../{eager => vm}/lazy_job_device_context.h | 6 +- .../{eager => vm}/lazy_job_stream_type.cpp | 18 +- .../core/{eager => vm}/lazy_job_stream_type.h | 10 +- oneflow/core/vm/runtime_instr_type_id.h | 52 -- .../core/vm/sequential_instruction_type.cpp | 105 ---- oneflow/core/vm/stream.cpp | 35 +- oneflow/core/vm/stream.h | 48 +- oneflow/core/vm/stream_desc.h | 99 ---- oneflow/core/vm/stream_get_stream_type.h | 108 ++++ oneflow/core/vm/stream_runtime_desc.h | 85 --- oneflow/core/vm/stream_type.h | 7 - oneflow/core/vm/thread_ctx.cpp | 2 +- oneflow/core/vm/thread_ctx.h | 17 +- oneflow/core/vm/virtual_machine.cpp | 296 +++++++---- oneflow/core/vm/virtual_machine.h | 48 +- oneflow/core/vm/virtual_machine_engine.cpp | 100 +--- oneflow/core/vm/virtual_machine_engine.h | 46 +- oneflow/core/vm/virtual_machine_scope.cpp | 2 +- oneflow/core/vm/vm_desc.cpp | 70 --- oneflow/core/vm/vm_desc.h | 74 --- oneflow/core/vm/vm_object.h | 3 - oneflow/core/vm/vm_util.cpp | 7 +- oneflow/extension/python/numpy.cpp | 7 +- oneflow/extension/python/numpy_internal.h | 4 +- 
oneflow/extension/python/py_compute.cpp | 8 +- oneflow/ir/include/OneFlow/OneFlowDialect.td | 1 + oneflow/ir/include/OneFlow/OneFlowOps.td | 6 - oneflow/ir/include/OneFlow/OneFlowPatterns.td | 2 +- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 122 ++--- oneflow/ir/include/OneFlow/Passes.h | 3 +- oneflow/ir/install-llvm.cmake | 1 + oneflow/ir/lib/OneFlow/CMakeLists.txt | 4 +- .../lib/OneFlow/Conversion/OneFlowToTosa.cpp | 10 +- .../ir/lib/OneFlow/Conversion/PTXToCubin.cpp | 2 +- .../ir/lib/OneFlow/Conversion/SCFToGPU.cpp | 70 --- oneflow/ir/lib/OneFlow/OneFlowOpFolders.cpp | 21 +- oneflow/ir/lib/OneFlow/Passes.cpp | 22 +- oneflow/ir/oneflow-extension/CMakeLists.txt | 2 +- oneflow/ir/oneflow-extension/extension.cpp | 12 +- oneflow/ir/oneflow-opt/oneflow-opt.cpp | 2 +- oneflow/ir/oneflow-runner/CMakeLists.txt | 2 +- .../lib/OneFlow/CMakeLists.txt | 2 +- .../lib/OneFlow/Importer.cpp | 16 +- .../ir/test/Frontend/test_tosa_to_elf.mlir | 2 +- .../cuda_code_gen/fuse_cast_scale.mlir | 18 +- .../OneFlow/cuda_code_gen/gpu_copy_arg.mlir | 4 +- .../OneFlow/cuda_code_gen/gpu_runner.mlir | 6 +- .../ir/test/OneFlow/folding/test_conv_bn.py | 4 +- .../OneFlow/folding/test_simple_multiply.py | 12 +- oneflow/ir/test/OneFlow/lower_to_tosa.mlir | 3 +- oneflow/ir/test/OneFlow/traits.mlir | 28 +- .../test_conv_bn_auto_nhwc.py | 20 +- oneflow/user/data/coco_parser.cpp | 39 +- .../ofrecord_image_classification_dataset.cpp | 2 +- .../ofrecord_image_classification_parser.h | 8 +- oneflow/user/data/ofrecord_parser.h | 6 +- oneflow/user/image/image_util.cpp | 20 +- oneflow/user/kernels/acc_kernel.cpp | 4 +- .../user/kernels/adaptive_pool_cpu_kernel.cpp | 2 +- .../user/kernels/adaptive_pool_gpu_kernel.cu | 6 +- oneflow/user/kernels/add_n_kernel.cpp | 4 +- oneflow/user/kernels/affine_grid_kernel.cpp | 12 +- oneflow/user/kernels/arg_sort_kernel.cpp | 4 +- oneflow/user/kernels/arg_sort_kernel.cu | 8 +- oneflow/user/kernels/arg_where_kernel.cpp | 6 +- oneflow/user/kernels/argmax_kernel.cpp | 4 +- oneflow/user/kernels/argmax_kernel.cu | 8 +- oneflow/user/kernels/as_strided_kernel.cpp | 19 +- oneflow/user/kernels/as_strided_kernel.cu | 19 +- oneflow/user/kernels/assign_if_kernel.cpp | 5 +- oneflow/user/kernels/assign_if_kernel.cu | 8 +- oneflow/user/kernels/assign_kernel.cpp | 4 +- oneflow/user/kernels/avg_pool_kernel.cpp | 54 +- oneflow/user/kernels/batch_gather_kernel.cpp | 11 +- .../user/kernels/batch_gather_kernel_util.cpp | 12 +- oneflow/user/kernels/bernoulli_kernel.cpp | 4 +- oneflow/user/kernels/bias_add_kernel.h | 12 +- .../kernels/binary_cross_entropy_kernel.cpp | 4 +- .../kernels/binary_cross_entropy_kernel.cu | 4 +- ...inary_cross_entropy_with_logits_kernel.cpp | 20 +- ...binary_cross_entropy_with_logits_kernel.cu | 20 +- .../kernels/broadcast_div_grad_kernel.cpp | 16 +- .../user/kernels/broadcast_like_kernel.cpp | 4 +- .../kernels/broadcast_pow_grad_kernel.cpp | 27 +- .../user/kernels/broadcast_pow_grad_kernel.cu | 12 +- oneflow/user/kernels/cast_kernel.cpp | 4 +- .../kernels/cast_to_static_shape_kernel.cpp | 6 +- .../categorical_ordinal_encode_kernel.cpp | 6 +- oneflow/user/kernels/clip_by_value_kernel.cpp | 12 +- .../kernels/combined_margin_loss_kernel.cpp | 12 +- .../kernels/combined_margin_loss_kernel.cu | 35 +- oneflow/user/kernels/concat_kernel.cpp | 12 +- oneflow/user/kernels/constant_kernel.cpp | 2 +- oneflow/user/kernels/conv_cudnn_kernels.cpp | 26 +- oneflow/user/kernels/conv_kernels.cpp | 41 +- .../user/kernels/copy_data_content_kernel.h | 4 +- oneflow/user/kernels/copy_kernel.cpp | 4 +- 
.../user/kernels/count_not_finite_kernel.cpp | 2 +- .../user/kernels/count_not_finite_kernel.cu | 10 +- oneflow/user/kernels/ctc_greedy_decoder.h | 8 +- oneflow/user/kernels/ctc_loss_kernel.cpp | 16 +- ...cublas_bias_add_relu_matmul_grad_kernel.cu | 4 +- .../cublas_fused_matmul_bias_add_grad.cu | 4 +- .../user/kernels/cublas_fused_mlp_kernel.cu | 6 +- oneflow/user/kernels/cum_backward_kernel.cpp | 8 +- oneflow/user/kernels/cum_backward_kernel.cu | 8 +- oneflow/user/kernels/cum_forward_kernel.cpp | 8 +- oneflow/user/kernels/cum_forward_kernel.cu | 8 +- oneflow/user/kernels/data_shuffle_kernel.cu | 60 +-- oneflow/user/kernels/deconv_cpu_kernel.cpp | 8 +- oneflow/user/kernels/deconv_cudnn_kernel.cpp | 8 +- oneflow/user/kernels/diag_kernel.h | 8 +- oneflow/user/kernels/diagonal_kernel.cpp | 8 +- oneflow/user/kernels/diagonal_kernel.cu | 8 +- oneflow/user/kernels/dim_gather_kernels.cpp | 25 +- oneflow/user/kernels/dim_scatter_kernels.cpp | 41 +- .../kernels/dim_scatter_scalar_kernels.cpp | 17 +- .../kernels/distributions/normal_kernel.h | 2 +- .../distributions/uniform_int_kernel.h | 2 +- .../kernels/distributions/uniform_kernel.h | 2 +- oneflow/user/kernels/dot_kernel.cpp | 2 +- oneflow/user/kernels/dropout_kernel.cpp | 10 +- oneflow/user/kernels/dropout_kernel.cu | 6 +- oneflow/user/kernels/eager_nccl_kernels.cpp | 36 +- oneflow/user/kernels/eager_nccl_kernels.cu | 33 +- .../kernels/eager_symmetric_s_to_p_kernel.cpp | 2 +- .../elementwise_maximum_minimum_kernel.h | 7 +- oneflow/user/kernels/elementwise_xpu_kernel.h | 14 +- oneflow/user/kernels/embedding_kernel.cpp | 14 +- oneflow/user/kernels/embedding_kernel.cu | 18 +- oneflow/user/kernels/empty_kernel.cpp | 2 +- oneflow/user/kernels/erfinv_kernel.cpp | 2 +- oneflow/user/kernels/erfinv_kernel.cu | 2 +- oneflow/user/kernels/example_generated.h | 4 +- oneflow/user/kernels/expand_kernel.cpp | 18 +- oneflow/user/kernels/expand_kernel.cu | 18 +- oneflow/user/kernels/eye_kernel.cpp | 2 +- .../user/kernels/fake_quantization_kernel.cpp | 10 +- .../user/kernels/fake_quantization_kernel.cu | 6 +- oneflow/user/kernels/flip_kernel.cpp | 8 +- oneflow/user/kernels/flip_kernel.cu | 8 +- oneflow/user/kernels/fold_kernel.cpp | 5 +- oneflow/user/kernels/fused_bias_add_kernel.cu | 24 +- .../user/kernels/fused_cast_scale_kernel.cpp | 2 +- .../user/kernels/fused_cast_scale_kernel.cu | 2 +- .../fused_cross_feature_interaction.cu | 9 +- .../fused_cross_feature_interaction_grad.cu | 30 +- .../fused_dot_feature_interaction_kernel.cu | 48 +- oneflow/user/kernels/fused_gru_cell_kernel.cu | 38 +- .../user/kernels/fused_lstm_cell_kernel.cu | 30 +- .../fused_matmul_bias_add_relu_dropout.cu | 8 +- .../kernels/fused_relu_dropout_grad_kernel.cu | 6 +- .../user/kernels/fused_scale_mask_softmax.cu | 262 +++++---- .../user/kernels/fused_scale_mask_softmax.cuh | 216 ++++++++ .../fused_scale_mask_softmax_dropout.cu | 277 ++++++---- ...ttention_query_mul_key_and_value_kernel.cu | 23 +- ...ed_tril_scale_softmax_mask_scale_kernel.cu | 4 +- oneflow/user/kernels/gather_kernel.cpp | 19 +- oneflow/user/kernels/gather_kernel_util.cpp | 4 +- oneflow/user/kernels/gelu_kernel.cpp | 2 +- oneflow/user/kernels/gelu_kernel.cu | 2 +- ...andom_batch_permutation_indices_kernel.cpp | 4 +- ...random_batch_permutation_indices_kernel.cu | 5 +- .../user/kernels/gpt_data_loader_kernel.cpp | 8 +- oneflow/user/kernels/grid_sample_kernel.cpp | 12 +- .../user/kernels/grid_sample_kernel_util.cu | 12 +- oneflow/user/kernels/group_conv_kernel.cpp | 60 ++- oneflow/user/kernels/group_deconv_kernel.cpp | 20 +- 
.../kernels/heap_selection_top_k_kernel.cu | 6 +- oneflow/user/kernels/identity_kernel.cpp | 4 +- .../user/kernels/image_batch_align_kernel.cpp | 26 +- oneflow/user/kernels/image_decode_kernel.cpp | 6 +- .../image_object_preprocess_kernels.cpp | 92 ++-- .../user/kernels/image_preprocess_kernels.cpp | 24 +- .../user/kernels/image_preprocess_kernels.cu | 4 +- oneflow/user/kernels/image_resize_kernels.cpp | 42 +- .../kernels/image_target_resize_kernel.cpp | 30 +- oneflow/user/kernels/in_top_k_kernel.cpp | 10 +- .../indexed_slices_reduce_sum_kernel.cpp | 6 +- .../l1_l2_regularize_gradient_kernel.cpp | 2 +- oneflow/user/kernels/l2_normalize_kernel.cpp | 14 +- oneflow/user/kernels/l2_normalize_kernel.cu | 12 +- oneflow/user/kernels/layer_norm_gpu_kernel.cu | 16 +- oneflow/user/kernels/log_softmax_kernel.cpp | 8 +- oneflow/user/kernels/logical_not_kernel.cpp | 2 +- oneflow/user/kernels/logical_not_kernel.cu | 2 +- oneflow/user/kernels/loss_kernel_util.h | 4 +- oneflow/user/kernels/masked_fill_kernel.cpp | 6 +- .../kernels/math_binary_broadcast_kernels.cpp | 20 +- .../math_binary_elementwise_kernel.cpp | 6 +- .../kernels/math_binary_elementwise_kernel.cu | 12 +- .../kernels/math_unary_elementwise_kernel.cpp | 4 +- .../kernels/math_unary_elementwise_kernel.cu | 8 +- oneflow/user/kernels/matmul_kernels.cpp | 66 +-- oneflow/user/kernels/max_pool_kernel.cpp | 58 +- oneflow/user/kernels/median_kernel.cpp | 2 +- oneflow/user/kernels/median_kernel.cu | 4 +- .../kernels/median_with_indices_kernel.cpp | 6 +- .../kernels/median_with_indices_kernel.cu | 10 +- .../user/kernels/min_max_observer_kernel.cpp | 8 +- .../user/kernels/min_max_observer_kernel.cu | 4 +- oneflow/user/kernels/model_update_kernels.cpp | 119 +++-- ...moving_average_min_max_observer_kernel.cpp | 2 +- .../moving_average_min_max_observer_kernel.cu | 6 +- oneflow/user/kernels/multi_reduce_kernels.h | 4 +- oneflow/user/kernels/narrow_kernel.cpp | 12 +- .../kernels/nccl_logical_2d_sbp_kernels.cpp | 67 ++- oneflow/user/kernels/nccl_logical_kernels.cpp | 66 +-- .../kernels/nccl_logical_send_recv_kernel.cpp | 23 +- oneflow/user/kernels/nd_index_slice_kernels.h | 14 +- oneflow/user/kernels/nd_index_slice_util.h | 10 +- oneflow/user/kernels/nll_kernel.cpp | 254 +++++---- oneflow/user/kernels/nll_kernel.cu | 207 ------- oneflow/user/kernels/nll_kernel_util.cpp | 63 +++ oneflow/user/kernels/nll_kernel_util.cu | 92 ++++ oneflow/user/kernels/nll_kernel_util.h | 36 ++ oneflow/user/kernels/nms_kernel.cu | 2 +- oneflow/user/kernels/normalization_kernel.cpp | 49 +- oneflow/user/kernels/normalization_kernel.cu | 73 +-- oneflow/user/kernels/nvtx_range_kernel.cu | 8 +- .../user/kernels/ofrecord_decoder_kernels.cpp | 16 +- oneflow/user/kernels/one_embedding_kernels.cu | 14 +- .../kernels/one_embedding_update_kernels.cu | 96 ++-- oneflow/user/kernels/one_hot_kernel.cpp | 4 +- oneflow/user/kernels/one_hot_kernel.cu | 2 +- .../user/kernels/onerec_decoder_kernels.cpp | 16 +- oneflow/user/kernels/ones_like_kernel.cpp | 2 +- oneflow/user/kernels/p2p_comm_kernel.cpp | 6 +- oneflow/user/kernels/pack_kernel.cpp | 20 +- oneflow/user/kernels/pad2d_kernels.cpp | 68 +-- oneflow/user/kernels/pad_kernel.cpp | 8 +- .../user/kernels/partial_fc_sample_kernel.cu | 10 +- oneflow/user/kernels/prelu_kernel.cpp | 18 +- oneflow/user/kernels/prelu_kernel.cu | 22 +- oneflow/user/kernels/quantization_kernel.cpp | 12 +- oneflow/user/kernels/quantization_kernel.cu | 6 +- .../user/kernels/radix_sort_top_k_kernel.cu | 10 +- .../user/kernels/random_mask_like_kernel.h | 2 +- 
oneflow/user/kernels/reduce_kernel.cpp | 30 +- oneflow/user/kernels/reduce_like_kernels.cpp | 31 +- oneflow/user/kernels/relu_bfloat16_kernel.cu | 2 +- .../user/kernels/repeat_interleave_kernel.cpp | 2 +- .../user/kernels/repeat_interleave_kernel.cu | 4 +- oneflow/user/kernels/repeat_kernel.cpp | 4 +- oneflow/user/kernels/roc_auc_score_kernel.cpp | 6 +- oneflow/user/kernels/roi_align_kernel.cu | 16 +- oneflow/user/kernels/roll_kernel.cpp | 4 +- oneflow/user/kernels/roll_kernel.cu | 4 +- oneflow/user/kernels/roll_kernel_utils.h | 2 +- oneflow/user/kernels/same_padding_kernel.cpp | 40 +- .../user/kernels/scalar_by_tensor_kernel.cpp | 6 +- .../user/kernels/scalar_logical_kernels.cpp | 2 +- oneflow/user/kernels/scalar_math_kernels.cpp | 8 +- oneflow/user/kernels/scalar_math_kernels.cu | 4 +- oneflow/user/kernels/search_sorted_kernel.cpp | 14 +- oneflow/user/kernels/search_sorted_kernel.cu | 14 +- .../kernels/sigmoid_cross_entropy_kernel.h | 4 +- oneflow/user/kernels/slice_kernel.cpp | 197 +++---- oneflow/user/kernels/slice_util.h | 10 + .../kernels/softmax_cross_entropy_kernel.h | 14 +- oneflow/user/kernels/softmax_kernel.cpp | 6 +- oneflow/user/kernels/sort_kernel.cpp | 6 +- oneflow/user/kernels/sort_kernel.cu | 10 +- .../kernels/sparse_cross_entropy_kernel.cpp | 30 +- .../sparse_softmax_cross_entropy_kernel.cpp | 22 +- .../sparse_softmax_cross_entropy_kernel.cu | 6 +- oneflow/user/kernels/split_like_kernel.cpp | 8 +- .../user/kernels/sqrt_square_sum_kernel.cpp | 5 +- oneflow/user/kernels/square_sum_kernel.cpp | 4 +- .../kernels/ssp_variable_proxy_kernel.cpp | 4 +- oneflow/user/kernels/stack_kernel.cpp | 20 +- ...cal_opkernel.cpp => stateful_opkernel.cpp} | 26 +- ...l_local_opkernel.h => stateful_opkernel.h} | 34 +- oneflow/user/kernels/summary_kernels.cpp | 8 +- oneflow/user/kernels/tanh_grad_kernel.cu | 2 +- oneflow/user/kernels/tanh_kernel.cpp | 2 +- .../user/kernels/tensor_buffer_kernels.cpp | 30 +- oneflow/user/kernels/tf_prelu_kernel.cpp | 16 +- oneflow/user/kernels/tf_prelu_kernel.cu | 28 +- oneflow/user/kernels/to_contiguous_kernel.cpp | 8 +- oneflow/user/kernels/top_k_kernel.cpp | 6 +- oneflow/user/kernels/transpose_kernel.cpp | 6 +- oneflow/user/kernels/tril_kernel.cpp | 2 +- oneflow/user/kernels/tril_kernel.cu | 4 +- oneflow/user/kernels/triu_kernel.cpp | 2 +- oneflow/user/kernels/triu_kernel.cu | 2 +- .../user/kernels/tuple_identity_kernel.cpp | 4 +- .../user/kernels/two_stage_reduce_kernel.cpp | 71 +-- oneflow/user/kernels/unfold_kernel.cpp | 2 +- oneflow/user/kernels/unfold_tensor_kernel.cpp | 22 +- oneflow/user/kernels/unfold_tensor_kernel.cu | 22 +- .../kernels/unique_with_counts_kernel.cpp | 4 +- oneflow/user/kernels/unpack_kernel.cpp | 12 +- .../unsorted_batch_segment_sum_kernel.cpp | 9 +- .../kernels/unsorted_segment_sum_kernel.cpp | 27 +- .../kernels/upsample_bicubic_2d_kernel.cpp | 26 +- .../kernels/upsample_bicubic_2d_kernel.cu | 30 +- .../kernels/upsample_bilinear_2d_kernel.cpp | 54 +- .../kernels/upsample_bilinear_2d_kernel.cu | 50 +- .../kernels/upsample_linear_1d_kernel.cpp | 42 +- .../user/kernels/upsample_linear_1d_kernel.cu | 36 +- .../user/kernels/upsample_nearest_kernel.cpp | 162 +++--- .../user/kernels/upsample_nearest_kernel.cu | 156 +++--- .../kernels/upsample_trilinear_3d_kernel.cpp | 74 +-- .../kernels/upsample_trilinear_3d_kernel.cu | 74 +-- oneflow/user/kernels/variance_kernel.cpp | 4 +- oneflow/user/kernels/where_kernel.cpp | 78 +-- oneflow/user/kernels/zero_like_kernel.cpp | 2 +- oneflow/user/ops/flatten_op.cpp | 2 +- 
.../fused_scale_mask_softmax_dropout_op.cpp | 82 ++- .../user/ops/fused_scale_mask_softmax_op.cpp | 69 ++- .../user/ops/math_binary_broadcast_ops.cpp | 4 +- oneflow/user/ops/nll_op.cpp | 227 +++++--- oneflow/user/ops/slice_op.cpp | 354 +++--------- oneflow/user/summary/event_writer_helper.cpp | 21 +- python/oneflow/__init__.py | 4 +- python/oneflow/env.py | 28 + python/oneflow/framework/distribute.py | 44 +- python/oneflow/framework/docstr/math_ops.py | 37 +- python/oneflow/framework/docstr/tensor.py | 32 +- .../framework/docstr/tensor_attributes.py | 6 +- python/oneflow/framework/graph_build_util.py | 49 +- python/oneflow/framework/tensor.py | 67 +-- python/oneflow/framework/tensor_str.py | 4 - python/oneflow/framework/tensor_str_util.py | 15 +- python/oneflow/nn/graph/block.py | 40 +- python/oneflow/nn/graph/block_config.py | 54 +- python/oneflow/nn/graph/graph.py | 58 +- python/oneflow/nn/graph/graph_config.py | 10 + python/oneflow/nn/graph/util.py | 221 ++++++-- python/oneflow/nn/modules/fused_mlp.py | 12 +- python/oneflow/nn/modules/loss.py | 2 +- python/oneflow/nn/modules/slice.py | 38 +- python/oneflow/nn/optimizer/polynomial_lr.py | 14 +- python/oneflow/test/README.md | 503 +++++++++--------- python/oneflow/test/exceptions/test_device.py | 5 +- .../test_local_global_convert_error.py | 2 +- python/oneflow/test/exceptions/test_mv.py | 50 ++ .../oneflow/test/expensive/test_id_shuffle.py | 6 +- .../oneflow/test/expensive/test_tensor_str.py | 30 ++ python/oneflow/test/gen_ops_process.py | 2 - python/oneflow/test/graph/test_comb2d.py | 4 +- .../test/graph/test_graph_linear_train.py | 1 + .../test/graph/test_graph_lr_scheduler.py | 4 +- python/oneflow/test/graph/test_graph_lrs.py | 2 +- .../test/graph/test_graph_ofrecord_reader.py | 3 - python/oneflow/test/graph/test_graph_zero.py | 9 +- .../test/graph/test_nccl_logical_send_recv.py | 90 +++- .../modules/test_consistent_adaptive_pool.py | 6 +- .../test/modules/test_consistent_mv.py | 39 ++ .../test/modules/test_consistent_rnn_cell.py | 8 +- .../test/modules/test_consistent_slice.py | 49 +- ...ign.py => test_consistent_slice_update.py} | 36 +- ...t_consistent_stateful_kernel_with_cache.py | 27 +- .../test/modules/test_consistent_tensordot.py | 2 +- .../test/modules/test_consistent_var.py | 26 +- .../test_fused_dot_feature_interaction.py | 2 +- .../modules/test_fused_scale_mask_softmax.py | 15 +- .../test_fused_scale_mask_softmax_dropout.py | 21 +- python/oneflow/test/modules/test_hsplit.py | 6 +- python/oneflow/test/modules/test_matmul.py | 13 + python/oneflow/test/modules/test_nll_loss.py | 134 +++++ .../test/modules/test_repeat_interleave.py | 26 +- python/oneflow/test/modules/test_slice.py | 167 +++--- .../test_stateful_kernel_with_cache.py | 4 +- .../oneflow/test/tensor/test_tensor_part_1.py | 27 +- .../test/tensor/test_tensor_pin_memory.py | 11 + .../automated_test_util/profiler.py | 4 +- .../torch_flow_dual_object.py | 148 ++++-- python/oneflow/utils/data/dataloader.py | 17 +- 537 files changed, 8549 insertions(+), 7677 deletions(-) create mode 100644 cmake/caches/cn/fast/mlir-cpu.cmake delete mode 100644 oneflow/api/python/vm/id_generator.cpp delete mode 100644 oneflow/core/autograd/gradient_funcs/logical_slice.cpp rename oneflow/core/{vm/stream_runtime_desc.cpp => common/array_ref.h} (70%) rename oneflow/core/{eager/cpu_blob_instruction_type.cpp => common/singleton_ptr.h} (55%) create mode 100644 oneflow/core/common/steady_vector.h rename oneflow/core/{vm/stream_desc.cpp => common/steady_vector_test.cpp} (51%) delete mode 100644 
oneflow/core/eager/cpu_opkernel_instruction_type.cpp rename oneflow/core/eager/{critical_section_instruction_type.cpp => critical_section_instruction_type.h} (92%) delete mode 100644 oneflow/core/eager/cuda_blob_instruction_type.cpp delete mode 100644 oneflow/core/eager/cuda_opkernel_instruction_type.cpp rename oneflow/core/eager/{lazy_job_instruction_type.cpp => lazy_job_instruction_type.h} (93%) rename oneflow/core/eager/{opkernel_instruction_type.cpp => op_call_instruction_type.cpp} (72%) rename oneflow/core/eager/{opkernel_instruction_type.h => op_call_instruction_type.h} (70%) rename oneflow/core/eager/{local_call_opkernel_phy_instr_operand.cpp => op_call_phy_instr_operand.cpp} (78%) rename oneflow/core/eager/{local_call_opkernel_phy_instr_operand.h => op_call_phy_instr_operand.h} (78%) rename oneflow/core/eager/{release_tensor_instruction_type.cpp => release_tensor_instruction_type.h} (53%) delete mode 100644 oneflow/core/framework/stream_get_call_instruction_name.h delete mode 100644 oneflow/core/framework/stream_get_release_instruction_name.h create mode 100644 oneflow/core/framework/stream_get_stream_role_name.h create mode 100644 oneflow/core/framework/stream_mgr.cpp create mode 100644 oneflow/core/framework/stream_mgr.h create mode 100644 oneflow/core/framework/stream_on_independent_thread.h create mode 100644 oneflow/core/graph/straighten_nodes.cpp rename oneflow/{ir/include/OneFlow/Conversion/SCFToGPU.h => core/graph/straighten_nodes.h} (68%) create mode 100644 oneflow/core/vm/barrier_instruction_type.h rename oneflow/core/{eager => vm}/critical_section_status_querier.h (91%) rename oneflow/core/{eager => vm}/critical_section_stream_type.cpp (75%) rename oneflow/core/{eager => vm}/critical_section_stream_type.h (80%) rename oneflow/core/vm/{async_cuda_stream_type.cpp => event_recorded_cuda_stream_type.cpp} (60%) rename oneflow/core/vm/{async_cuda_stream_type.h => event_recorded_cuda_stream_type.h} (75%) rename oneflow/core/vm/{fuse_instruction_type.cpp => fuse_instruction_type.h} (58%) delete mode 100644 oneflow/core/vm/id_generator.cpp delete mode 100644 oneflow/core/vm/id_generator.h delete mode 100644 oneflow/core/vm/id_util.cpp delete mode 100644 oneflow/core/vm/id_util.h delete mode 100644 oneflow/core/vm/instr_type_id.h delete mode 100644 oneflow/core/vm/instruction.proto rename oneflow/core/{eager => vm}/lazy_job_device_context.h (93%) rename oneflow/core/{eager => vm}/lazy_job_stream_type.cpp (75%) rename oneflow/core/{eager => vm}/lazy_job_stream_type.h (81%) delete mode 100644 oneflow/core/vm/runtime_instr_type_id.h delete mode 100644 oneflow/core/vm/sequential_instruction_type.cpp delete mode 100644 oneflow/core/vm/stream_desc.h create mode 100644 oneflow/core/vm/stream_get_stream_type.h delete mode 100644 oneflow/core/vm/stream_runtime_desc.h delete mode 100644 oneflow/core/vm/vm_desc.cpp delete mode 100644 oneflow/core/vm/vm_desc.h delete mode 100644 oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp rename oneflow/ir/test/OneFlow/{folding => with_cuda}/test_conv_bn_auto_nhwc.py (78%) create mode 100644 oneflow/user/kernels/fused_scale_mask_softmax.cuh delete mode 100644 oneflow/user/kernels/nll_kernel.cu create mode 100644 oneflow/user/kernels/nll_kernel_util.cpp create mode 100644 oneflow/user/kernels/nll_kernel_util.cu create mode 100644 oneflow/user/kernels/nll_kernel_util.h rename oneflow/user/kernels/{stateful_local_opkernel.cpp => stateful_opkernel.cpp} (96%) rename oneflow/user/kernels/{stateful_local_opkernel.h => stateful_opkernel.h} (94%) create mode 100644 
python/oneflow/test/exceptions/test_mv.py create mode 100644 python/oneflow/test/modules/test_consistent_mv.py rename python/oneflow/test/modules/{test_consistent_slice_assign.py => test_consistent_slice_update.py} (75%) create mode 100644 python/oneflow/test/modules/test_nll_loss.py diff --git a/.github/workflows/canary.yml b/.github/workflows/canary.yml index 1748ad13400..f39b16d050e 100644 --- a/.github/workflows/canary.yml +++ b/.github/workflows/canary.yml @@ -55,7 +55,7 @@ jobs: - name: Checkout Oneflow-Inc/oneflow if: ${{ github.event.inputs.oneflow-ref == '' }} uses: actions/checkout@v2 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build manylinux id: build-cuda with: diff --git a/.github/workflows/on_merge.yml b/.github/workflows/on_merge.yml index e94459e07f7..6085a59da77 100644 --- a/.github/workflows/on_merge.yml +++ b/.github/workflows/on_merge.yml @@ -15,6 +15,6 @@ jobs: if: github.event.pull_request.merged == true runs-on: ubuntu-latest steps: - - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@support-iree-ci name: Update benchmark history timeout-minutes: 10 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 19743c9a0d8..1e4112a28ba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,7 +33,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-iree-ci name: find cache id: find-cache timeout-minutes: 5 @@ -45,7 +45,7 @@ jobs: release oneflow-src: ${{ env.ONEFLOW_SRC }} entries: | - cu115 + cu116 cu112 cu102 cpu @@ -74,7 +74,7 @@ jobs: python3 -m pip install -U pip setuptools wheel --user python3 -m pip install oss2 --user - uses: actions/checkout@v2 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build ${{ matrix.entry }} if: ${{ matrix.entry !='cpu' }} with: @@ -98,7 +98,7 @@ jobs: 3.8 3.9 3.10 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build ${{ matrix.entry }} if: ${{ matrix.entry =='cpu' }} with: diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index eeec34cef05..1b2064f1a61 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -245,7 +245,7 @@ jobs: repository: Oneflow-Inc/conda-env ref: 30a7f00eb48ee9009d85a848e720823e5054c66b path: conda-env - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build with gcc7 if: ${{ matrix.build-type == 'gcc7'}} with: @@ -254,7 +254,7 @@ jobs: oneflow-build-env: conda conda-env-file: conda-env/dev/gcc7/environment-v2.yml conda-env-name: oneflow-dev-gcc7-v2 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build with clang10 if: ${{ matrix.build-type == 'clang10'}} with: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 213d0246ebf..2a826896162 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,6 +16,8 @@ env: FLOW_VISION_COMMIT: ca8ebc663b58667cf8cd1b6ef0c861522780b7bb LIBAI_SRC: libai 
LIBAI_COMMIT: 7d31d9781e5f2d559dc0820f599e0bed798488ca + ONEFLOW_IREE_SRC: oneflow_iree + ONEFLOW_IREE_COMMIT: 4322cbad2545877b1664aa8e0f17a17f6b5f687c TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.10.0-cuda11.3-cudnn8-runtime:afaf913e02a4ba02db92260daee22f99121cef62 MLIR_DOCKER_ARGS: "-e ONEFLOW_MLIR_ENABLE_ROUND_TRIP=1 -e ONEFLOW_MLIR_PREFER_NHWC=0 -e ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION=1" @@ -25,7 +27,7 @@ jobs: runs-on: ubuntu-latest if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot') steps: - - uses: Oneflow-Inc/get-oneflow/priority-pr@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/priority-pr@support-iree-ci name: Check priority PR closed id: save-cache timeout-minutes: 5 @@ -159,7 +161,7 @@ jobs: fi echo "is_secrets_accessible=1" >> $GITHUB_ENV - name: Wait for GPU slot - uses: Oneflow-Inc/get-oneflow/wait-for-gpu@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/wait-for-gpu@support-iree-ci if: env.is_secrets_accessible == '1' timeout-minutes: 90 continue-on-error: true @@ -183,7 +185,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-iree-ci name: find cache id: find-cache timeout-minutes: 5 @@ -230,7 +232,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -244,13 +246,14 @@ jobs: run: | echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit" exit 1 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build manylinux ${{ matrix.entry }} id: build-cpu if: ${{ matrix.entry =='cpu' && !matrix.cache-hit }} with: cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cpu.cmake build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build.sh + run-lit: true oneflow-src: ${{ env.ONEFLOW_SRC }} oneflow-build-env: manylinux wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }} @@ -265,7 +268,7 @@ jobs: python-versions: | 3.6 3.7 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build manylinux ${{ matrix.entry }} id: build-cuda if: ${{ matrix.entry =='cu102' && !matrix.cache-hit }} @@ -285,7 +288,7 @@ jobs: clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }} python-versions: | 3.7 - - uses: Oneflow-Inc/get-oneflow@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow@support-iree-ci name: Build ${{ matrix.entry }} if: ${{ matrix.entry == 'llvm13' && !matrix.cache-hit }} with: @@ -324,7 +327,7 @@ jobs: }) - name: Upload packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }} - uses: Oneflow-Inc/get-oneflow/digest/upload@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/digest/upload@support-iree-ci timeout-minutes: 10 with: digest: ${{ steps.save-cache.outputs.build-digest }} @@ -335,7 +338,7 @@ jobs: dst-dir: cpack - 
name: Upload whl if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }} - uses: Oneflow-Inc/get-oneflow/digest/upload@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/digest/upload@support-iree-ci timeout-minutes: 10 with: digest: ${{ steps.save-cache.outputs.build-digest }} @@ -360,7 +363,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-iree-ci name: find cache id: find-cache timeout-minutes: 5 @@ -391,7 +394,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-iree-ci name: find cache id: find-cache timeout-minutes: 5 @@ -455,12 +458,20 @@ jobs: # please use a commit here ref: ${{ env.LIBAI_COMMIT}} path: ${{ env.LIBAI_SRC}} + - name: Checkout Oneflow-Inc/oneflow_iree + if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} + uses: actions/checkout@v2 + with: + repository: Oneflow-Inc/oneflow_iree + # please use a commit here + ref: ${{ env.ONEFLOW_IREE_COMMIT}} + path: ${{ env.ONEFLOW_IREE_SRC}} - name: Remove container timeout-minutes: 45 if: ${{ contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -476,7 +487,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/digest/download@support-iree-ci id: download-digest timeout-minutes: 10 with: @@ -486,7 +497,7 @@ jobs: ssh-tank-path: ${{ env.SSH_TANK_PATH }} - name: Get primary node if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/master-address@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/master-address@support-iree-ci id: get-primary-node with: rank: ${{ matrix.rank }} @@ -559,6 +570,7 @@ jobs: docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}} docker exec ${TEST_CONTAINER_NAME} python3 -m pip install pybind11 --user docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.LIBAI_SRC}} + docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.ONEFLOW_IREE_SRC}} - name: Module API test (distributed) timeout-minutes: 90 if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'module' && matrix.device == 'cuda' && fromJson(matrix.is-distributed) }} @@ -648,12 +660,20 @@ jobs: # please use a commit here ref: ${{ env.LIBAI_COMMIT}} path: ${{ env.LIBAI_SRC}} + - name: Checkout Oneflow-Inc/oneflow_iree + if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} + uses: actions/checkout@v2 + with: + repository: Oneflow-Inc/oneflow_iree + # please use a commit here + ref: ${{ env.ONEFLOW_IREE_COMMIT}} + path: ${{ env.ONEFLOW_IREE_SRC}} - name: Remove container timeout-minutes: 45 if: ${{ 
contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -669,7 +689,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/digest/download@support-iree-ci id: download-digest timeout-minutes: 10 with: @@ -781,6 +801,7 @@ jobs: docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}} docker exec ${TEST_CONTAINER_NAME} python3 -m pip install pybind11 --user docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.LIBAI_SRC}} + docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.ONEFLOW_IREE_SRC}} - name: Run OneFlow doctor if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} run: | @@ -865,7 +886,7 @@ jobs: body: "
<details>\n<summary>Speed stats:</summary>\n\n ``` \n${{ steps.speed.outputs.stats }}\n ``` \n\n</details>
".replace(/\\n/g, '\n') }) - name: Module API test - timeout-minutes: 45 + timeout-minutes: 60 if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'module' && !fromJson(matrix.is-distributed) }} run: | docker exec -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/modules ${{ env.TEST_CONTAINER_NAME }} bash ci/test/generic_test_multi_client.sh @@ -883,6 +904,11 @@ jobs: docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.LIBAI_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/models/test_gpt.py docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.LIBAI_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/models/test_t5.py docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.LIBAI_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/models/test_vit.py + - name: oneflow_iree test + timeout-minutes: 45 + if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }} + run: | + docker exec -w $PWD/${{ env.ONEFLOW_IREE_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m pytest examples - name: Expensive tests (models, cases require exclusive access to GPU) timeout-minutes: 45 if: ${{ !fromJson(matrix.cache-hit) && (matrix.test-type == 'speed-test' || (matrix.test-type == 'misc' && matrix.device == 'cpu')) && !fromJson(matrix.is-distributed) }} @@ -908,7 +934,7 @@ jobs: - name: Benchmark Test timeout-minutes: 100 if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }} - uses: Oneflow-Inc/get-oneflow/pytest-benchmark@single-matrix-for-efficiency + uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-iree-ci with: collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark container-name: ${{ env.TEST_CONTAINER_NAME }} @@ -961,7 +987,7 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} fetch-depth: 0 - - uses: Oneflow-Inc/get-oneflow/cache-complete@single-matrix-for-efficiency + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci name: Save cache if successful id: save-cache timeout-minutes: 5 diff --git a/ci/manylinux/build-gcc7.sh b/ci/manylinux/build-gcc7.sh index f9deb933083..42244968a0e 100644 --- a/ci/manylinux/build-gcc7.sh +++ b/ci/manylinux/build-gcc7.sh @@ -31,6 +31,11 @@ cmake -S ${ONEFLOW_CI_SRC_DIR} -C ${ONEFLOW_CI_CMAKE_INIT_CACHE} -DPython3_EXECU # cmake build cd ${ONEFLOW_CI_BUILD_DIR} cmake --build . --parallel ${ONEFLOW_CI_BUILD_PARALLEL} +if [ ! -z "$ONEFLOW_CI_BUILD_RUN_LIT" ]; then + ${ONEFLOW_CI_PYTHON_EXE} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user flowvision==0.1.0 + export PATH=$PATH:$(dirname $ONEFLOW_CI_PYTHON_EXE) + cmake --build . -t c1 +fi # build pip cd ${ONEFLOW_CI_SRC_DIR} diff --git a/ci/manylinux/build.sh b/ci/manylinux/build.sh index 5ce5c448355..263a6fb5194 100644 --- a/ci/manylinux/build.sh +++ b/ci/manylinux/build.sh @@ -27,6 +27,11 @@ cmake -S ${ONEFLOW_CI_SRC_DIR} -C ${ONEFLOW_CI_CMAKE_INIT_CACHE} -DPython3_EXECU # cmake build cd ${ONEFLOW_CI_BUILD_DIR} cmake --build . --parallel ${ONEFLOW_CI_BUILD_PARALLEL} +if [ ! -z "$ONEFLOW_CI_BUILD_RUN_LIT" ]; then + ${ONEFLOW_CI_PYTHON_EXE} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user flowvision==0.1.0 + export PATH=$PATH:$(dirname $ONEFLOW_CI_PYTHON_EXE) + cmake --build . 
-t c1 +fi # build pip cd ${ONEFLOW_CI_SRC_DIR} diff --git a/cmake/caches/cn/fast/mlir-cpu.cmake b/cmake/caches/cn/fast/mlir-cpu.cmake new file mode 100644 index 00000000000..7c7351e65ef --- /dev/null +++ b/cmake/caches/cn/fast/mlir-cpu.cmake @@ -0,0 +1,24 @@ +set(BUILD_SHARED_LIBS YES CACHE BOOL "") +# uncomment only if you know what you are doing +# set(CMAKE_LINK_DEPENDS_NO_SHARED YES CACHE BOOL "") +set(BUILD_CUDA NO CACHE BOOL "") +set(BUILD_GIT_VERSION NO CACHE BOOL "") +set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "") +set(BUILD_HWLOC NO CACHE BOOL "") +set(BUILD_TESTING OFF CACHE BOOL "") +set(WITH_MLIR YES CACHE BOOL "") +set(WITH_MLIR_CUDA_CODEGEN NO CACHE BOOL "") +set(THIRD_PARTY_MIRROR aliyun CACHE STRING "") +set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "") +set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") +set(CMAKE_GENERATOR Ninja CACHE STRING "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF CACHE BOOL "") +set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "") +set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "") +set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld" CACHE STRING "") +set(CPU_THREADING_RUNTIME SEQ CACHE STRING + "when using lld with TBB enabled, there will be linkage error") +set(BUILD_HWLOC OFF CACHE BOOL "") +set(WITH_ONEDNN OFF CACHE BOOL "") diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 205224541a2..0176468ccd6 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -184,13 +184,7 @@ relative_protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROJECT_SOURCE_DIR} ${of_ oneflow_add_library(of_protoobj SHARED ${PROTO_SRCS} ${PROTO_HDRS}) add_dependencies(of_protoobj make_pyproto_dir protobuf) - -if(BUILD_SHARED_LIBS) - target_link_libraries(of_protoobj protobuf_imported) -else() - # For some unknown reasons, when building static libraries, we have to link of_protoobj with oneflow_third_party_libs - target_link_libraries(of_protoobj ${oneflow_third_party_libs}) -endif() +target_link_libraries(of_protoobj protobuf_imported) include(functional) generate_functional_api_and_pybind11_cpp(FUNCTIONAL_GENERATED_SRCS FUNCTIONAL_GENERATED_HRCS @@ -256,18 +250,21 @@ if("${LLVM_MONO_REPO_URL}" STREQUAL "https://github.com/llvm/llvm-project/archive/7eaa84eac3ba935d13f4267d3d533a6c3e1283ed.zip" OR "${LLVM_MONO_REPO_URL}" STREQUAL "https://github.com/llvm/llvm-project/archive/35e60f5de180aea55ed478298f4b40f04dcc57d1.zip" + OR "${LLVM_MONO_REPO_URL}" STREQUAL + "https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip" OR "${LLVM_MONO_REPO_MD5}" STREQUAL "f2f17229cf21049663b8ef4f2b6b8062" OR "${LLVM_MONO_REPO_MD5}" STREQUAL "6b7c6506d5922de9632c8ff012b2f945" OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e0ea669a9f0872d35bffda5ec6c5ac6f" + OR "${LLVM_MONO_REPO_MD5}" STREQUAL "241a333828bba1efa35aff4c4fc2ce87" OR "${LLVM_MONO_REPO_MD5}" STREQUAL "075fbfdf06cb3f02373ea44971af7b03") unset(LLVM_MONO_REPO_URL CACHE) unset(LLVM_MONO_REPO_MD5 CACHE) endif() set(LLVM_MONO_REPO_URL - "https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip" + "https://github.com/llvm/llvm-project/archive/32805e60c9de1f82887cd2af30d247dcabd2e1d3.zip" CACHE STRING "") use_mirror(VARIABLE LLVM_MONO_REPO_URL URL ${LLVM_MONO_REPO_URL}) -set(LLVM_MONO_REPO_MD5 "241a333828bba1efa35aff4c4fc2ce87" CACHE STRING "") +set(LLVM_MONO_REPO_MD5 "e412dc61159b5e929b0c94e44b11feb2" 
CACHE STRING "") set(ONEFLOW_BUILD_ROOT_DIR "${PROJECT_BINARY_DIR}") add_subdirectory(${PROJECT_SOURCE_DIR}/oneflow/ir) if(WITH_MLIR) @@ -292,11 +289,8 @@ list(APPEND oneflow_third_party_libs LLVMSupportWithHeader) include(op_schema) -get_property(EXTERNAL_INCLUDE_DIRS GLOBAL PROPERTY EXTERNAL_INCLUDE_DIRS) get_property(EXTERNAL_TARGETS GLOBAL PROPERTY EXTERNAL_TARGETS) -target_include_directories(oneflow PRIVATE ${EXTERNAL_INCLUDE_DIRS}) - if(APPLE) set(of_libs -Wl,-force_load oneflow of_op_schema) target_link_libraries(oneflow of_protoobj of_functional_obj ${oneflow_third_party_libs}) diff --git a/cmake/util.cmake b/cmake/util.cmake index a69128f416e..4ab55d6bb55 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -287,3 +287,13 @@ function(checkDirAndAppendSlash) endif() endfunction() + +function(mark_targets_as_system) + # TODO(daquexian): update this function once https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7308 + # and its following PRs are merged in cmake v3.25. + foreach(target ${ARGV}) + get_target_property(include_dir ${target} INTERFACE_INCLUDE_DIRECTORIES) + set_target_properties(${target} PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES + "${include_dir}") + endforeach() +endfunction() diff --git a/docs/source/env.rst b/docs/source/env.rst index fdf298b8578..3738f0a67c5 100644 --- a/docs/source/env.rst +++ b/docs/source/env.rst @@ -8,3 +8,5 @@ Environment .. autofunction:: oneflow.env.get_rank .. autofunction:: oneflow.env.get_local_rank .. autofunction:: oneflow.env.get_node_size +.. autofunction:: oneflow.env.init_rdma +.. autofunction:: oneflow.env.rdma_is_initialized diff --git a/docs/source/graph.rst b/docs/source/graph.rst index 270e5a01cf0..c2e6f340c00 100644 --- a/docs/source/graph.rst +++ b/docs/source/graph.rst @@ -26,12 +26,14 @@ Base class for running neural networks in Static Graph Mode. allow_fuse_cast_scale, set_gradient_accumulation_steps, enable_cudnn_conv_heuristic_search_algo, + disable_straighten_algorithm, :member-order: bysource .. 
autoclass:: oneflow.nn.graph.block_config.BlockConfig :members: stage_id, + set_stage, activation_checkpointing, :member-order: bysource diff --git a/docs/source/oneflow.rst b/docs/source/oneflow.rst index 3550065a960..39729b8c6e3 100644 --- a/docs/source/oneflow.rst +++ b/docs/source/oneflow.rst @@ -92,6 +92,7 @@ oneflow masked_fill, masked_select, matmul, + mv, narrow, max, mean, @@ -136,7 +137,7 @@ oneflow selu, silu, slice, - logical_slice, + slice_update, softsign, sort, softplus, diff --git a/docs/source/tensor.rst b/docs/source/tensor.rst index 753abcd6889..a8a305ac9c8 100644 --- a/docs/source/tensor.rst +++ b/docs/source/tensor.rst @@ -104,6 +104,7 @@ OneFlow Tensor Class masked_fill, masked_select, matmul, + mv, max, mean, min, @@ -193,4 +194,5 @@ OneFlow Tensor Class zero_, nms, pin_memory, + is_pinned, diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index db603be09b6..4d5f3fae257 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -1,16 +1,12 @@ set(EXTERNAL_TARGETS) -set(EXTERNAL_INCLUDE_DIRS) if (CPU_THREADING_RUNTIME STREQUAL "TBB") add_subdirectory(onetbb) - get_property(TBB_INCLUDE_DIRS GLOBAL PROPERTY TBB_INCLUDE_DIRS) - list(APPEND EXTERNAL_INCLUDE_DIRS ${TBB_INCLUDE_DIRS}) list(APPEND EXTERNAL_TARGETS tbb) endif() add_subdirectory(robin-hood-hashing) -get_property(ROBIN_HOOD_HASHING_INCLUDE_DIR GLOBAL PROPERTY ROBIN_HOOD_HASHING_INCLUDE_DIR) -list(APPEND EXTERNAL_INCLUDE_DIRS ${ROBIN_HOOD_HASHING_INCLUDE_DIR}) +list(APPEND EXTERNAL_TARGETS robin_hood) +mark_targets_as_system(${EXTERNAL_TARGETS}) set_property(GLOBAL PROPERTY EXTERNAL_TARGETS ${EXTERNAL_TARGETS}) -set_property(GLOBAL PROPERTY EXTERNAL_INCLUDE_DIRS ${EXTERNAL_INCLUDE_DIRS}) diff --git a/external/onetbb/CMakeLists.txt b/external/onetbb/CMakeLists.txt index 6d83773e58f..399fab32256 100644 --- a/external/onetbb/CMakeLists.txt +++ b/external/onetbb/CMakeLists.txt @@ -15,7 +15,6 @@ set(BUILD_SHARED_LIBS ON) set(CMAKE_POLICY_DEFAULT_CMP0079 NEW) FetchContent_MakeAvailable(tbb) -set_property(GLOBAL PROPERTY TBB_INCLUDE_DIRS "${tbb_SOURCE_DIR}/include") install(TARGETS tbb tbbmalloc tbbmalloc_proxy COMPONENT OneFlowTBB) install(DIRECTORY ${tbb_SOURCE_DIR}/include DESTINATION ${ONETBB_INSTALL_DIR} COMPONENT OneFlowTBB) diff --git a/external/robin-hood-hashing/CMakeLists.txt b/external/robin-hood-hashing/CMakeLists.txt index e079ad6b36f..d60277a1a1f 100644 --- a/external/robin-hood-hashing/CMakeLists.txt +++ b/external/robin-hood-hashing/CMakeLists.txt @@ -1,14 +1,7 @@ include(FetchContent) FetchContent_Declare( robin_hood_hashing + URL ${ROBIN_HOOD_HASHING_URL} + URL_HASH MD5=${ROBIN_HOOD_HASHING_MD5} ) -FetchContent_GetProperties(robin_hood_hashing) - -if(NOT robin_hood_hashing_POPULATED) - FetchContent_Populate(robin_hood_hashing - URL ${ROBIN_HOOD_HASHING_URL} - URL_HASH MD5=${ROBIN_HOOD_HASHING_MD5} - ) -endif() - -set_property(GLOBAL PROPERTY ROBIN_HOOD_HASHING_INCLUDE_DIR "${robin_hood_hashing_SOURCE_DIR}/src/include") +FetchContent_MakeAvailable(robin_hood_hashing) diff --git a/oneflow/api/common/sbp.h b/oneflow/api/common/sbp.h index e20878f32aa..423c92a1633 100644 --- a/oneflow/api/common/sbp.h +++ b/oneflow/api/common/sbp.h @@ -33,7 +33,7 @@ inline Maybe SbpToString(Symbol sbp_sym) { } else if (sbp_sym->has_partial_sum_parallel()) { sbp_str += "partial_sum"; } else if (sbp_sym->has_split_parallel()) { - sbp_str += "split(axis=" + std::to_string(sbp_sym->split_parallel().axis()) + ")"; + sbp_str += "split(dim=" + std::to_string(sbp_sym->split_parallel().axis()) + ")"; } else 
{ UNIMPLEMENTED_THEN_RETURN(); } diff --git a/oneflow/api/python/env/env.cpp b/oneflow/api/python/env/env.cpp index 7d539fd8098..5af31528c63 100644 --- a/oneflow/api/python/env/env.cpp +++ b/oneflow/api/python/env/env.cpp @@ -55,6 +55,8 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { m.def("GetWorldSize", &GetWorldSize); m.def("GetNodeSize", &GetNodeSize); m.def("GetLocalRank", &GetLocalRank); + m.def("InitRDMA", &InitRDMA); + m.def("RDMAIsInitialized", &RDMAIsInitialized); m.def("CudaGetDeviceCount", &CudaGetDeviceCount); #ifdef WITH_CUDA m.def("GetCudaDeviceIndex", &GetCudaDeviceIndex); diff --git a/oneflow/api/python/framework/dtype.cpp b/oneflow/api/python/framework/dtype.cpp index b09cc6d21d5..d6588832904 100644 --- a/oneflow/api/python/framework/dtype.cpp +++ b/oneflow/api/python/framework/dtype.cpp @@ -38,7 +38,10 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { [](int t) { // __setstate__ return CHECK_JUST(DType::Get(DataType(t))); })) - .def_property_readonly("bytes", [](const Symbol& dtype) { return dtype->bytes(); }); + .def_property_readonly("bytes", [](const Symbol& dtype) { return dtype->bytes(); }) + .def("get", [](const int data_type_enum) { + return CHECK_JUST(DType::Get(static_cast(data_type_enum))); + }); m.attr("bool") = &CHECK_JUST(DType::Get(DataType::kBool)); m.attr("char") = &CHECK_JUST(DType::Get(DataType::kChar)); diff --git a/oneflow/api/python/framework/nn_graph.cpp b/oneflow/api/python/framework/nn_graph.cpp index 9e0c939b3e2..aa78605dab0 100644 --- a/oneflow/api/python/framework/nn_graph.cpp +++ b/oneflow/api/python/framework/nn_graph.cpp @@ -41,6 +41,11 @@ Maybe APINNGraphAdditionalVarTensors(const std::shared_ptr& py::list tensor_list = py::cast(tensors); return py::cast(tensor_list); } + +Maybe APINNGraphGetCurrentSerializedJob(const std::shared_ptr& graph) { + const auto job = graph->job(); + return py::bytes(job.SerializeAsString()); +} } // namespace ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) { @@ -75,7 +80,8 @@ ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) { &NNGraph::RegisterAdditionalVarOpNamesAndTensorsToBeLoaded) .def_property_readonly("additional_var_names", &APINNGraphAdditionalVarNames) .def_property_readonly("additional_var_tensors", &APINNGraphAdditionalVarTensors) - .def("complie_and_init_runtime", &NNGraph::CompileAndInitRuntime); + .def("complie_and_init_runtime", &NNGraph::CompileAndInitRuntime) + .def("get_current_job_str", &APINNGraphGetCurrentSerializedJob); m.def("RunLazyNNGraph", &RunLazyNNGraph); m.def("SoftSyncNNGraphBuffers", &SoftSyncNNGraphBuffers); diff --git a/oneflow/api/python/framework/tensor.cpp b/oneflow/api/python/framework/tensor.cpp index 71120182894..0ddd612b698 100644 --- a/oneflow/api/python/framework/tensor.cpp +++ b/oneflow/api/python/framework/tensor.cpp @@ -125,26 +125,17 @@ static PyObject* PyTensorObject_subscript(PyObject* self, PyObject* item) { END_HANDLE_ERRORS } -static int PyTensorObject_ass_subscript(PyObject* self, PyObject* item, PyObject* value) { - HANDLE_ERRORS - const auto& p = PyTensor_Unpack(self); - const auto& v = PyTensor_Unpack(value); - functional::PythonArg arg(item); - ASSERT(functional::TensorSetItem(p, arg.As(), v)); - return 0; - END_HANDLE_ERRORS_RET(-1) -} - static PySequenceMethods PyTensorObject_as_sequence = { (lenfunc)PyTensorObject_length, NULL, /*sq_concat*/ NULL, /*sq_repeat*/ (ssizeargfunc)PyTensorObject_getitem, /*sq_item*/ }; +extern int PyTensorObject_setitem(PyObject*, PyObject*, PyObject*); static PyMappingMethods PyTensorObject_as_mapping = { (lenfunc)PyTensorObject_length, 
(binaryfunc)PyTensorObject_subscript, - (objobjargproc)PyTensorObject_ass_subscript, + (objobjargproc)PyTensorObject_setitem, }; static PyObject* PyTensorObject_storage_offset(PyObject* self, PyObject* unused) { @@ -156,9 +147,9 @@ static PyObject* PyTensorObject_storage_offset(PyObject* self, PyObject* unused) static PyObject* PyTensorObject_stride(PyObject* self, PyObject* unused) { HANDLE_ERRORS const auto& stride = ASSERT_PTR(PyTensor_Unpack(self)->stride()); - PyObject* tup = PyTuple_New(stride->NumAxes()); - for (int i = 0; i < stride->NumAxes(); ++i) { - PyTuple_SetItem(tup, i, PyLong_FromUnsignedLong(stride->At(i))); + PyObject* tup = PyTuple_New(stride->size()); + for (int i = 0; i < stride->size(); ++i) { + PyTuple_SetItem(tup, i, PyLong_FromUnsignedLong(stride->at(i))); } return tup; END_HANDLE_ERRORS @@ -189,6 +180,12 @@ static PyObject* PyTensorObject_pin_memory(PyObject* self, PyObject* unused) { END_HANDLE_ERRORS } +static PyObject* PyTensorObject_is_pinned(PyObject* self, PyObject* unused) { + HANDLE_ERRORS + return functional::CastToPyObject(CHECK_JUST(PyTensor_Unpack(self)->is_pinned())); + END_HANDLE_ERRORS +} + static PyObject* PyTensorObject_requires_grad_(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_ERRORS int requires_grad = 1; @@ -390,6 +387,7 @@ static PyMethodDef PyTensorObject_methods[] = { {"contiguous", PyTensorObject_contiguous, METH_NOARGS, NULL}, {"contiguous_", PyTensorObject_contiguous_, METH_NOARGS, NULL}, {"pin_memory", PyTensorObject_pin_memory, METH_NOARGS, NULL}, + {"is_pinned", PyTensorObject_is_pinned, METH_NOARGS, NULL}, {"requires_grad_", (PyCFunction)PyTensorObject_requires_grad_, METH_VARARGS | METH_KEYWORDS, NULL}, {"retain_grad", PyTensorObject_retain_grad, METH_NOARGS, NULL}, diff --git a/oneflow/api/python/framework/tensor_functions.cpp b/oneflow/api/python/framework/tensor_functions.cpp index 2dbfd4a3a02..f74050debf7 100644 --- a/oneflow/api/python/framework/tensor_functions.cpp +++ b/oneflow/api/python/framework/tensor_functions.cpp @@ -632,6 +632,168 @@ static PyObject* PyTensorObject_transpose(PyObject* self, PyObject* args, PyObje END_HANDLE_ERRORS } +static PyObject* PyTensorObject_local_to_global(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + auto tensor = PyTensor_Unpack(self); + CHECK_OR_THROW(tensor->is_local()) << Error::RuntimeError() << "input must be a local tensor"; + PyObject* placement_obj = Py_None; + PyObject* sbp_obj = Py_None; + bool check_meta = true; + static const char* keywords[4] = {"placement", "sbp", "check_meta", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OO$O!:local_to_global", + const_cast(keywords), &placement_obj, &sbp_obj, + &PyBool_Type, &check_meta)) { + return NULL; + }; + + CHECK_OR_THROW(placement_obj != Py_None && sbp_obj != Py_None) << Error::InvalidValueError( + "Converting a local tensor to global tensor must have placement and sbp parameters."); + CHECK_OR_THROW(functional::PyParallelDescCheck(placement_obj)) + << Error::TypeError() << "Invalid parameter placement with type " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(placement_obj))); + + std::vector> sbp; + if (functional::PySbpParallelCheck(sbp_obj)) { + sbp.emplace_back(functional::PyUnpackSbpParallel(sbp_obj)); + } else { + CHECK_OR_THROW(functional::PySbpParallelSequenceCheck(sbp_obj)) + << Error::TypeError() << "Invalid parameter sbp with type " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(sbp_obj))); + sbp = 
functional::PyUnpackSbpParallelSequence(sbp_obj); + } + return PyTensor_New(ASSERT_PTR(functional::ToConsistent( + tensor, functional::PyUnpackParallelDesc(placement_obj), sbp, {}, check_meta))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_global_to_global(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + auto tensor = PyTensor_Unpack(self); + CHECK_OR_THROW(tensor->is_consistent()) + << Error::RuntimeError() << "input must be a global tensor"; + PyObject* placement_obj = Py_None; + PyObject* sbp_obj = Py_None; + PyObject* grad_sbp_obj = Py_None; + Symbol placement; + std::vector> sbp; + std::vector> grad_sbp; + bool check_meta = false; + static const char* keywords[5] = {"placement", "sbp", "grad_sbp", "check_meta", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OO$OO!:global_to_global", + const_cast(keywords), &placement_obj, &sbp_obj, + &grad_sbp_obj, &PyBool_Type, &check_meta)) { + return NULL; + }; + + // sbp + CHECK_OR_THROW(sbp_obj == Py_None || functional::PySbpParallelCheck(sbp_obj) + || functional::PySbpParallelSequenceCheck(sbp_obj)) + << Error::TypeError() + << "sbp parameter must be type of oneflow.sbp.sbp or list/tuple of oneflow.sbp.sbp"; + if (functional::PySbpParallelCheck(sbp_obj)) { + sbp.emplace_back(functional::PyUnpackSbpParallel(sbp_obj)); + } else if (functional::PySbpParallelSequenceCheck(sbp_obj)) { + sbp = functional::PyUnpackSbpParallelSequence(sbp_obj); + } else { + for (int32_t i = 0; i < ASSERT(tensor->nd_sbp())->sbp_parallel_size(); i++) + sbp.emplace_back(ASSERT(tensor->nd_sbp())->sbp_parallel(i)); + } + + // placement + CHECK_OR_THROW(placement_obj == Py_None || functional::PyParallelDescCheck(placement_obj)) + << Error::TypeError() << "Invalid parameter placement with type " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(placement_obj))); + if (placement_obj == Py_None) { + placement = ASSERT(tensor->parallel_desc()); + } else { + placement = functional::PyUnpackParallelDesc(placement_obj); + } + + // grad_sbp + CHECK_OR_THROW(grad_sbp_obj == Py_None || functional::PySbpParallelCheck(grad_sbp_obj) + || functional::PySbpParallelSequenceCheck(grad_sbp_obj)) + << Error::TypeError() + << "grad_sbp parameter must be type of oneflow.sbp.sbp or list/tuple of oneflow.sbp.sbp"; + if (functional::PySbpParallelCheck(grad_sbp_obj)) { + grad_sbp.emplace_back(functional::PyUnpackSbpParallel(grad_sbp_obj)); + } else if (functional::PySbpParallelSequenceCheck(grad_sbp_obj)) { + grad_sbp = functional::PyUnpackSbpParallelSequence(grad_sbp_obj); + } + return PyTensor_New( + ASSERT_PTR(functional::ToConsistent(tensor, placement, sbp, grad_sbp, check_meta))); + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_to_global(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_ERRORS + const auto& tensor = PyTensor_Unpack(self); + PyObject* result = NULL; + if (tensor->is_consistent()) + result = PyTensorObject_global_to_global(self, args, kwargs); + else { + result = PyTensorObject_local_to_global(self, args, kwargs); + } + if (PyErr_Occurred()) { throw py::error_already_set(); } + return result; + + END_HANDLE_ERRORS +} + +static PyObject* PyTensorObject_to_local(PyObject* self, PyObject* unused) { + HANDLE_ERRORS + auto tensor = PyTensor_Unpack(self); + CHECK_OR_THROW(tensor->is_consistent()) + << Error::RuntimeError() << "Expected global tensor for to_local but got local tensor!"; + return PyTensor_New(ASSERT_PTR(functional::ConsistentToLocal(tensor))); + END_HANDLE_ERRORS +} + +int 
PyTensorObject_setitem(PyObject* self, PyObject* item, PyObject* value) { + HANDLE_ERRORS + auto tensor = PyTensor_Unpack(self); + std::shared_ptr value_tensor; + CHECK_OR_THROW(functional::PyTensorIndexCheck(item)) + << Error::TypeError() << "tensor_setitem(): argument 'index' must be index, not " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(item))); + CHECK_OR_THROW(functional::PyScalarCheck(value) || PyTensor_Check(value)) + << Error::TypeError() << "tensor_setitem(): argument 'value' must be tensor or scalar, not " + << functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(value))); + + if (tensor->is_consistent()) { + Symbol placement = ASSERT(tensor->parallel_desc()); + auto ndsbp = ASSERT(tensor->nd_sbp()); + std::vector> sbp(ndsbp->sbp_parallel_size(), + ASSERT(MakeBroadcastSbpParallel())); + if (functional::PyScalarCheck(value)) { + Scalar value_scalar = functional::PyUnpackScalar(value); + value_tensor = ASSERT_PTR( + functional::ConsistentConstant({1}, value_scalar, tensor->dtype(), placement, sbp)); + } else { + value_tensor = PyTensor_Unpack(value); + CHECK_OR_THROW(value_tensor->is_consistent()) + << Error::RuntimeError() + << "tensor_setitem(): value must be a global tensor when self is global"; + value_tensor = ASSERT_PTR(functional::ToConsistent(value_tensor, placement, sbp, {}, true)); + } + } else { + if (functional::PyScalarCheck(value)) { + Scalar value_scalar = functional::PyUnpackScalar(value); + value_tensor = ASSERT_PTR( + functional::Constant({1}, value_scalar, tensor->dtype(), ASSERT(tensor->device()))); + } else { + value_tensor = PyTensor_Unpack(value); + CHECK_OR_THROW(value_tensor->is_local()) + << Error::RuntimeError() + << "tensor_setitem(): value must be a local tensor when self is local"; + Optional> device = ASSERT(tensor->device()); + value_tensor = ASSERT_PTR(functional::To(value_tensor, device, value_tensor->dtype(), false)); + } + } + ASSERT(functional::TensorSetItem(tensor, functional::PyUnpackTensorIndex(item), value_tensor)); + return 0; + END_HANDLE_ERRORS_RET(-1) +} + PyMethodDef PyTensorObject_extra_methods[] = { {"byte", PyTensorObject_byte, METH_NOARGS, NULL}, {"size", (PyCFunction)PyTensorObject_size, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -655,6 +817,12 @@ PyMethodDef PyTensorObject_extra_methods[] = { {"half", PyTensorObject_half, METH_NOARGS, NULL}, {"float", PyTensorObject_float, METH_NOARGS, NULL}, {"double", PyTensorObject_double, METH_NOARGS, NULL}, + {"local_to_global", (PyCFunction)PyTensorObject_local_to_global, METH_VARARGS | METH_KEYWORDS, + NULL}, + {"global_to_global", (PyCFunction)PyTensorObject_global_to_global, METH_VARARGS | METH_KEYWORDS, + NULL}, + {"to_local", PyTensorObject_to_local, METH_NOARGS, NULL}, + {"to_global", (PyCFunction)PyTensorObject_to_global, METH_VARARGS | METH_KEYWORDS, NULL}, {"cpu", PyTensorObject_cpu, METH_NOARGS, NULL}, {"cuda", (PyCFunction)PyTensorObject_cuda, METH_VARARGS | METH_KEYWORDS, NULL}, {"var", (PyCFunction)PyTensorObject_var, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/oneflow/api/python/functional/tensor_api.cpp b/oneflow/api/python/functional/tensor_api.cpp index 974edc7edbc..8378daa6157 100644 --- a/oneflow/api/python/functional/tensor_api.cpp +++ b/oneflow/api/python/functional/tensor_api.cpp @@ -120,11 +120,9 @@ class TensorWithOtherCtorFunctor { Maybe operator()(const std::shared_ptr& other) const { // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now. 
LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); - bool pin_memory = false; - if (other->is_local()) { - pin_memory = JUST(JUST(other->AsMirroredTensor())->eager_blob_object())->pin_memory(); - } - return MakeTensorFromOtherTensor(other, pin_memory); + bool is_pinned = false; + if (other->is_local()) { is_pinned = JUST(CHECK_JUST(other->AsMirroredTensor())->is_pinned()); } + return MakeTensorFromOtherTensor(other, is_pinned); } }; @@ -145,9 +143,7 @@ class TensorWithDataCtorFunctor { if (PyTensor_Check(data)) { const auto& other = PyTensor_Unpack(data); const bool pin_memory = - other->is_local() - ? JUST(JUST(other->AsMirroredTensor())->eager_blob_object())->pin_memory() - : false; + other->is_local() ? JUST(JUST(other->AsMirroredTensor())->is_pinned()) : false; return MakeTensorFromOtherTensor(other, dtype, device, /*requires_grad=*/false, /*pin_memory=*/pin_memory); } @@ -255,17 +251,16 @@ class LocalTensorSharedNumpyDataFunctor { Symbol device = JUST(Device::New("cpu")); const npy_intp* stride_ptr = PyArray_STRIDES(array); // stride - auto strides_vec = DimVector(stride_ptr, stride_ptr + dim); + auto strides = std::make_shared(stride_ptr, stride_ptr + dim); auto element_size_in_bytes = PyArray_ITEMSIZE(array); // NumPy strides use bytes. OneFlow strides use element counts. - for (auto& stride : strides_vec) { - if (stride % element_size_in_bytes != 0) { + for (auto& stride_val : *strides) { + if (stride_val % element_size_in_bytes != 0) { return Error::RuntimeError() << "given numpy array strides not a multiple of the element " "byte size. Copy the numpy array to reallocate the memory."; } - stride /= element_size_in_bytes; + stride_val /= element_size_in_bytes; } - const auto strides = std::make_shared(strides_vec); auto tensor_meta = std::make_shared(shape, strides, data_type, device, 0); // Build TensorBuffer @@ -292,8 +287,10 @@ class LocalTensorSharedNumpyDataFunctor { // Init blob JUST(tensor_impl->InitEagerBlobObject(NewLocalDepObject(), /*pin_memory=*/false)); - const auto& stream = GetDefaultStreamByDevice(device); - JUST(tensor_impl->eager_blob_object())->set_last_used_stream(stream); + const auto& stream = JUST(GetDefaultStreamByDevice(device)); + const auto& eager_blob_object = JUST(tensor_impl->eager_blob_object()); + JUST(eager_blob_object->init_producer_stream(stream)); + eager_blob_object->set_last_used_stream(stream); std::shared_ptr out(new MirroredTensor(tensor_impl)); return out; } diff --git a/oneflow/api/python/symbol/placement_symbol.cpp b/oneflow/api/python/symbol/placement_symbol.cpp index c5defcf8001..8881002b010 100644 --- a/oneflow/api/python/symbol/placement_symbol.cpp +++ b/oneflow/api/python/symbol/placement_symbol.cpp @@ -17,6 +17,7 @@ limitations under the License. 
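// A worked example of the byte-stride to element-stride conversion performed in
// LocalTensorSharedNumpyDataFunctor above (illustrative sketch, not OneFlow API):
// a contiguous float32 array of shape (2, 3) has NumPy strides (12, 4) in bytes;
// dividing by the 4-byte element size yields the element-count strides (3, 1)
// that OneFlow stores.
#include <array>
#include <cassert>

static void ByteStridesToElementStrides() {
  std::array<long long, 2> numpy_strides = {12, 4};  // bytes, as reported by PyArray_STRIDES
  const long long element_size_in_bytes = 4;         // float32, as reported by PyArray_ITEMSIZE
  for (auto& stride : numpy_strides) {
    assert(stride % element_size_in_bytes == 0);  // otherwise the numpy array must be copied
    stride /= element_size_in_bytes;
  }
  assert(numpy_strides[0] == 3 && numpy_strides[1] == 1);
}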
#include #include +#include "oneflow/core/common/maybe.h" #include "oneflow/extension/python/numpy.h" #include "oneflow/api/python/framework/size.h" #include "oneflow/api/python/of_api_registry.h" @@ -63,6 +64,19 @@ struct PlacementSymbolExportUtil { return parallel_desc; } + static Maybe CreateParallelDesc(const std::string& proto_str) { + ParallelConf parallel_conf; + CHECK_OR_RETURN(TxtString2PbMessage(proto_str, ¶llel_conf)) + << " Get ParallelConf Pb from string failed."; + std::shared_ptr parallel_desc; + JUST(PhysicalRun([¶llel_desc, ¶llel_conf](InstructionsBuilder* builder) -> Maybe { + parallel_desc = JUST(builder->GetParallelDescSymbol(parallel_conf)); + return Maybe::Ok(); + })); + + return parallel_desc; + } + static Maybe> ParseAndFormatRanks(const py::dict& device_ids) { std::vector> machine_device_id_vec; for (const auto& pair : device_ids) { @@ -137,6 +151,10 @@ struct PlacementSymbolExportUtil { return SymbolOf(*JUST(CreateParallelDesc(type, *formated_machine_device_ids, shape))); } + static Maybe> CreateParallelDescSymbol(const std::string& proto_str) { + return SymbolOf(*JUST(CreateParallelDesc(proto_str))); + } + static Maybe> AllDevicePlacement(const std::string& type) { static thread_local HashMap> device_tag2placement; CHECK_NOTNULL((Global::Get())); @@ -213,6 +231,10 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { return PlacementSymbolExportUtil::CreateParallelDescSymbol(type, ranks).GetOrThrow(); }), py::arg("type"), py::arg("ranks")) + .def(py::init([](const std::string& proto_str) { + return PlacementSymbolExportUtil::CreateParallelDescSymbol(proto_str).GetOrThrow(); + }), + py::arg("proto_str")) .def_property_readonly( "device_type", [](Symbol p) { diff --git a/oneflow/api/python/utils/tensor_utils.h b/oneflow/api/python/utils/tensor_utils.h index 4805b3365d1..fb71646ee4e 100644 --- a/oneflow/api/python/utils/tensor_utils.h +++ b/oneflow/api/python/utils/tensor_utils.h @@ -70,8 +70,8 @@ inline static Maybe EagerMirroredTensorToNumpy(PyObject* py_tensor) { const size_t ndim = tensor->ndim(); const auto shape = numpy::OFShapeToNumpyShape(tensor->shape()->dim_vec()); // NumPy strides use bytes. OneFlow strides use element counts. - const auto stride = numpy::OFStrideToNumpyStride(JUST(tensor->stride())->StrideVec(), - tensor->dtype()->data_type()); + const auto stride = + numpy::OFStrideToNumpyStride(*JUST(tensor->stride()), tensor->dtype()->data_type()); T* data_ptr = nullptr; const auto& Callback = [&](uint64_t ofblob_ptr) { diff --git a/oneflow/api/python/vm/id_generator.cpp b/oneflow/api/python/vm/id_generator.cpp deleted file mode 100644 index 03586b603d6..00000000000 --- a/oneflow/api/python/vm/id_generator.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include -#include "oneflow/api/python/of_api_registry.h" -#include "oneflow/core/vm/id_generator.h" - -namespace oneflow { -namespace vm { - -namespace py = pybind11; - -ONEFLOW_API_PYBIND11_MODULE("vm", m) { - py::class_>(m, "IdGenerator"); - py::class_>( - m, "PhysicalIdGenerator") - .def(py::init<>()) - .def("NewSymbolId", &PhysicalIdGenerator::NewSymbolId) - .def("NewObjectId", &PhysicalIdGenerator::NewSymbolId); - - py::class_>( - m, "LogicalIdGenerator") - .def(py::init<>()) - .def("NewSymbolId", &LogicalIdGenerator::NewSymbolId) - .def("NewObjectId", &LogicalIdGenerator::NewObjectId); -} - -} // namespace vm -} // namespace oneflow diff --git a/oneflow/core/autograd/gradient_funcs/logical_slice.cpp b/oneflow/core/autograd/gradient_funcs/logical_slice.cpp deleted file mode 100644 index ccc06f1cc77..00000000000 --- a/oneflow/core/autograd/gradient_funcs/logical_slice.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/op_expr_grad_function.h" -#include "oneflow/core/framework/op_builder.h" -#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" -#include "oneflow/core/framework/op_expr.h" -#include "oneflow/core/functional/functional.h" - -namespace oneflow { -namespace one { - -struct LogicalSliceCaptureState : public AutoGradCaptureState { - Shape like_shape; - std::vector start; - std::vector stop; - std::vector step; - Symbol in_sbp; -}; - -class LogicalSlice : public OpExprGradFunction { - public: - Maybe Init(const OpExpr& op) override { - const auto* fw_op_expr = dynamic_cast(&op); - CHECK_NOTNULL_OR_RETURN(fw_op_expr) << "LogicalSlice op_expr is null"; - base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); - return Maybe::Ok(); - } - - Maybe Capture(LogicalSliceCaptureState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const override { - CHECK_EQ_OR_RETURN(inputs.size(), 1) << "LogicalSlice input size must be 1"; - CHECK_EQ_OR_RETURN(outputs.size(), 1) << "LogicalSlice output size must be 1"; - - ComposedAttrMap composed_attrs(attrs, base_attrs_); - ctx->start = JUST(composed_attrs.GetAttr>("start")); - ctx->stop = JUST(composed_attrs.GetAttr>("stop")); - ctx->step = JUST(composed_attrs.GetAttr>("step")); - ctx->like_shape = *(inputs[0]->shape()); - ctx->in_sbp = JUST(inputs[0]->nd_sbp()); - return Maybe::Ok(); - } - - Maybe Apply(const LogicalSliceCaptureState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const override { - in_grads->resize(1); - std::shared_ptr zeros; - if (out_grads[0]->is_local()) { - zeros = JUST(functional::Constant(ctx->like_shape, 0, out_grads[0]->dtype(), - JUST(out_grads[0]->device()))); - } else { - const auto& parallel_desc = JUST(out_grads[0]->parallel_desc()); - zeros = JUST(functional::ConsistentConstant(ctx->like_shape, 0, out_grads[0]->dtype(), - parallel_desc, *JUST(GetSbpList(ctx->in_sbp)))); - } - (*in_grads)[0] = - 
JUST(functional::LogicalSliceAssign(zeros, out_grads[0], ctx->start, ctx->stop, ctx->step)); - return Maybe::Ok(); - } - - private: - AttrMap base_attrs_; -}; - -struct LogicalSliceAssignCaptureState : public AutoGradCaptureState { - bool requires_grad_ref = false; - bool requires_grad_value = false; - std::vector start; - std::vector stop; - std::vector step; - Shape value_shape; // used to calculate ref gradient - Symbol value_sbp; -}; - -class LogicalSliceAssign : public OpExprGradFunction { - public: - Maybe Init(const OpExpr& op) override { - const auto* fw_op_expr = dynamic_cast(&op); - CHECK_NOTNULL_OR_RETURN(fw_op_expr) << "LogicalSliceAssign op_expr is null"; - - base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); - return Maybe::Ok(); - } - - Maybe Capture(LogicalSliceAssignCaptureState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const override { - CHECK_EQ_OR_RETURN(inputs.size(), 2) << "LogicalSliceAssign input size must be 2"; - CHECK_EQ_OR_RETURN(outputs.size(), 1) << "LogicalSliceAssign output size must be 1"; - ctx->requires_grad_ref = inputs[0]->requires_grad(); - ctx->requires_grad_value = inputs[1]->requires_grad(); - if (!ctx->requires_grad_ref && !ctx->requires_grad_value) { return Maybe::Ok(); } - - ComposedAttrMap composed_attrs(attrs, base_attrs_); - ctx->start = JUST(composed_attrs.GetAttr>("start")); - ctx->stop = JUST(composed_attrs.GetAttr>("stop")); - ctx->step = JUST(composed_attrs.GetAttr>("step")); - - if (ctx->requires_grad_ref) { - ctx->value_shape = *(inputs[1]->shape()); - ctx->value_sbp = JUST(inputs[1]->nd_sbp()); - } - return Maybe::Ok(); - } - - Maybe Apply(const LogicalSliceAssignCaptureState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const override { - in_grads->resize(2); - - if (ctx->requires_grad_ref) { - std::shared_ptr zeros; - if (out_grads[0]->is_local()) { - zeros = JUST(functional::Constant(ctx->value_shape, 0, out_grads[0]->dtype(), - JUST(out_grads[0]->device()))); - } else { - const auto& parallel_desc = JUST(out_grads[0]->parallel_desc()); - zeros = - JUST(functional::ConsistentConstant(ctx->value_shape, 0, out_grads[0]->dtype(), - parallel_desc, *JUST(GetSbpList(ctx->value_sbp)))); - } - (*in_grads)[0] = JUST(functional::LogicalSliceAssign( - JUST(functional::Identity(out_grads[0])), zeros, ctx->start, ctx->stop, ctx->step)); - } - if (ctx->requires_grad_value) { - (*in_grads)[1] = JUST(functional::LogicalSlice(out_grads[0], ctx->start, ctx->stop, ctx->step, - /*enable_view_slice=*/false)); - } - return Maybe::Ok(); - } - - private: - AttrMap base_attrs_; -}; - -REGISTER_OP_EXPR_GRAD_FUNCTION("logical_slice_assign", LogicalSliceAssign); -REGISTER_OP_EXPR_GRAD_FUNCTION("logical_slice", LogicalSlice); - -} // namespace one -} // namespace oneflow diff --git a/oneflow/core/autograd/gradient_funcs/nll.cpp b/oneflow/core/autograd/gradient_funcs/nll.cpp index 20e1a67653c..430009b9dd2 100644 --- a/oneflow/core/autograd/gradient_funcs/nll.cpp +++ b/oneflow/core/autograd/gradient_funcs/nll.cpp @@ -15,68 +15,84 @@ limitations under the License. 
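// Rough sketch of the gradient the rewritten NLL backward below computes
// (assumptions: reduction "none", contiguous data; NaiveNLLGrad is a hypothetical
// stand-in for functional::NLLGrad, not OneFlow code):
// d_input[n][c] = -weight[target[n]] * out_grad[n] when c == target[n] and
// target[n] != ignore_index, and 0 everywhere else.
#include <cstdint>
#include <vector>

static std::vector<float> NaiveNLLGrad(const std::vector<float>& out_grad,
                                       const std::vector<int64_t>& target,
                                       const std::vector<float>& weight, int64_t num_classes,
                                       int64_t ignore_index) {
  std::vector<float> d_input(out_grad.size() * num_classes, 0.f);
  for (size_t n = 0; n < out_grad.size(); ++n) {
    const int64_t t = target[n];
    if (t == ignore_index) { continue; }
    const float w = weight.empty() ? 1.f : weight[t];
    d_input[n * num_classes + t] = -w * out_grad[n];
  }
  return d_input;
}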
*/ #include "oneflow/core/framework/op_expr_grad_function.h" #include "oneflow/core/functional/functional.h" +#include "oneflow/core/common/container_util.h" namespace oneflow { + namespace one { -struct NllCaptureState : public AutoGradCaptureState { + +struct NLLCaptureState : public AutoGradCaptureState { bool requires_grad = false; int64_t ignore_index = -100; }; -class Nll : public OpExprGradFunction { +class NLLGradFunction : public OpExprGradFunction { public: Maybe Init(const OpExpr& op) override; - Maybe Capture(NllCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs, + Maybe Capture(NLLCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs, const AttrMap& attrs) const override; - Maybe Apply(const NllCaptureState* ctx, const TensorTuple& out_grads, + Maybe Apply(const NLLCaptureState* ctx, const TensorTuple& out_grads, TensorTuple* in_grads) const override; private: AttrMap base_attrs_; }; -Maybe Nll::Init(const OpExpr& op) { + +Maybe NLLGradFunction::Init(const OpExpr& op) { const auto* fw_op_expr = dynamic_cast(&op); CHECK_NOTNULL_OR_RETURN(fw_op_expr); base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); return Maybe::Ok(); } -Maybe Nll::Capture(NllCaptureState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const { - ctx->requires_grad = inputs.at(0)->requires_grad(); + +Maybe NLLGradFunction::Capture(NLLCaptureState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const { + auto input = JUST(VectorAt(inputs, 0)); + ctx->requires_grad = input->requires_grad(); if (!ctx->requires_grad) { return Maybe::Ok(); } ComposedAttrMap composed_attrs(attrs, base_attrs_); ctx->ignore_index = JUST(composed_attrs.GetAttr("ignore_index")); - ctx->SaveTensorForBackward(inputs.at(0)); // input - ctx->SaveTensorForBackward(inputs.at(1)); // target - ctx->SaveTensorForBackward(outputs.at(1)); // total_weight + ctx->SaveTensorForBackward(input); // input + ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 1))); // target if (inputs.size() == 3) { - ctx->SaveTensorForBackward(inputs.at(2)); // weight + ctx->SaveTensorForBackward(inputs[2]); // weight } return Maybe::Ok(); } -Maybe Nll::Apply(const NllCaptureState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const { + +Maybe NLLGradFunction::Apply(const NLLCaptureState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const { if (!ctx->requires_grad) { return Maybe::Ok(); } - CHECK_EQ_OR_RETURN(out_grads.size(), 2); - const auto& dy = out_grads.at(0); - const auto& input = ctx->SavedTensors().at(0); - const auto& target = ctx->SavedTensors().at(1); - const auto& total_weight = ctx->SavedTensors().at(2); + CHECK_EQ_OR_RETURN(out_grads.size(), 2) + << Error::RuntimeError() << "The number of out_grads is expected to be 2, got " + << out_grads.size(); + CHECK_GE_OR_RETURN(ctx->SavedTensors().size(), 2) + << Error::RuntimeError() + << "The number of saved tensors is expected to be greater than or equal to 2, got " + << ctx->SavedTensors().size(); + const auto& out_grad = out_grads[0]; + const auto& input = ctx->SavedTensors()[0]; + const auto& target = ctx->SavedTensors()[1]; - in_grads->resize(ctx->SavedTensors().size() - 1); + in_grads->resize(ctx->SavedTensors().size()); - if (ctx->SavedTensors().size() == 4) { - const auto& weight = ctx->SavedTensors().at(3); - in_grads->at(0) = - JUST(functional::NllLossGrad(dy, input, target, weight, total_weight, ctx->ignore_index)); + if (ctx->SavedTensors().size() == 
2) { + JUST(VectorAt(*in_grads, 0)) = + JUST(functional::NLLGrad(out_grad, input, target, NullOpt, ctx->ignore_index)); } else { - in_grads->at(0) = - JUST(functional::NllLossGrad(dy, input, target, NullOpt, total_weight, ctx->ignore_index)); + // has weight + auto weight = JUST(VectorAt(ctx->SavedTensors(), 2)); + JUST(VectorAt(*in_grads, 0)) = + JUST(functional::NLLGrad(out_grad, input, target, weight, ctx->ignore_index)); } + return Maybe::Ok(); } -REGISTER_OP_EXPR_GRAD_FUNCTION("nll", Nll); + +REGISTER_OP_EXPR_GRAD_FUNCTION("nll", NLLGradFunction); + } // namespace one + } // namespace oneflow diff --git a/oneflow/core/autograd/gradient_funcs/slice.cpp b/oneflow/core/autograd/gradient_funcs/slice.cpp index ef16ac23394..cfa5d6472c8 100644 --- a/oneflow/core/autograd/gradient_funcs/slice.cpp +++ b/oneflow/core/autograd/gradient_funcs/slice.cpp @@ -23,7 +23,6 @@ namespace oneflow { namespace one { struct SliceCaptureState : public AutoGradCaptureState { - bool requires_grad; Shape like_shape; std::vector start; std::vector stop; @@ -34,31 +33,29 @@ class Slice : public OpExprGradFunction { public: Maybe Init(const OpExpr& op) override { const auto* fw_op_expr = dynamic_cast(&op); - CHECK_NOTNULL_OR_RETURN(fw_op_expr); + CHECK_NOTNULL_OR_RETURN(fw_op_expr) << "Slice op_expr is null"; base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); return Maybe::Ok(); } Maybe Capture(SliceCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs, const AttrMap& attrs) const override { - CHECK_EQ_OR_RETURN(inputs.size(), 1); - CHECK_EQ_OR_RETURN(outputs.size(), 1); - ctx->requires_grad = inputs.at(0)->requires_grad(); - if (!ctx->requires_grad) { return Maybe::Ok(); } + CHECK_EQ_OR_RETURN(inputs.size(), 1) << "Slice input size must be 1"; + CHECK_EQ_OR_RETURN(outputs.size(), 1) << "Slice output size must be 1"; ComposedAttrMap composed_attrs(attrs, base_attrs_); ctx->start = JUST(composed_attrs.GetAttr>("start")); ctx->stop = JUST(composed_attrs.GetAttr>("stop")); ctx->step = JUST(composed_attrs.GetAttr>("step")); - ctx->like_shape = *(inputs.at(0)->shape()); + ctx->like_shape = *(inputs[0]->shape()); return Maybe::Ok(); } Maybe Apply(const SliceCaptureState* ctx, const TensorTuple& out_grads, TensorTuple* in_grads) const override { in_grads->resize(1); - in_grads->at(0) = JUST( - functional::SliceGrad(out_grads.at(0), ctx->like_shape, ctx->start, ctx->stop, ctx->step)); + (*in_grads)[0] = JUST( + functional::SliceGrad(out_grads[0], ctx->like_shape, ctx->start, ctx->stop, ctx->step)); return Maybe::Ok(); } @@ -67,18 +64,20 @@ class Slice : public OpExprGradFunction { }; struct SliceUpdateCaptureState : public AutoGradCaptureState { - bool requires_grad_x; - bool requires_grad_update; + bool requires_grad_ref = false; + bool requires_grad_value = false; std::vector start; std::vector stop; std::vector step; + Shape value_shape; // used to calculate ref gradient + Symbol value_sbp; }; class SliceUpdate : public OpExprGradFunction { public: Maybe Init(const OpExpr& op) override { const auto* fw_op_expr = dynamic_cast(&op); - CHECK_NOTNULL_OR_RETURN(fw_op_expr); + CHECK_NOTNULL_OR_RETURN(fw_op_expr) << "SliceUpdate op_expr is null"; base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); return Maybe::Ok(); @@ -86,18 +85,21 @@ class SliceUpdate : public OpExprGradFunction { Maybe Capture(SliceUpdateCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs, const AttrMap& attrs) const override { - CHECK_EQ_OR_RETURN(inputs.size(), 2); - 
CHECK_EQ_OR_RETURN(outputs.size(), 1); - ctx->requires_grad_x = inputs.at(0)->requires_grad(); - ctx->requires_grad_update = inputs.at(1)->requires_grad(); - if (!ctx->requires_grad_x && !ctx->requires_grad_update) { return Maybe::Ok(); } + CHECK_EQ_OR_RETURN(inputs.size(), 2) << "SliceUpdate input size must be 2"; + CHECK_EQ_OR_RETURN(outputs.size(), 1) << "SliceUpdate output size must be 1"; + ctx->requires_grad_ref = inputs[0]->requires_grad(); + ctx->requires_grad_value = inputs[1]->requires_grad(); + if (!ctx->requires_grad_ref && !ctx->requires_grad_value) { return Maybe::Ok(); } ComposedAttrMap composed_attrs(attrs, base_attrs_); ctx->start = JUST(composed_attrs.GetAttr>("start")); ctx->stop = JUST(composed_attrs.GetAttr>("stop")); ctx->step = JUST(composed_attrs.GetAttr>("step")); - if (ctx->requires_grad_x) { ctx->SaveTensorForBackward(inputs.at(1)); } + if (ctx->requires_grad_ref) { + ctx->value_shape = *(inputs[1]->shape()); + if (inputs[1]->is_consistent()) { ctx->value_sbp = JUST(inputs[1]->nd_sbp()); } + } return Maybe::Ok(); } @@ -105,13 +107,21 @@ class SliceUpdate : public OpExprGradFunction { TensorTuple* in_grads) const override { in_grads->resize(2); - if (ctx->requires_grad_x) { - const auto& update = ctx->SavedTensors().at(0); - const auto& temp = JUST(functional::ZerosLike(update)); - (*in_grads)[0] = JUST(functional::SliceUpdate(out_grads[0], temp, ctx->start, ctx->stop, + if (ctx->requires_grad_ref) { + std::shared_ptr zeros; + if (out_grads[0]->is_local()) { + zeros = JUST(functional::Constant(ctx->value_shape, 0, out_grads[0]->dtype(), + JUST(out_grads[0]->device()))); + } else { + const auto& parallel_desc = JUST(out_grads[0]->parallel_desc()); + zeros = + JUST(functional::ConsistentConstant(ctx->value_shape, 0, out_grads[0]->dtype(), + parallel_desc, *JUST(GetSbpList(ctx->value_sbp)))); + } + (*in_grads)[0] = JUST(functional::SliceUpdate(out_grads[0], zeros, ctx->start, ctx->stop, ctx->step, /*inplace=*/false)); } - if (ctx->requires_grad_update) { + if (ctx->requires_grad_value) { (*in_grads)[1] = JUST(functional::Slice(out_grads[0], ctx->start, ctx->stop, ctx->step, /*enable_view_slice=*/false)); } @@ -122,8 +132,8 @@ class SliceUpdate : public OpExprGradFunction { AttrMap base_attrs_; }; -REGISTER_OP_EXPR_GRAD_FUNCTION("slice", Slice); REGISTER_OP_EXPR_GRAD_FUNCTION("slice_update", SliceUpdate); +REGISTER_OP_EXPR_GRAD_FUNCTION("slice", Slice); } // namespace one } // namespace oneflow diff --git a/oneflow/core/boxing/nd_sbp_dim_reduce_boxing.cpp b/oneflow/core/boxing/nd_sbp_dim_reduce_boxing.cpp index 6eea24e8f7c..0f38d912267 100644 --- a/oneflow/core/boxing/nd_sbp_dim_reduce_boxing.cpp +++ b/oneflow/core/boxing/nd_sbp_dim_reduce_boxing.cpp @@ -18,9 +18,9 @@ limitations under the License. #include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/functional/functional.h" -#include "oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/operator/operator.h" +#include "oneflow/core/framework/sbp_infer_util.h" namespace oneflow { diff --git a/oneflow/core/boxing/slice_boxing_util.h b/oneflow/core/boxing/slice_boxing_util.h index 83fe2f619b9..d59cd6f6317 100644 --- a/oneflow/core/boxing/slice_boxing_util.h +++ b/oneflow/core/boxing/slice_boxing_util.h @@ -18,6 +18,7 @@ limitations under the License. 
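// A 1-D sketch of the slice_update backward implemented above (illustrative only,
// plain arrays instead of tensors): the gradient w.r.t. the updated tensor ("ref")
// is out_grad with the written region zeroed out, and the gradient w.r.t. the
// written value is the sliced region of out_grad.
#include <cstdint>
#include <vector>

static void SliceUpdateBackward1D(const std::vector<float>& out_grad, int64_t start,
                                  int64_t stop, int64_t step, std::vector<float>* ref_grad,
                                  std::vector<float>* value_grad) {
  *ref_grad = out_grad;
  value_grad->clear();
  for (int64_t i = start; i < stop; i += step) {
    value_grad->push_back(out_grad[i]);  // d(value) = Slice(out_grad)
    (*ref_grad)[i] = 0.f;                // d(ref)   = SliceUpdate(out_grad, zeros)
  }
}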
#include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/placed_nd_sbp.h" +#include "oneflow/core/job/parallel_desc.h" namespace oneflow { diff --git a/oneflow/core/boxing/symmetric_b_to_s_boxing.cpp b/oneflow/core/boxing/symmetric_b_to_s_boxing.cpp index ad1b9141e8c..ac477b4b5ab 100644 --- a/oneflow/core/boxing/symmetric_b_to_s_boxing.cpp +++ b/oneflow/core/boxing/symmetric_b_to_s_boxing.cpp @@ -88,8 +88,8 @@ Maybe SymmetricB2S(const std::shared_ptr& tensor, Symb start.emplace_back(range.begin()); stop.emplace_back(range.end()); } - local_tensor = - JUST(one::functional::Slice(local_tensor, start, stop, step, /*enable_view_slice=*/false)); + local_tensor = JUST(one::functional::Slice(local_tensor, start, stop, step, + /*enable_view_slice=*/false)); } return JUST(one::functional::LocalToConsistent(local_tensor, out->placement(), diff --git a/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp b/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp index 8b96fec0b94..70c2e456c47 100644 --- a/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp +++ b/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp @@ -146,7 +146,7 @@ IBVerbsCommNet::IBVerbsCommNet() : CommNetIf(), poll_exit_flag_(ATOMIC_FLAG_INIT int64_t this_machine_id = GlobalProcessCtx::Rank(); qp_vec_.assign(Global::Get()->process_ranks().size(), nullptr); for (int64_t peer_id : peer_machine_id()) { - IBVerbsQP* cur_qp = new IBVerbsQP(context_, pd_, port, cq_, cq_); + IBVerbsQP* cur_qp = new IBVerbsQP(context_, pd_, port_attr, port, cq_, cq_); qp_vec_.at(peer_id) = cur_qp; IBVerbsConnectionInfo conn_info; conn_info.set_lid(port_attr.lid); diff --git a/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp b/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp index 1b4871e842e..bf96876cabc 100644 --- a/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp +++ b/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp @@ -32,8 +32,8 @@ constexpr uint64_t kDefaultMemBlockSize = 8388608; // 8M } // namespace -IBVerbsQP::IBVerbsQP(ibv_context* ctx, ibv_pd* pd, uint8_t port_num, ibv_cq* send_cq, - ibv_cq* recv_cq) { +IBVerbsQP::IBVerbsQP(ibv_context* ctx, ibv_pd* pd, const struct ibv_port_attr& port_attr, + uint8_t port_num, ibv_cq* send_cq, ibv_cq* recv_cq) { // ctx_, pd_ ctx_ = ctx; pd_ = pd; @@ -67,6 +67,7 @@ IBVerbsQP::IBVerbsQP(ibv_context* ctx, ibv_pd* pd, uint8_t port_num, ibv_cq* sen max_outstanding_send_wr_ = queue_depth; read_block_size_ = ParseIntegerFromEnv("ONEFLOW_COMM_NET_IB_MEM_BLOCK_SIZE", kDefaultMemBlockSize); + mtu_ = static_cast(port_attr.active_mtu); } IBVerbsQP::~IBVerbsQP() { @@ -114,7 +115,7 @@ void IBVerbsQP::Connect(const IBVerbsConnectionInfo& peer_info) { qp_attr.ah_attr.dlid = peer_info.lid(); } qp_attr.ah_attr.port_num = peer_info.port_num(); - qp_attr.path_mtu = static_cast(peer_info.mtu()); + qp_attr.path_mtu = static_cast(std::min(peer_info.mtu(), mtu_)); qp_attr.dest_qp_num = peer_info.qp_num(); qp_attr.rq_psn = 0; qp_attr.max_dest_rd_atomic = 1; diff --git a/oneflow/core/comm_network/ibverbs/ibverbs_qp.h b/oneflow/core/comm_network/ibverbs/ibverbs_qp.h index 198813350a7..ab505a36702 100644 --- a/oneflow/core/comm_network/ibverbs/ibverbs_qp.h +++ b/oneflow/core/comm_network/ibverbs/ibverbs_qp.h @@ -54,7 +54,8 @@ class IBVerbsQP final { public: OF_DISALLOW_COPY_AND_MOVE(IBVerbsQP); IBVerbsQP() = delete; - IBVerbsQP(ibv_context*, ibv_pd*, uint8_t port_num, ibv_cq* send_cq, ibv_cq* recv_cq); + IBVerbsQP(ibv_context*, ibv_pd*, const struct ibv_port_attr&, uint8_t port_num, ibv_cq* 
send_cq, + ibv_cq* recv_cq); ~IBVerbsQP(); uint32_t qp_num() const { return qp_->qp_num; } @@ -90,6 +91,7 @@ class IBVerbsQP final { uint32_t max_outstanding_send_wr_; std::queue> pending_send_wr_queue_; size_t read_block_size_; + int32_t mtu_; }; } // namespace oneflow diff --git a/oneflow/core/vm/stream_runtime_desc.cpp b/oneflow/core/common/array_ref.h similarity index 70% rename from oneflow/core/vm/stream_runtime_desc.cpp rename to oneflow/core/common/array_ref.h index 68d2eff4a81..1b88c7437b3 100644 --- a/oneflow/core/vm/stream_runtime_desc.cpp +++ b/oneflow/core/common/array_ref.h @@ -13,16 +13,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/vm/stream_runtime_desc.h" +#ifndef ONEFLOW_CORE_COMMON_ARRAY_REF_H_ +#define ONEFLOW_CORE_COMMON_ARRAY_REF_H_ + +#include "llvm/ADT/ArrayRef.h" namespace oneflow { -namespace vm { -void StreamRtDesc::__Init__(StreamDesc* stream_desc) { - const StreamType* stream_type = &stream_desc->stream_type(); - reset_stream_desc(stream_desc); - set_stream_type(stream_type); -} +template +using ArrayRef = llvm::ArrayRef; + +template +using MutableArrayRef = llvm::MutableArrayRef; -} // namespace vm } // namespace oneflow + +#endif diff --git a/oneflow/core/common/device_type.proto b/oneflow/core/common/device_type.proto index bc083768124..2b94416c8cb 100644 --- a/oneflow/core/common/device_type.proto +++ b/oneflow/core/common/device_type.proto @@ -5,5 +5,5 @@ enum DeviceType { kInvalidDevice = 0; kCPU = 1; kCUDA = 2; - kMockDevice = 3; + kMockDevice = 3; // pseudo device for test. } diff --git a/oneflow/core/common/shape.cpp b/oneflow/core/common/shape.cpp index 5d9b5a96a35..94631c6d5e4 100644 --- a/oneflow/core/common/shape.cpp +++ b/oneflow/core/common/shape.cpp @@ -19,7 +19,89 @@ limitations under the License. 
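// The ArrayRef alias introduced above wraps llvm::ArrayRef, a non-owning
// (pointer, length) view over contiguous storage. That is what lets the reworked
// ShapeView below stay two words in size and be passed by value. Minimal sketch
// (assumes the LLVM headers required by oneflow/core/common/array_ref.h):
#include <cstdint>
#include <vector>
#include "llvm/ADT/ArrayRef.h"

static int64_t ProductOf(llvm::ArrayRef<int64_t> dims) {
  int64_t product = 1;
  for (int64_t d : dims) { product *= d; }  // reads through the view, no copy of the vector
  return product;
}
// Usage: std::vector<int64_t> dims{2, 3, 4}; ProductOf(dims) == 24.
// The view is only valid while `dims` is alive.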
namespace oneflow { -Shape CreateReducedShape(const ShapeView& shape, const AxisVector& axis_vec) { +template +int64_t ConstShapeMixIn::elem_cnt() const { + return std::accumulate(tp()->begin(), tp()->end(), int64_t(1), std::multiplies<>()); +} + +template +int64_t ConstShapeMixIn::At(int64_t index) const { + CHECK_GE(index, 0); + CHECK_LT(index, tp()->NumAxes()) << " Shape: " << tp()->DebugStr() << " visit index: " << index + << " > num_axes: " << tp()->NumAxes(); + return (*tp())[index]; +} + +template +int64_t ConstShapeMixIn::Count(int64_t begin_axis, int64_t end_axis) const { + CHECK(0 <= begin_axis && begin_axis <= end_axis && end_axis <= tp()->NumAxes()) + << begin_axis << " " << end_axis; + int64_t cnt = 1; + for (int64_t i = begin_axis; i < end_axis; ++i) { cnt *= At(i); } + return cnt; +} +template +int64_t ConstShapeMixIn::Count(int64_t begin_axis) const { + return Count(begin_axis, tp()->NumAxes()); +} + +template +bool ConstShapeMixIn::Containing(ShapeView small_shape) const { + if (tp()->NumAxes() < small_shape.NumAxes()) { return false; } + FOR_RANGE(int, i, 0, small_shape.NumAxes()) { + if (tp()->At(i) != small_shape.At(i)) { return false; } + } + return true; +} + +template +bool ConstShapeMixIn::MatchBeforeLastDim(ShapeView next_shape) const { + if (tp()->NumAxes() != next_shape.NumAxes()) { return false; } + for (int64_t i = 0; i < tp()->NumAxes() - 1; ++i) { + if (next_shape.At(i) != tp()->At(i)) { return false; } + } + return true; +} + +template +std::string ConstShapeMixIn::ToString() const { + std::stringstream ss; + int32_t idx = 0; + ss << "("; + for (int64_t dim : *tp()) { + ss << dim; + if (++idx != tp()->size() || tp()->size() == 1) { ss << ","; } + } + ss << ")"; + return ss.str(); +} + +template +std::string ConstShapeMixIn::DebugStr() const { + return ToString(); +} + +template +void ConstShapeMixIn::ToProto(ShapeProto* ret) const { + *(ret->mutable_dim()) = PbRf(tp()->begin(), tp()->end()); +} + +template +bool ConstShapeMixIn::operator==(const T& rhs) const { + if (this->NumAxes() != rhs.NumAxes()) { return false; } + FOR_RANGE(int, i, 0, this->NumAxes()) { + if (this->At(i) != rhs.At(i)) { return false; } + } + return true; +} + +template struct ConstShapeMixIn; +template struct MutShapeMixIn; +template struct ConstShapeMixIn; +template struct ConstShapeMixIn; +template struct MutShapeMixIn; + +Shape CreateReducedShape(ShapeView shape, const AxisVector& axis_vec) { // For 0-dim Tensor if (axis_vec.empty()) { return Shape({}); } DimVector dim_vec; @@ -28,7 +110,7 @@ Shape CreateReducedShape(const ShapeView& shape, const AxisVector& axis_vec) { return Shape(std::move(dim_vec)); } -Shape CreateLeftExtendedShape(const ShapeView& shape, int ndims_left_extend_to) { +Shape CreateLeftExtendedShape(ShapeView shape, int ndims_left_extend_to) { CHECK_GE(ndims_left_extend_to, shape.NumAxes()); DimVector dim_vec(ndims_left_extend_to); const size_t left_ones_num = ndims_left_extend_to - shape.NumAxes(); @@ -38,16 +120,17 @@ Shape CreateLeftExtendedShape(const ShapeView& shape, int ndims_left_extend_to) return Shape(std::move(dim_vec)); } -Shape ZeroDimCompatiableShape(const Shape& shape) { - if (shape.NumAxes() == 0 && shape.elem_cnt() == 1) { - DimVector dim_vec; - dim_vec.emplace_back(1); - return Shape(dim_vec); - } +Shape ExpandDimIf0D(const Shape& shape) { + if (shape.NumAxes() == 0) { return {1}; } return shape; } -Shape CreateReducedShapeOrOnesShape(const ShapeView& shape, const AxisVector& axis_vec) { +Shape ExpandDimIf0D(ShapeView shape) { + if 
(shape.NumAxes() == 0) { return {1}; } + return Shape(shape); +} + +Shape CreateReducedShapeOrOnesShape(ShapeView shape, const AxisVector& axis_vec) { if (axis_vec.empty()) { return Shape::Ones(shape.NumAxes()); } return CreateReducedShape(shape, axis_vec); } @@ -63,14 +146,16 @@ Shape::Shape(const DimVector& dim_vec) : DimVector(dim_vec), is_initialized_(tru Shape::Shape(DimVector&& dim_vec) : DimVector(std::move(dim_vec)), is_initialized_(true) {} Shape::Shape(const ShapeProto& shape_proto) : DimVector(shape_proto.dim().begin(), shape_proto.dim().end()), is_initialized_(true) {} +Shape::Shape(ShapeView shape_view) + : DimVector(shape_view.begin(), shape_view.end()), is_initialized_(true) {} -Shape& Shape::CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view) { +Shape& Shape::CheckNumAxesIdenticalAndAssign(ShapeView shape_view) { CHECK_EQ(NumAxes(), shape_view.NumAxes()); std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data()); return *this; } -Shape& Shape::LeftOnesExtendedAssign(const ShapeView& shape_view) { +Shape& Shape::LeftOnesExtendedAssign(ShapeView shape_view) { CHECK_GE(NumAxes(), shape_view.NumAxes()); size_t left_ones_size = NumAxes() - shape_view.NumAxes(); FOR_RANGE(int, i, 0, left_ones_size) { (*this)[i] = 1LL; } @@ -78,48 +163,6 @@ Shape& Shape::LeftOnesExtendedAssign(const ShapeView& shape_view) { return *this; } -std::string Shape::ToString() const { - std::stringstream ss; - int32_t idx = 0; - ss << "("; - for (int64_t dim : *this) { - ss << dim; - if (++idx != size() || size() == 1) { ss << ","; } - } - ss << ")"; - return ss.str(); -} - -std::string Shape::DebugStr() const { return ToString(); } - -void Shape::ToProto(ShapeProto* ret) const { - *(ret->mutable_dim()) = PbRf(begin(), end()); -} - -int64_t Shape::At(int64_t index) const { - CHECK_GE(index, 0); - CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index - << " > num_axes: " << this->NumAxes(); - return (*this)[index]; -} - -void Shape::Set(int64_t index, int64_t val) { - CHECK_GE(index, 0); - CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index - << " > num_axes: " << this->NumAxes(); - (*this)[index] = val; -} - -int64_t Shape::Count(int64_t begin_axis, int64_t end_axis) const { - CHECK(0 <= begin_axis && begin_axis <= end_axis && end_axis <= NumAxes()) - << begin_axis << " " << end_axis; - int64_t cnt = 1; - for (int64_t i = begin_axis; i < end_axis; ++i) { cnt *= At(i); } - return cnt; -} - -int64_t Shape::Count(int64_t begin_axis) const { return Count(begin_axis, NumAxes()); } - std::ostream& operator<<(std::ostream& out, const Shape& shape) { out << shape.DebugStr(); return out; @@ -153,36 +196,20 @@ Shape Shape::Ones(const int64_t num_axes) { return Shape(dim_vec); } -AxisVector Shape::Axes4BroadcastTo(const Shape& broadcast_shape) const { +AxisVector Shape::Axes4BroadcastTo(ShapeView broadcast_shape) const { AxisVector broadcast_axis_vec; CHECK_EQ(broadcast_shape.NumAxes(), NumAxes()); for (int64_t i = 0; i < NumAxes(); i++) { - if (this->dim_vec().at(i) != broadcast_shape.dim_vec().at(i) && this->dim_vec().at(i) == 1) { + if (this->dim_vec().at(i) != broadcast_shape[i] && this->dim_vec().at(i) == 1) { broadcast_axis_vec.emplace_back(i); } else { - CHECK_EQ(this->dim_vec().at(i), broadcast_shape.dim_vec().at(i)); + CHECK_EQ(this->dim_vec().at(i), broadcast_shape[i]); } } CHECK(!broadcast_axis_vec.empty()); return broadcast_axis_vec; } -bool Shape::Containing(const Shape& small_shape) const { - if 
(this->NumAxes() < small_shape.NumAxes()) { return false; } - FOR_RANGE(int, i, 0, small_shape.NumAxes()) { - if (this->At(i) != small_shape.At(i)) { return false; } - } - return true; -} - -bool Shape::MatchBeforeLastDim(const Shape& next_shape) const { - if (this->NumAxes() != next_shape.NumAxes()) { return false; } - for (int64_t i = 0; i < this->NumAxes() - 1; ++i) { - if (next_shape.At(i) != this->At(i)) { return false; } - } - return true; -} - Maybe Shape::Slice(int64_t start_dim, int64_t end_dim) const { CHECK_OR_RETURN(start_dim >= 0 && end_dim >= start_dim); int64_t ndims = this->NumAxes(); diff --git a/oneflow/core/common/shape.h b/oneflow/core/common/shape.h index 7a94ad85a6d..6805dc21caf 100644 --- a/oneflow/core/common/shape.h +++ b/oneflow/core/common/shape.h @@ -17,7 +17,6 @@ limitations under the License. #define ONEFLOW_CORE_COMMON_SHAPE_H_ #include "oneflow/core/common/shape.pb.h" -#include "oneflow/core/common/shape_view.h" #include "oneflow/core/common/util.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/shape_vec.h" @@ -26,13 +25,82 @@ limitations under the License. namespace oneflow { class ShapeView; +class MutShapeView; class ShapeProto; namespace cfg { class ShapeProto; } // namespace cfg -class Shape final : public DimVector { +/** + * NOTE: + * + * There are two widely used shape-related classes: Shape and ShapeView. + * The differences are: + * 1. Shape owns the data, and ShapeView does not. + * 2. ShapeView is very lightweight, whose size is only 16 bytes (two int64_t). + * So it should be passed by value. + * + * When adding new functions accepting a shape as a parameter, please follow + * the rules: + * 1. If your function doesn't modify the shape, prefer + * ShapeView. Shape can be implicitly converted to ShapeView so the method + * with ShapeView parameter can accept both Shape and ShapeView actually. + * 2. If your function modify the shape but doesn't affect + * its rank, prefer MutShapeView. The reason is the same with rule 1. + * 3. Use Shape otherwise. + * + * When adding new member methods of Shape or ShapeView, please follow + * the rules: + * 1. If the method is shared between Shape and ShapeView (like `NumAxes()`) + * please add it to ConstShapeMixIn. + * 2. If the method is shared between Shape and MutShapeView (like `Set()`) + * please add it to MutShapeMixIn. + * 3. Otherwise, add it to a concrete class (Shape, ShapeView or MutShapeView). 
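 *
 * Illustrative example for rule 1 (SumDims is a hypothetical helper, not part of
 * this header): a read-only function takes ShapeView by value and accepts a Shape
 * through the implicit conversion.
 *
 *   int64_t SumDims(ShapeView shape) {
 *     int64_t sum = 0;
 *     for (int64_t i = 0; i < shape.NumAxes(); ++i) { sum += shape.At(i); }
 *     return sum;
 *   }
 *   // Shape shape({2, 3, 4});
 *   // SumDims(shape);             // Shape -> ShapeView, the dims are not copied
 *   // SumDims(ShapeView(shape));  // equivalent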
+ * + */ +template +struct ConstShapeMixIn { + using DimType = int64_t; + + int64_t NumAxes() const { return tp()->size(); } + int64_t elem_cnt() const; + int64_t At(int64_t index) const; + int64_t Count(int64_t begin_axis, int64_t end_axis) const; + int64_t Count(int64_t begin_axis) const; + bool Containing(ShapeView small_shape) const; + bool MatchBeforeLastDim(ShapeView next_shape) const; + std::string ToString() const; + + std::string DebugStr() const; + + void ToProto(ShapeProto* ret) const; + + template + void SerializeWithTextFormat(StreamT& out_stream) const { + for (int64_t dim : *this) { out_stream << std::to_string(dim) << ' '; } + } + + bool operator==(const T& rhs) const; + + protected: + // tp means "this pointer" + T* tp() { return static_cast(this); } + const T* tp() const { return static_cast(this); } +}; + +template +struct MutShapeMixIn : public ConstShapeMixIn { + void Set(int64_t index, int64_t val) { + CHECK_GE(index, 0); + CHECK_LT(index, this->tp()->NumAxes()) + << " Shape: " << this->tp()->DebugStr() << " visit index: " << index + << " > num_axes: " << this->tp()->NumAxes(); + (*this->tp())[index] = val; + } +}; + +class Shape final : public DimVector, public MutShapeMixIn { public: // OF_DISALLOW_COPY_AND_MOVE(Shape); using DimVector::DimVector; @@ -43,6 +111,7 @@ class Shape final : public DimVector { // explicit constructor from ShapeView explicit Shape(ShapeView shape_view); ~Shape() = default; + using DimVector::operator==; #define OVERRIDE_ADD_DATA_FUNC(func) \ template \ @@ -60,47 +129,24 @@ class Shape final : public DimVector { #undef OVERRIDE_ADD_DATA_FUNC - Shape& CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view); - Shape& LeftOnesExtendedAssign(const ShapeView& shape_view); - - std::string DebugStr() const; - std::string ToString() const; - - void ToProto(ShapeProto*) const; - - template - void SerializeWithTextFormat(StreamT& out_stream) const; + Shape& CheckNumAxesIdenticalAndAssign(ShapeView shape_view); + Shape& LeftOnesExtendedAssign(ShapeView shape_view); // Getters and Setters bool is_initialized() const { return is_initialized_; } const DimVector& dim_vec() const { return *this; } DimVector& dim_vec() { return *this; } - int64_t elem_cnt() const { - return std::accumulate(begin(), end(), int64_t(1), std::multiplies<>()); - } - int64_t At(int64_t index) const; - void Set(int64_t index, int64_t val); int64_t NumAxes() const { CHECK(is_initialized()); - return size(); + return ConstShapeMixIn::NumAxes(); } - int64_t Count(int64_t begin_axis, int64_t end_axis) const; - int64_t Count(int64_t begin_axis) const; - AxisVector ShiftNegativeAxisVec(const AxisVector& axis_vec) const; Shape RemoveOnes(const AxisVector& axis_vec) const; static Shape Ones(const int64_t num_axes); - AxisVector Axes4BroadcastTo(const Shape& broadcast_dim_vec) const; - - bool Containing(const Shape& small_shape) const; - bool MatchBeforeLastDim(const Shape& next_shape) const; + AxisVector Axes4BroadcastTo(ShapeView broadcast_dim_vec) const; Maybe Slice(int64_t start_dim, int64_t end_dim) const; - ShapeView ToShapeView() const { return ShapeView(data(), size()); } - - MutShapeView ToMutShapeView() { return MutShapeView(data(), size()); } - private: // Set default value here because some constructors are inherited from DimVector // TODO(daquexian): remove this field and make it initializied by construction @@ -109,14 +155,11 @@ class Shape final : public DimVector { int64_t ShiftNegativeAxis(int64_t axis, const int64_t num_axes); -Shape CreateReducedShape(const 
ShapeView& shape, const AxisVector& axis_vec); -Shape CreateLeftExtendedShape(const ShapeView& shape, int ndims_extend_to); -Shape ZeroDimCompatiableShape(const Shape& shape); -Shape CreateReducedShapeOrOnesShape(const ShapeView& shape, const AxisVector& axis_vec); -template -void Shape::SerializeWithTextFormat(StreamT& out_stream) const { - for (int64_t dim : *this) { out_stream << std::to_string(dim) << ' '; } -} +Shape CreateReducedShape(ShapeView shape, const AxisVector& axis_vec); +Shape CreateLeftExtendedShape(ShapeView shape, int ndims_extend_to); +Shape ExpandDimIf0D(const Shape& shape); +Shape ExpandDimIf0D(ShapeView shape); +Shape CreateReducedShapeOrOnesShape(ShapeView shape, const AxisVector& axis_vec); std::ostream& operator<<(std::ostream& out, const Shape& shape); diff --git a/oneflow/core/common/shape_view.cpp b/oneflow/core/common/shape_view.cpp index 648034665fe..f3aa8735582 100644 --- a/oneflow/core/common/shape_view.cpp +++ b/oneflow/core/common/shape_view.cpp @@ -19,89 +19,25 @@ limitations under the License. namespace oneflow { -ShapeView::ShapeView(const ShapeProto& shape_proto) - : ShapeViewBase(shape_proto.dim().data(), shape_proto.dim_size()) {} -ShapeView::ShapeView(const Shape& shape) - : ShapeViewBase(shape.dim_vec().data(), shape.dim_vec().size()) {} - -template -int64_t ShapeViewBase::At(int64_t index) const { - CHECK_GE(index, 0); - if (!(this->NumAxes() == 0 && this->elem_cnt() == 1)) { - CHECK_LT(index, num_axes_); - } else { - CHECK(index == 0); - } - return ptr_[index]; -} - -template -int64_t ShapeViewBase::Count(int64_t begin_axis) const { - return this->Count(begin_axis, NumAxes()); -} - -template -int64_t ShapeViewBase::Count(int64_t begin_axis, int64_t end_axis) const { - CHECK(0 <= begin_axis && begin_axis <= end_axis && end_axis <= this->NumAxes()) - << begin_axis << " " << end_axis; - int64_t cnt = 1; - for (int64_t i = begin_axis; i < end_axis; ++i) { cnt *= this->At(i); } - return cnt; -} - -template -int64_t ShapeViewBase::elem_cnt() const { - return this->Count(0); -} - -template -std::string ShapeViewBase::ToString() const { - std::stringstream ss; - ss << "("; - FOR_RANGE(int, i, 0, this->NumAxes()) { - int64_t dim = this->At(i); - ss << dim; - if (i != this->NumAxes() - 1 || this->NumAxes() == 1) { ss << ","; } - } - ss << ")"; - return ss.str(); +void ShapeView::ToDimVector(DimVector* dim_vec) const { + dim_vec->resize(this->size()); + dim_vec->assign(this->data(), this->data() + this->size()); } -template -void ShapeViewBase::ToDimVector(DimVector* dim_vec) const { - dim_vec->resize(num_axes_); - dim_vec->assign(ptr_, ptr_ + num_axes_); -} - -template -void ShapeViewBase::ToShape(Shape* shape) const { +void ShapeView::ToShape(Shape* shape) const { DimVector dim_vec; this->ToDimVector(&dim_vec); *shape = Shape(dim_vec); } -template class ShapeViewBase; -template class ShapeViewBase; - -std::ostream& operator<<(std::ostream& out, const ShapeView& shape) { +std::ostream& operator<<(std::ostream& out, ShapeView shape) { out << shape.ToString(); return out; } -void MutShapeView::Set(int64_t axis, int64_t val) { - CHECK_GE(axis, 0); - CHECK_LT(axis, NumAxes()); - dim_ptr()[axis] = val; -} - -void MutShapeView::set_shape(const Shape& shape) { - CHECK_EQ(NumAxes(), shape.NumAxes()); - std::copy(shape.dim_vec().data(), shape.dim_vec().data() + shape.NumAxes(), dim_ptr()); -} - -void MutShapeView::set_shape(const ShapeView& shape) { +void MutShapeView::set_shape(ShapeView shape) { CHECK_EQ(NumAxes(), shape.NumAxes()); - std::copy(shape.ptr(), 
shape.ptr() + shape.NumAxes(), dim_ptr()); + std::copy(shape.ptr(), shape.ptr() + shape.NumAxes(), mut_ptr()); } } // namespace oneflow diff --git a/oneflow/core/common/shape_view.h b/oneflow/core/common/shape_view.h index 3ad94e6a204..b679d35511b 100644 --- a/oneflow/core/common/shape_view.h +++ b/oneflow/core/common/shape_view.h @@ -16,79 +16,47 @@ limitations under the License. #ifndef ONEFLOW_CORE_REGISTER_SHAPE_VIEW_H_ #define ONEFLOW_CORE_REGISTER_SHAPE_VIEW_H_ +#include "oneflow/core/common/array_ref.h" #include "oneflow/core/common/util.h" -#include "oneflow/core/common/shape_vec.h" +#include "oneflow/core/common/shape.h" namespace oneflow { class ShapeProto; class Shape; -template -class ShapeViewBase { +class ShapeView : public ArrayRef, public ConstShapeMixIn { public: - using DimType = DimT; - ShapeViewBase(DimType* ptr, int64_t num_axes) : ptr_(ptr), num_axes_(num_axes) {} - ShapeViewBase(const ShapeViewBase& rhs) = default; - ~ShapeViewBase() = default; - - int64_t NumAxes() const { return num_axes_; } - int64_t At(int64_t index) const; - int64_t Count(int64_t begin_axis) const; - int64_t Count(int64_t begin_axis, int64_t end_axis) const; - int64_t elem_cnt() const; - const DimType* ptr() const { return ptr_; } - - bool operator==(const ShapeViewBase& rhs) const; - std::string ToString() const; - void ToDimVector(DimVector* dim_vec) const; - void ToShape(Shape* shape) const; + ShapeView() = default; + // NOLINTNEXTLINE + ShapeView(const ShapeProto& shape_proto) + : ArrayRef(shape_proto.dim().data(), shape_proto.dim_size()){}; + // NOLINTNEXTLINE + ShapeView(const Shape& shape) + : ArrayRef(shape.dim_vec().data(), shape.dim_vec().size()){}; - void set_ptr(DimType* ptr) { ptr_ = ptr; } + using ArrayRef::ArrayRef; - protected: - DimType* dim_ptr() const { return ptr_; } - - private: - DimType* ptr_; - int64_t num_axes_; -}; + const DimType* ptr() const { return this->data(); } -class ShapeView final : public ShapeViewBase { - public: - ShapeView() : ShapeViewBase(nullptr, 0) {} - ShapeView(const int64_t* ptr, int64_t num_axes) : ShapeViewBase(ptr, num_axes) {} - ShapeView(const ShapeProto& shape_proto); - ShapeView(const Shape& shape); - ShapeView(const ShapeView& rhs) = default; - ~ShapeView() = default; + void ToDimVector(DimVector* dim_vec) const; + void ToShape(Shape* shape) const; }; -std::ostream& operator<<(std::ostream& out, const ShapeView& shape); +std::ostream& operator<<(std::ostream& out, ShapeView shape); -class MutShapeView final : public ShapeViewBase { +class MutShapeView final : public MutableArrayRef, public MutShapeMixIn { public: - MutShapeView() : ShapeViewBase(nullptr, 0) {} - MutShapeView(int64_t* ptr, int64_t num_axes) : ShapeViewBase(ptr, num_axes) {} - MutShapeView(const MutShapeView& rhs) = default; - ~MutShapeView() = default; + using MutableArrayRef::MutableArrayRef; + // NOLINTNEXTLINE + MutShapeView(Shape& shape) + : MutableArrayRef(shape.dim_vec().data(), shape.dim_vec().size()){}; - int64_t* mut_ptr() const { return dim_ptr(); } - void Set(int64_t axis, int64_t val); + int64_t* mut_ptr() const { return this->data(); } - void set_shape(const Shape& val); - void set_shape(const ShapeView& shape); + void set_shape(ShapeView shape); }; -template -bool ShapeViewBase::operator==(const ShapeViewBase& rhs) const { - if (this->NumAxes() != rhs.NumAxes()) { return false; } - FOR_RANGE(int, i, 0, this->NumAxes()) { - if (At(i) != rhs.At(i)) { return false; } - } - return true; -} - } // namespace oneflow #endif // ONEFLOW_CORE_REGISTER_SHAPE_VIEW_H_ diff 
--git a/oneflow/core/eager/cpu_blob_instruction_type.cpp b/oneflow/core/common/singleton_ptr.h similarity index 55% rename from oneflow/core/eager/cpu_blob_instruction_type.cpp rename to oneflow/core/common/singleton_ptr.h index b33a1e607c2..eecb0a4cdee 100644 --- a/oneflow/core/eager/cpu_blob_instruction_type.cpp +++ b/oneflow/core/common/singleton_ptr.h @@ -13,21 +13,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/eager/blob_instruction_type.h" -#include "oneflow/core/vm/cpu_stream_type.h" +#ifndef ONEFLOW_CORE_COMMON_SINGLETON_PTR_H_ +#define ONEFLOW_CORE_COMMON_SINGLETON_PTR_H_ + +#include namespace oneflow { -namespace vm { -class CpuAccessBlobByCallbackInstructionType final : public AccessBlobByCallbackInstructionType { - public: - CpuAccessBlobByCallbackInstructionType() = default; - ~CpuAccessBlobByCallbackInstructionType() override = default; +namespace private_detail { + +template +const T* GlobalSingletonPtr() { + static std::unique_ptr value(new T()); + return value.get(); +} - using stream_type = vm::CpuStreamType; -}; -COMMAND(vm::RegisterInstructionType( - "cpu.AccessBlobByCallback")); +} // namespace private_detail + +template +const T* SingletonPtr() { + thread_local const T* value = private_detail::GlobalSingletonPtr(); + return value; +} -} // namespace vm } // namespace oneflow + +#endif // ONEFLOW_CORE_COMMON_SINGLETON_PTR_H_ diff --git a/oneflow/core/common/steady_vector.h b/oneflow/core/common/steady_vector.h new file mode 100644 index 00000000000..f2a7e06877a --- /dev/null +++ b/oneflow/core/common/steady_vector.h @@ -0,0 +1,102 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_COMMON_STEADY_VECTOR_H_ +#define ONEFLOW_CORE_COMMON_STEADY_VECTOR_H_ + +#include +#include +#include +#include +#include + +namespace oneflow { + +template +class SteadyVector { + public: + SteadyVector() : size_(0) {} + ~SteadyVector() = default; + + using value_type = const T; + using size_type = size_t; + + // thread safe. + size_t size() const { return size_; } + + // thread safe. + const T& at(size_t index) const { + CHECK_GE(index, 0); + CHECK_LT(index, size_); + return (*this)[index]; + } + + // thread safe. 
+ const T& operator[](size_t index) const { + int gran = 0; + size_t start = 0; + GetGranularityAndStart(index, &gran, &start); + return granularity2data_[gran].get()[index - start]; + } + + void push_back(const T& elem) { *MutableOrAdd(size_) = elem; } + + // `index` shoule be <= size() + T* MutableOrAdd(size_t index) { + std::unique_lock lock(mutex_); + size_t size = size_; + CHECK_LE(index, size) << "index out of range"; + if (index == size) { + int granularity = GetGranularity(size); + if (size + 1 == (1 << granularity)) { + CHECK_LT(granularity, N); + granularity2data_[granularity].reset(new T[1 << granularity]); + } + ++size_; + } + return Mutable(index); + } + + private: + T* Mutable(size_t index) { + int gran = 0; + size_t start = 0; + GetGranularityAndStart(index, &gran, &start); + return &granularity2data_[gran].get()[index - start]; + } + + static void GetGranularityAndStart(size_t index, int* gran, size_t* start) { + *gran = GetGranularity(index); + *start = (1 << *gran) - 1; + } + +#ifdef __GNUC__ +#define LOG2(x) ((unsigned)(8 * sizeof(unsigned long long) - __builtin_clzll((x)) - 1)) +#else +#define LOG2(x) std::log2(x) +#endif + + static int GetGranularity(size_t index) { return LOG2(index + 1); } + +#undef LOG2 + + std::atomic size_; + std::mutex mutex_; + std::array, N> granularity2data_; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_COMMON_STEADY_VECTOR_H_ diff --git a/oneflow/core/vm/stream_desc.cpp b/oneflow/core/common/steady_vector_test.cpp similarity index 51% rename from oneflow/core/vm/stream_desc.cpp rename to oneflow/core/common/steady_vector_test.cpp index d026186d935..bfc5fdb19b8 100644 --- a/oneflow/core/vm/stream_desc.cpp +++ b/oneflow/core/common/steady_vector_test.cpp @@ -13,24 +13,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/vm/stream_desc.h" +#include "gtest/gtest.h" +#include "oneflow/core/common/steady_vector.h" namespace oneflow { -namespace vm { +namespace test { -void StreamDesc::__Init__(const StreamType* stream_type, int32_t num_streams_per_machine, - int32_t num_streams_per_thread) { - set_stream_type(stream_type); - set_num_streams_per_machine(num_streams_per_machine); - set_num_streams_per_thread(num_streams_per_thread); +void TestSteadyVector(int granularity) { + CHECK_GT(granularity, 0); + SteadyVector vec; + ASSERT_EQ(vec.size(), 0); + for (int i = 0; i < (1 << granularity); ++i) { + vec.push_back(i); + ASSERT_EQ(vec.at(i), i); + ASSERT_EQ(vec.size(), i + 1); + } } -int32_t StreamDesc::num_threads() const { - int32_t num_devices = num_streams_per_machine(); - if (num_devices == 0) { return 0; } - CHECK_EQ(num_devices % num_streams_per_thread(), 0); - return num_devices / num_streams_per_thread(); -} +TEST(SteadyVector, simple) { TestSteadyVector(6); } -} // namespace vm +} // namespace test } // namespace oneflow diff --git a/oneflow/core/common/stream_role.h b/oneflow/core/common/stream_role.h index 27fdd4256e0..9e7e5b47fa5 100644 --- a/oneflow/core/common/stream_role.h +++ b/oneflow/core/common/stream_role.h @@ -19,44 +19,44 @@ limitations under the License. 
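// Quick usage sketch for the SteadyVector added above (illustrative only). Elements
// live in power-of-two segments that are allocated once and never reallocated, so
// indexed reads take no lock and references stay valid while other threads keep
// appending under the internal mutex. The second template parameter bounds the
// number of segments.
#include "oneflow/core/common/steady_vector.h"

static void SteadyVectorExample() {
  oneflow::SteadyVector<int, 20> vec;
  for (int i = 0; i < 100; ++i) { vec.push_back(i); }
  const int& first = vec[0];  // reference taken before further growth...
  for (int i = 100; i < 1000; ++i) { vec.push_back(i); }
  (void)first;                // ...remains valid: segments are never moved
}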
#include #include #include "oneflow/core/common/preprocessor.h" +#include "glog/logging.h" namespace oneflow { -#define STREAM_ROLE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(kCompute) \ - OF_PP_MAKE_TUPLE_SEQ(kHost2Device) \ - OF_PP_MAKE_TUPLE_SEQ(kDevice2Host) \ - OF_PP_MAKE_TUPLE_SEQ(kSyncedLaunchedCommNet) \ - OF_PP_MAKE_TUPLE_SEQ(kAsyncedLaunchedCommNet) \ - OF_PP_MAKE_TUPLE_SEQ(kCriticalSection) - enum class StreamRole { kInvalid = 0, -#define DECLARE_STREAM_ROLE(stream_role) stream_role, - OF_PP_FOR_EACH_TUPLE(DECLARE_STREAM_ROLE, STREAM_ROLE_SEQ) -#undef DECLARE_STREAM_ROLE + kCompute, + kHost2Device, + kDevice2Host, + kSyncedLaunchedCommNet, + kAsyncedLaunchedCommNet, + kBarrier, + kCriticalSection, + kLazyJobLauncher }; -static constexpr int kStreamRoleSize = 1 + OF_PP_SEQ_SIZE(STREAM_ROLE_SEQ); - -// Act as a class for overloading functions -template -struct StreamRoleCase {}; - -template -auto StreamRoleSwitch(StreamRole stream_role, Args&&... args) - -> decltype(Functor::Case(StreamRoleCase(), - std::forward(args)...)) { - switch (stream_role) { -#define MAKE_ENTRY(stream_role) \ - case StreamRole::stream_role: \ - return Functor::Case(StreamRoleCase(), std::forward(args)...); - OF_PP_FOR_EACH_TUPLE(MAKE_ENTRY, STREAM_ROLE_SEQ) -#undef MAKE_ENTRY - default: - return Functor::Case(StreamRoleCase(), std::forward(args)...); +template +struct StreamRoleVisitor { + template + static auto Visit(StreamRole stream_role, Args&&... args) { + switch (stream_role) { + case StreamRole::kInvalid: LOG(FATAL) << "invalid stream role"; + case StreamRole::kCompute: return DerivedT::VisitCompute(std::forward(args)...); + case StreamRole::kHost2Device: return DerivedT::VisitHost2Device(std::forward(args)...); + case StreamRole::kDevice2Host: return DerivedT::VisitDevice2Host(std::forward(args)...); + case StreamRole::kSyncedLaunchedCommNet: + return DerivedT::VisitSyncedLaunchedCommNet(std::forward(args)...); + case StreamRole::kAsyncedLaunchedCommNet: + return DerivedT::VisitAsyncedLaunchedCommNet(std::forward(args)...); + case StreamRole::kBarrier: return DerivedT::VisitBarrier(std::forward(args)...); + case StreamRole::kCriticalSection: + return DerivedT::VisitCriticalSection(std::forward(args)...); + case StreamRole::kLazyJobLauncher: + return DerivedT::VisitLazyJobLauncher(std::forward(args)...); + } + LOG(FATAL) << "invalid stream role"; } -} +}; } // namespace oneflow diff --git a/oneflow/core/common/stride.cpp b/oneflow/core/common/stride.cpp index 40da3972fe8..38552a832f9 100644 --- a/oneflow/core/common/stride.cpp +++ b/oneflow/core/common/stride.cpp @@ -23,15 +23,15 @@ namespace oneflow { Stride::Stride(const Shape& shape) { if (shape.is_initialized()) { const int64_t ndim = shape.NumAxes(); - stride_vec_.resize(shape.NumAxes()); + resize(shape.NumAxes()); if (ndim > 0 && shape.elem_cnt() > 0) { - std::exclusive_scan(shape.dim_vec().rbegin(), shape.dim_vec().rend(), stride_vec_.rbegin(), 1, + std::exclusive_scan(shape.dim_vec().rbegin(), shape.dim_vec().rend(), rbegin(), (int64_t)1, std::multiplies<>{}); } else if (ndim > 0 && shape.elem_cnt() == 0) { // 0-size shape std::vector tmp_shape(ndim); for (int64_t i = 0; i < ndim; ++i) { tmp_shape[i] = shape.At(i) > 0 ? 
shape.At(i) : 1; } - std::exclusive_scan(tmp_shape.rbegin(), tmp_shape.rend(), stride_vec_.rbegin(), 1, + std::exclusive_scan(tmp_shape.rbegin(), tmp_shape.rend(), rbegin(), (int64_t)1, std::multiplies<>{}); } } @@ -39,45 +39,29 @@ Stride::Stride(const Shape& shape) { Stride::Stride(const std::shared_ptr& shape) : Stride(*shape) {} -Stride::Stride(const std::initializer_list& stride_vec) : stride_vec_(stride_vec) {} -Stride::Stride(const DimVector& stride_vec) : stride_vec_(stride_vec) {} -Stride::Stride(DimVector&& stride_vec) : stride_vec_(std::move(stride_vec)) {} -Stride::Stride(const Int64ListProto& stride_proto) { - stride_vec_.assign(stride_proto.dim().begin(), stride_proto.dim().end()); -} - -Stride& Stride::assign(const DimVector& stride_vec) { - stride_vec_ = stride_vec; - return *this; -} +Stride::Stride(const Int64ListProto& stride_proto) + : DimVector(stride_proto.dim().begin(), stride_proto.dim().end()) {} Stride& Stride::CheckNumAxesIdenticalAndAssign(const Stride& stride) { - CHECK_EQ(NumAxes(), stride.NumAxes()); - stride_vec_.assign(stride.StrideVec().begin(), stride.StrideVec().end()); + CHECK_EQ(size(), stride.size()); + assign(stride); return *this; } -Stride& Stride::operator=(const Stride& stride) { - stride_vec_ = stride.stride_vec_; - return *this; -} - -bool Stride::operator==(const Stride& rhs) const { return stride_vec_ == rhs.stride_vec_; } - std::string Stride::ToString() const { std::stringstream ss; int32_t idx = 0; ss << "("; - for (int64_t dim : stride_vec_) { + for (int64_t dim : *this) { ss << dim; - if (++idx != stride_vec_.size() || stride_vec_.size() == 1) { ss << ","; } + if (++idx != this->size() || this->size() == 1) { ss << ","; } } ss << ")"; return ss.str(); } void Stride::ToProto(Int64ListProto* ret) const { - *(ret->mutable_dim()) = PbRf(stride_vec_.begin(), stride_vec_.end()); + *(ret->mutable_dim()) = PbRf(begin(), end()); } } // namespace oneflow diff --git a/oneflow/core/common/stride.h b/oneflow/core/common/stride.h index 0de42636848..5f583bea614 100644 --- a/oneflow/core/common/stride.h +++ b/oneflow/core/common/stride.h @@ -18,6 +18,7 @@ limitations under the License. 
#define ONEFLOW_CORE_FRAMEWORK_STRIDE_H_ #include "oneflow/core/common/shape.h" +#include "oneflow/core/common/shape_vec.h" #include "oneflow/core/common/sequential.pb.h" #include "oneflow/core/common/util.h" @@ -25,34 +26,18 @@ namespace oneflow { class Int64ListProto; -class Stride final { +class Stride final : public DimVector { public: Stride() = default; + using DimVector::DimVector; explicit Stride(const Shape& shape); explicit Stride(const std::shared_ptr& shape); - explicit Stride(DimVector&& stride_vec); - explicit Stride(const DimVector& stride_vec); explicit Stride(const Int64ListProto& stride_proto); - Stride(const std::initializer_list& stride_vec); - Stride& operator=(const Stride& stride); - Stride& assign(const DimVector& stride_vec); Stride& CheckNumAxesIdenticalAndAssign(const Stride& stride); ~Stride() = default; - bool operator==(const Stride& rhs) const; - bool operator!=(const Stride& rhs) const { return !(*this == rhs); } - std::string ToString() const; void ToProto(Int64ListProto*) const; - - // Getters and Setters - const DimVector& StrideVec() const { return stride_vec_; } - int64_t NumAxes() const { return stride_vec_.size(); } - int64_t At(int64_t index) const { return stride_vec_.at(index); } - void Set(int64_t index, int64_t val) { stride_vec_.at(index) = val; } - - private: - DimVector stride_vec_; }; } // namespace oneflow @@ -62,8 +47,8 @@ namespace std { template<> struct hash { size_t operator()(const oneflow::Stride& stride) const { - size_t ret = stride.NumAxes(); - FOR_RANGE(int, i, 0, stride.NumAxes()) { oneflow::AddHash(&ret, stride.At(i)); } + size_t ret = stride.size(); + FOR_RANGE(int, i, 0, stride.size()) { oneflow::AddHash(&ret, stride.at(i)); } return ret; } }; diff --git a/oneflow/core/common/tensor_buffer.h b/oneflow/core/common/tensor_buffer.h index 4c027613844..8fd8c1270d6 100644 --- a/oneflow/core/common/tensor_buffer.h +++ b/oneflow/core/common/tensor_buffer.h @@ -18,6 +18,7 @@ limitations under the License. 
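
With Stride deriving from DimVector directly, the stride computation in the constructor is a reversed std::exclusive_scan; the explicit (int64_t)1 initial value also keeps the running product in 64-bit rather than int. A self-contained illustration of the scan (plain std::vector<int64_t> in place of DimVector):

#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>  // std::exclusive_scan (C++17)
#include <vector>

int main() {
  // Row-major strides: stride[i] is the product of the dims to the right of i.
  std::vector<int64_t> shape{2, 3, 4};
  std::vector<int64_t> stride(shape.size());
  std::exclusive_scan(shape.rbegin(), shape.rend(), stride.rbegin(), int64_t{1},
                      std::multiplies<>{});
  for (int64_t s : stride) { std::printf("%lld ", static_cast<long long>(s)); }  // prints: 12 4 1
  std::printf("\n");
  return 0;
}
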
#include "oneflow/core/common/util.h" #include "oneflow/core/common/shape.h" +#include "oneflow/core/common/shape_view.h" #include "oneflow/core/common/data_type.h" namespace oneflow { @@ -82,6 +83,7 @@ class TensorBuffer final { bool is_allocated() const { return bool(impl_); } const Shape& shape() const; + ShapeView shape_view() const { return shape(); } DataType data_type() const; int64_t elem_cnt() const { return shape().elem_cnt(); } size_t nbytes() const { return elem_cnt() * GetSizeOfDataType(data_type()); } diff --git a/oneflow/core/cuda/softmax.cuh b/oneflow/core/cuda/softmax.cuh index 940cf45e19c..160daeb7405 100644 --- a/oneflow/core/cuda/softmax.cuh +++ b/oneflow/core/cuda/softmax.cuh @@ -712,7 +712,7 @@ template inline typename std::enable_if::value, cudaError_t>::type DispatchSoftmax(cudaStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { - if (cols <= 1024) { + if (cols < 1024) { return DispatchSoftmaxWarpImpl( stream, load, store, rows, cols); } else { diff --git a/oneflow/core/device/cuda_util.cpp b/oneflow/core/device/cuda_util.cpp index 0049edd41a9..c1cc28374ca 100644 --- a/oneflow/core/device/cuda_util.cpp +++ b/oneflow/core/device/cuda_util.cpp @@ -51,8 +51,8 @@ const char* CublasGetErrorString(cublasStatus_t error) { #if CUDA_VERSION >= 6050 case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; #endif + default: return "Unknown cublas status"; } - return "Unknown cublas status"; } const char* CurandGetErrorString(curandStatus_t error) { @@ -70,8 +70,8 @@ const char* CurandGetErrorString(curandStatus_t error) { case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + default: return "Unknown curand status"; } - return "Unknown curand status"; } #if CUDA_VERSION >= 10020 @@ -89,8 +89,8 @@ const char* NvjpegGetErrorString(nvjpegStatus_t error) { case NVJPEG_STATUS_INTERNAL_ERROR: return "NVJPEG_STATUS_INTERNAL_ERROR"; case NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED: return "NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED"; + default: return "Unknown nvjpeg status"; } - return "Unknown nvjpeg status"; } #endif diff --git a/oneflow/core/eager/blob_instruction_type.cpp b/oneflow/core/eager/blob_instruction_type.cpp index 3a4454ed8d7..65f04e2dbc9 100644 --- a/oneflow/core/eager/blob_instruction_type.cpp +++ b/oneflow/core/eager/blob_instruction_type.cpp @@ -46,7 +46,7 @@ void AccessBlobByCallbackInstructionType::ComputeInstrMsg( const auto* ptr = dynamic_cast(phy_instr_operand.get()); CHECK_NOTNULL(ptr); - DeviceCtx* device_ctx = instr_msg.phy_instr_stream()->device_ctx().get(); + DeviceCtx* device_ctx = instr_msg.stream().device_ctx().get(); auto* blob = ptr->eager_blob_object()->blob(); OfBlob ofblob(device_ctx->stream(), blob); ptr->callback()(reinterpret_cast(&ofblob)); diff --git a/oneflow/core/eager/blob_instruction_type.h b/oneflow/core/eager/blob_instruction_type.h index c3d1d6121b0..b2182dbf703 100644 --- a/oneflow/core/eager/blob_instruction_type.h +++ b/oneflow/core/eager/blob_instruction_type.h @@ -13,17 +13,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifndef ONEFLOW_CORE_EAGER_BLOB_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_EAGER_BLOB_INSTRUCTION_TYPE_H_ + #include "oneflow/core/intrusive/flat_msg_view.h" #include "oneflow/core/vm/instruction_type.h" +#include "oneflow/core/common/stream_role.h" +#include "oneflow/core/common/singleton_ptr.h" +#include "oneflow/core/vm/cuda_optional_event_record_status_querier.h" +#include "oneflow/core/vm/stream.h" +#include "oneflow/core/device/cuda_event.h" namespace oneflow { namespace vm { -class AccessBlobByCallbackInstructionType : public vm::InstructionType { +class AccessBlobByCallbackInstructionType final : public vm::InstructionType { public: AccessBlobByCallbackInstructionType() = default; ~AccessBlobByCallbackInstructionType() override = default; + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "AccessBlobByCallback"; + } void Compute(vm::Instruction* instruction) const override; void ComputeInFuseMode(vm::InstructionMsg* instruction_msg) const override; @@ -31,13 +42,86 @@ class AccessBlobByCallbackInstructionType : public vm::InstructionType { void ComputeInstrMsg(const vm::InstructionMsg& instruction_msg) const; }; -class RecordEventInstructionType : public vm::InstructionType { +class CpuRecordEventInstructionType final : public vm::InstructionType { + public: + CpuRecordEventInstructionType() = default; + ~CpuRecordEventInstructionType() override = default; + + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "RecordEvent"; + } + void Compute(vm::Instruction* instruction) const override {} +}; + +#ifdef WITH_CUDA + +class CudaRecordEventInstructionType final : public vm::InstructionType { public: - RecordEventInstructionType() = default; - ~RecordEventInstructionType() override = default; + CudaRecordEventInstructionType() = default; + ~CudaRecordEventInstructionType() override = default; + InstructionFuseType fuse_type() const override { return kEnableInstructionFuseAsTailOnly; } + + void InitInstructionStatus(Instruction* instruction) const override { + auto* status_buffer = instruction->mut_status_buffer(); + auto* stream = instruction->mut_stream(); + instruction->stream_type().InitInstructionStatus(*stream, status_buffer); + auto* event_provider = dynamic_cast(stream->device_ctx().get()); + const auto& cuda_event = CHECK_NOTNULL(event_provider)->GetCudaEvent(); + auto* data_ptr = status_buffer->mut_buffer()->mut_data(); + CudaOptionalEventRecordStatusQuerier::MutCast(data_ptr)->reset_cuda_event(cuda_event); + } + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "RecordEvent"; + } void Compute(vm::Instruction* instruction) const override {} }; +#endif + } // namespace vm + +struct GetRecordEventInstructionType : public StreamRoleVisitor { + static Maybe VisitCompute(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitHost2Device(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitDevice2Host(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitSyncedLaunchedCommNet(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitAsyncedLaunchedCommNet(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitBarrier(DeviceType device_type) { + UNIMPLEMENTED_THEN_RETURN(); + } + static Maybe VisitCriticalSection(DeviceType device_type) { + UNIMPLEMENTED_THEN_RETURN(); + } + static Maybe 
VisitLazyJobLauncher(DeviceType device_type) { + UNIMPLEMENTED_THEN_RETURN(); + } + + private: + static Maybe GetInstructionType(DeviceType device_type) { + if (device_type == DeviceType::kCPU) { + return SingletonPtr(); + } else if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } +}; + } // namespace oneflow +#endif // ONEFLOW_CORE_EAGER_BLOB_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/eager/cpu_opkernel_instruction_type.cpp b/oneflow/core/eager/cpu_opkernel_instruction_type.cpp deleted file mode 100644 index 7d3ee257397..00000000000 --- a/oneflow/core/eager/cpu_opkernel_instruction_type.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/util.h" -#include "oneflow/core/job/job_desc.h" -#include "oneflow/core/eager/opkernel_instruction_type.h" -#include "oneflow/core/vm/stream.h" -#include "oneflow/core/vm/cpu_stream_type.h" -#include "oneflow/core/vm/instruction.h" - -namespace oneflow { -namespace vm { - -class CpuLocalCallOpKernelInstructionType final : public LocalCallOpKernelInstructionType { - public: - CpuLocalCallOpKernelInstructionType() = default; - ~CpuLocalCallOpKernelInstructionType() override = default; - - using stream_type = vm::CpuStreamType; -}; -COMMAND(vm::RegisterInstructionType("cpu.LocalCallOpKernel")); - -} // namespace vm -} // namespace oneflow diff --git a/oneflow/core/eager/critical_section_instruction_type.cpp b/oneflow/core/eager/critical_section_instruction_type.h similarity index 92% rename from oneflow/core/eager/critical_section_instruction_type.cpp rename to oneflow/core/eager/critical_section_instruction_type.h index 1a4bd0b292d..f96b27b3e95 100644 --- a/oneflow/core/eager/critical_section_instruction_type.cpp +++ b/oneflow/core/eager/critical_section_instruction_type.h @@ -13,9 +13,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
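
GetRecordEventInstructionType pairs the new StreamRoleVisitor with per-type SingletonPtr instances, so the RecordEvent instruction type is picked from a (stream role, device type) pair instead of being looked up by a registered name string. The template arguments are elided in the patch text above, so exact signatures aside, the shape of the lookup is roughly this standalone sketch (all names below are stand-ins):

#include <cstdio>

// Stand-ins for vm::InstructionType and its CPU/CUDA RecordEvent subclasses.
struct InstructionType {
  virtual const char* DebugName() const = 0;
  virtual ~InstructionType() = default;
};
struct CpuRecordEvent final : InstructionType {
  const char* DebugName() const override { return "RecordEvent(cpu)"; }
};
struct CudaRecordEvent final : InstructionType {
  const char* DebugName() const override { return "RecordEvent(cuda)"; }
};

// SingletonPtr-style accessor: one lazily constructed instance per type.
template<typename T>
const InstructionType* SingletonPtr() {
  static T instance;
  return &instance;
}

enum class DeviceType { kCPU, kCUDA };

// Device dispatch analogous to GetRecordEventInstructionType's private helper.
const InstructionType* GetRecordEventType(DeviceType device_type) {
  switch (device_type) {
    case DeviceType::kCPU: return SingletonPtr<CpuRecordEvent>();
    case DeviceType::kCUDA: return SingletonPtr<CudaRecordEvent>();
  }
  return nullptr;
}

int main() {
  std::printf("%s\n", GetRecordEventType(DeviceType::kCUDA)->DebugName());  // RecordEvent(cuda)
  return 0;
}
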
*/ +#ifndef ONEFLOW_CORE_EAGER_CRITICAL_SECTION_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_EAGER_CRITICAL_SECTION_INSTRUCTION_TYPE_H_ -#include "oneflow/core/eager/critical_section_stream_type.h" -#include "oneflow/core/eager/critical_section_status_querier.h" +#include "oneflow/core/vm/critical_section_status_querier.h" #include "oneflow/core/eager/critical_section_phy_instr_operand.h" #include "oneflow/core/job/critical_section_instance.h" #include "oneflow/core/framework/nn_graph_if.h" @@ -44,8 +45,9 @@ class CriticalSectionBeginInstructionType final : public InstructionType { CriticalSectionBeginInstructionType() = default; ~CriticalSectionBeginInstructionType() = default; - using stream_type = CriticalSectionStreamType; - + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "CriticalSectionBegin"; + } void Compute(vm::Instruction* instruction) const override { OF_PROFILER_RANGE_GUARD("CriticalSectionBegin"); { @@ -107,8 +109,6 @@ class CriticalSectionBeginInstructionType final : public InstructionType { } }; -COMMAND(RegisterInstructionType("CriticalSectionBegin")); - class CriticalSectionEndInstructionType final : public InstructionType { public: CriticalSectionEndInstructionType(const CriticalSectionEndInstructionType&) = delete; @@ -118,8 +118,9 @@ class CriticalSectionEndInstructionType final : public InstructionType { CriticalSectionEndInstructionType() = default; ~CriticalSectionEndInstructionType() = default; - using stream_type = CriticalSectionStreamType; - + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "CriticalSectionEnd"; + } void Compute(vm::Instruction* instruction) const override { const auto* ptr = instruction->instr_msg().phy_instr_operand().get(); const auto* phy_instr_operand = dynamic_cast(ptr); @@ -130,7 +131,6 @@ class CriticalSectionEndInstructionType final : public InstructionType { } }; -COMMAND(RegisterInstructionType("CriticalSectionEnd")); - } // namespace vm } // namespace oneflow +#endif // ONEFLOW_CORE_EAGER_CRITICAL_SECTION_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/eager/critical_section_phy_instr_operand.cpp b/oneflow/core/eager/critical_section_phy_instr_operand.cpp index ec6facb370d..bc4f2b7d21e 100644 --- a/oneflow/core/eager/critical_section_phy_instr_operand.cpp +++ b/oneflow/core/eager/critical_section_phy_instr_operand.cpp @@ -22,6 +22,7 @@ limitations under the License. 
#include "oneflow/core/device/ep_based_event_record.h" #include "oneflow/core/register/ofblob.h" #include "oneflow/core/common/container_util.h" +#include "oneflow/core/vm/stream.h" namespace oneflow { namespace vm { @@ -38,21 +39,9 @@ void CriticalSectionEndPhyInstrOperand::ForEachMirroredObject( DoEach(CHECK_JUST(eager_blob_object_->compute_local_dep_object())); } -namespace { - -Maybe RawCriticalSectionLocalDepObject() { - const auto& device = JUST(Device::New("cpu")); - return Stream::New(device, StreamRole::kCriticalSection)->mut_schedule_local_dep_object(); -} - -constexpr auto* CriticalSectionLocalDepObject = - DECORATE(&RawCriticalSectionLocalDepObject, ThreadLocal); - -} // namespace - void CriticalSectionBeginPhyInstrOperand::ForEachMutMirroredObject( const std::function& DoEach) const { - DoEach(CHECK_JUST(CriticalSectionLocalDepObject())); + DoEach(vm_stream_->schedule_local_dep_object().get()); } void CriticalSectionBeginPhyInstrOperand::FinishInvalidInterfaceEventRecords() { @@ -121,7 +110,7 @@ void OutputCriticalSectionBeginPhyInstrOperand::AccessBlobByOpName(uint64_t of_b void CriticalSectionEndPhyInstrOperand::ForEachMutMirroredObject( const std::function& DoEach) const { - DoEach(CHECK_JUST(CriticalSectionLocalDepObject())); + DoEach(vm_stream_->schedule_local_dep_object().get()); } } // namespace vm diff --git a/oneflow/core/eager/critical_section_phy_instr_operand.h b/oneflow/core/eager/critical_section_phy_instr_operand.h index f294dde1135..2627c3d6339 100644 --- a/oneflow/core/eager/critical_section_phy_instr_operand.h +++ b/oneflow/core/eager/critical_section_phy_instr_operand.h @@ -33,6 +33,8 @@ using EagerBlobObjectListPtr = namespace vm { +class Stream; + class CriticalSectionBeginPhyInstrOperand : public PhyInstrOperand { public: CriticalSectionBeginPhyInstrOperand(const CriticalSectionBeginPhyInstrOperand&) = delete; @@ -46,10 +48,12 @@ class CriticalSectionBeginPhyInstrOperand : public PhyInstrOperand { const std::shared_ptr& nn_graph, const one::EagerBlobObjectListPtr& eager_blob_objects, const std::shared_ptr>>& - op_name2end_event_record) + op_name2end_event_record, + vm::Stream* vm_stream) : nn_graph_(nn_graph), eager_blob_objects_(eager_blob_objects), - op_name2end_event_record_(op_name2end_event_record) {} + op_name2end_event_record_(op_name2end_event_record), + vm_stream_(vm_stream) {} const std::shared_ptr& nn_graph() const { return nn_graph_; } const one::EagerBlobObjectListPtr& eager_blob_objects() const { return eager_blob_objects_; } @@ -77,6 +81,7 @@ class CriticalSectionBeginPhyInstrOperand : public PhyInstrOperand { std::shared_ptr>> op_name2end_event_record_; HashMap op_name2interface_index_; + vm::Stream* vm_stream_; }; class InputCriticalSectionBeginPhyInstrOperand final : public CriticalSectionBeginPhyInstrOperand { @@ -85,8 +90,10 @@ class InputCriticalSectionBeginPhyInstrOperand final : public CriticalSectionBeg const std::shared_ptr& nn_graph, const one::EagerBlobObjectListPtr& eager_blob_objects, const std::shared_ptr>>& - op_name2end_event_record) - : CriticalSectionBeginPhyInstrOperand(nn_graph, eager_blob_objects, op_name2end_event_record), + op_name2end_event_record, + vm::Stream* vm_stream) + : CriticalSectionBeginPhyInstrOperand(nn_graph, eager_blob_objects, op_name2end_event_record, + vm_stream), input_dependences_(), output_dependences_() { ForEachConstMirroredObject(SetInserter(&input_dependences_)); @@ -141,8 +148,10 @@ class OutputCriticalSectionBeginPhyInstrOperand final : public CriticalSectionBe const std::shared_ptr& 
nn_graph, const one::EagerBlobObjectListPtr& eager_blob_objects, const std::shared_ptr>>& - op_name2end_event_record) - : CriticalSectionBeginPhyInstrOperand(nn_graph, eager_blob_objects, op_name2end_event_record), + op_name2end_event_record, + vm::Stream* vm_stream) + : CriticalSectionBeginPhyInstrOperand(nn_graph, eager_blob_objects, op_name2end_event_record, + vm_stream), input_dependences_(), output_dependences_() { ForEachConstMirroredObject(SetInserter(&input_dependences_)); @@ -195,8 +204,9 @@ class OutputCriticalSectionBeginPhyInstrOperand final : public CriticalSectionBe class CriticalSectionEndPhyInstrOperand : public PhyInstrOperand { public: CriticalSectionEndPhyInstrOperand(const std::shared_ptr& eager_blob_object, - const std::shared_ptr& event_record) - : eager_blob_object_(eager_blob_object), event_record_(event_record) {} + const std::shared_ptr& event_record, + vm::Stream* vm_stream) + : eager_blob_object_(eager_blob_object), event_record_(event_record), vm_stream_(vm_stream) {} virtual ~CriticalSectionEndPhyInstrOperand() = default; const std::shared_ptr& event_record() const { return event_record_; } @@ -208,13 +218,15 @@ class CriticalSectionEndPhyInstrOperand : public PhyInstrOperand { private: std::shared_ptr eager_blob_object_; std::shared_ptr event_record_; + vm::Stream* vm_stream_; }; class InputCriticalSecondEndPhyInstrOperand final : public CriticalSectionEndPhyInstrOperand { public: InputCriticalSecondEndPhyInstrOperand(const std::shared_ptr& eager_blob_object, - const std::shared_ptr& event_record) - : CriticalSectionEndPhyInstrOperand(eager_blob_object, event_record), + const std::shared_ptr& event_record, + vm::Stream* vm_stream) + : CriticalSectionEndPhyInstrOperand(eager_blob_object, event_record, vm_stream), input_dependences_(), output_dependences_() { ForEachConstMirroredObject(SetInserter(&input_dependences_)); @@ -241,8 +253,9 @@ class InputCriticalSecondEndPhyInstrOperand final : public CriticalSectionEndPhy class OutputCriticalSecondEndPhyInstrOperand final : public CriticalSectionEndPhyInstrOperand { public: OutputCriticalSecondEndPhyInstrOperand(const std::shared_ptr& eager_blob_object, - const std::shared_ptr& event_record) - : CriticalSectionEndPhyInstrOperand(eager_blob_object, event_record), + const std::shared_ptr& event_record, + vm::Stream* vm_stream) + : CriticalSectionEndPhyInstrOperand(eager_blob_object, event_record, vm_stream), input_dependences_(), output_dependences_() { ForEachConstMirroredObject(SetInserter(&input_dependences_)); diff --git a/oneflow/core/eager/cuda_blob_instruction_type.cpp b/oneflow/core/eager/cuda_blob_instruction_type.cpp deleted file mode 100644 index 940afcd6d16..00000000000 --- a/oneflow/core/eager/cuda_blob_instruction_type.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
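
The critical-section operands now carry the vm::Stream* they were built for and report that stream's scheduling dependency from ForEachMutMirroredObject, rather than reaching a thread-local critical-section stream. A stripped-down sketch of that ownership pattern (stand-in types, not the OneFlow classes):

#include <functional>
#include <iostream>

struct LocalDepObject { const char* name; };

// Stand-in for vm::Stream: owns the scheduling dependency used to serialize work on it.
struct Stream {
  LocalDepObject schedule_local_dep_object{"stream.schedule_dep"};
};

// Stand-in for a phy-instr operand: keeps a non-owning pointer to the stream it was built for.
struct Operand {
  explicit Operand(Stream* vm_stream) : vm_stream_(vm_stream) {}

  void ForEachMutDependence(const std::function<void(LocalDepObject*)>& DoEach) const {
    DoEach(&vm_stream_->schedule_local_dep_object);  // serialize against other work on this stream
  }

 private:
  Stream* vm_stream_;  // not owned; the stream outlives the operand
};

int main() {
  Stream stream;
  Operand operand(&stream);
  operand.ForEachMutDependence([](LocalDepObject* dep) { std::cout << dep->name << "\n"; });
  return 0;
}
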
-*/ -#include "oneflow/core/vm/cpu_stream_type.h" -#ifdef WITH_CUDA -#include "oneflow/core/eager/blob_instruction_type.h" -#include "oneflow/core/vm/cuda_stream_type.h" -#include "oneflow/core/vm/cuda_optional_event_record_status_querier.h" -#include "oneflow/core/vm/stream.h" -#include "oneflow/core/vm/async_cuda_stream_type.h" -#include "oneflow/core/device/cuda_event.h" - -namespace oneflow { -namespace vm { - -class GpuAccessBlobByCallbackInstructionType final : public AccessBlobByCallbackInstructionType { - public: - GpuAccessBlobByCallbackInstructionType() = default; - ~GpuAccessBlobByCallbackInstructionType() override = default; - using stream_type = vm::CudaStreamType; -}; -COMMAND(vm::RegisterInstructionType( - "cuda.AccessBlobByCallback")); - -class GpuRecordEventInstructionType : public RecordEventInstructionType { - public: - GpuRecordEventInstructionType() = default; - ~GpuRecordEventInstructionType() override = default; - using stream_type = vm::CudaStreamType; - - InstructionFuseType fuse_type() const override { return kEnableInstructionFuseAsTailOnly; } - - void InitInstructionStatus(Instruction* instruction) const override { - auto* status_buffer = instruction->mut_status_buffer(); - auto* stream = instruction->mut_stream(); - instruction->stream_type().InitInstructionStatus(*stream, status_buffer); - auto* event_provider = dynamic_cast(stream->device_ctx().get()); - const auto& cuda_event = CHECK_NOTNULL(event_provider)->GetCudaEvent(); - auto* data_ptr = status_buffer->mut_buffer()->mut_data(); - CudaOptionalEventRecordStatusQuerier::MutCast(data_ptr)->reset_cuda_event(cuda_event); - } -}; -COMMAND(vm::RegisterInstructionType("cuda.RecordEvent")); - -} // namespace vm -} // namespace oneflow -#endif diff --git a/oneflow/core/eager/cuda_opkernel_instruction_type.cpp b/oneflow/core/eager/cuda_opkernel_instruction_type.cpp deleted file mode 100644 index d6a431d02cd..00000000000 --- a/oneflow/core/eager/cuda_opkernel_instruction_type.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_CUDA - -#include "oneflow/core/common/util.h" -#include "oneflow/core/job/job_desc.h" -#include "oneflow/core/eager/opkernel_instruction_type.h" -#include "oneflow/core/vm/stream.h" -#include "oneflow/core/vm/cuda_stream_type.h" -#include "oneflow/core/vm/async_cuda_stream_type.h" -#include "oneflow/core/vm/cuda_copy_h2d_stream_type.h" -#include "oneflow/core/vm/cuda_copy_d2h_stream_type.h" -#include "oneflow/core/vm/instruction.h" - -namespace oneflow { -namespace vm { - -class CudaLocalCallOpKernelInstructionType final : public LocalCallOpKernelInstructionType { - public: - CudaLocalCallOpKernelInstructionType() = default; - ~CudaLocalCallOpKernelInstructionType() override = default; - - using stream_type = vm::CudaStreamType; -}; -COMMAND( - vm::RegisterInstructionType("cuda.LocalCallOpKernel")); - -class AsyncCudaLocalCallOpKernelInstructionType final : public LocalCallOpKernelInstructionType { - public: - AsyncCudaLocalCallOpKernelInstructionType() = default; - ~AsyncCudaLocalCallOpKernelInstructionType() override = default; - - using stream_type = vm::AsyncCudaStreamType; -}; -COMMAND(vm::RegisterInstructionType( - "async.cuda.LocalCallOpKernel")); - -class CudaH2DLocalCallOpKernelInstructionType final : public LocalCallOpKernelInstructionType { - public: - CudaH2DLocalCallOpKernelInstructionType() = default; - ~CudaH2DLocalCallOpKernelInstructionType() override = default; - - using stream_type = vm::CudaCopyH2DStreamType; -}; -COMMAND(vm::RegisterInstructionType( - "cuda_h2d.LocalCallOpKernel")); - -class CudaD2HLocalCallOpKernelInstructionType final : public LocalCallOpKernelInstructionType { - public: - CudaD2HLocalCallOpKernelInstructionType() = default; - ~CudaD2HLocalCallOpKernelInstructionType() override = default; - - using stream_type = vm::CudaCopyD2HStreamType; -}; -COMMAND(vm::RegisterInstructionType( - "cuda_d2h.LocalCallOpKernel")); - -} // namespace vm -} // namespace oneflow - -#endif diff --git a/oneflow/core/eager/eager_blob_object.h b/oneflow/core/eager/eager_blob_object.h index 6003b690f94..cb10a32c1d1 100644 --- a/oneflow/core/eager/eager_blob_object.h +++ b/oneflow/core/eager/eager_blob_object.h @@ -52,15 +52,15 @@ class TensorStorage { blob_bytes_ = bytes; } - const Optional>& producer_stream() const { return producer_stream_; } - Maybe init_producer_stream(Symbol producer_stream) { + const Optional>& producer_stream() const { return producer_stream_; } + Maybe init_producer_stream(Symbol<::oneflow::Stream> producer_stream) { CHECK_OR_RETURN(!producer_stream_.has_value()); producer_stream_ = producer_stream; return Maybe::Ok(); } - const Optional>& last_used_stream() const { return last_used_stream_; } - void set_last_used_stream(Symbol last_used_stream) { + const Optional>& last_used_stream() const { return last_used_stream_; } + void set_last_used_stream(Symbol<::oneflow::Stream> last_used_stream) { last_used_stream_ = last_used_stream; } @@ -77,8 +77,8 @@ class TensorStorage { size_t blob_bytes_; std::unique_ptr> blob_dptr_; std::unique_ptr non_pod_allocator_; - Optional> producer_stream_; - Optional> last_used_stream_; + Optional> producer_stream_; + Optional> last_used_stream_; std::vector> storage_delete_hooks_; }; @@ -125,17 +125,17 @@ class EagerBlobObject final { void set_is_shape_synced(bool val) { is_shape_synced_ = val; } - const Optional>& producer_stream() const { + const Optional>& producer_stream() const { return tensor_storage_->producer_stream(); } - Maybe init_producer_stream(Symbol producer_stream) { + Maybe 
init_producer_stream(Symbol<::oneflow::Stream> producer_stream) { return tensor_storage_->init_producer_stream(producer_stream); } - const Optional>& last_used_stream() const { + const Optional>& last_used_stream() const { return tensor_storage_->last_used_stream(); } - void set_last_used_stream(Symbol last_used_stream) { + void set_last_used_stream(Symbol<::oneflow::Stream> last_used_stream) { tensor_storage_->set_last_used_stream(last_used_stream); } diff --git a/oneflow/core/eager/lazy_job_instruction_type.cpp b/oneflow/core/eager/lazy_job_instruction_type.h similarity index 93% rename from oneflow/core/eager/lazy_job_instruction_type.cpp rename to oneflow/core/eager/lazy_job_instruction_type.h index 369d602e70e..b2b8949fff3 100644 --- a/oneflow/core/eager/lazy_job_instruction_type.cpp +++ b/oneflow/core/eager/lazy_job_instruction_type.h @@ -13,9 +13,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef ONEFLOW_CORE_EAGER_LAZY_JOB_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_EAGER_LAZY_JOB_INSTRUCTION_TYPE_H_ -#include "oneflow/core/eager/lazy_job_stream_type.h" -#include "oneflow/core/eager/lazy_job_device_context.h" +#include "oneflow/core/vm/lazy_job_device_context.h" #include "oneflow/core/eager/lazy_job_phy_instr_operand.h" #include "oneflow/core/framework/nn_graph_if.h" #include "oneflow/core/common/container_util.h" @@ -33,8 +34,6 @@ limitations under the License. namespace oneflow { -namespace { - class LazyJobInstance final : public JobInstance { public: LazyJobInstance(const LazyJobInstance&) = delete; @@ -62,8 +61,6 @@ class LazyJobInstance final : public JobInstance { const std::function finish_cb_; }; -} // namespace - namespace vm { class LaunchLazyJobInstructionType final : public InstructionType { // NOLINT @@ -72,7 +69,10 @@ class LaunchLazyJobInstructionType final : public InstructionType { // NOLINT LaunchLazyJobInstructionType(LaunchLazyJobInstructionType&&) = delete; LaunchLazyJobInstructionType() = default; ~LaunchLazyJobInstructionType() = default; - using stream_type = LazyJobStreamType; + + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "LaunchLazyJob"; + } void Compute(vm::Instruction* instruction) const override { const auto& cur_nn_graph = GetCurNNGraph(instruction); auto* device_ctx = GetLazyJobDeviceCtx(instruction); @@ -127,7 +127,6 @@ class LaunchLazyJobInstructionType final : public InstructionType { // NOLINT } }; -COMMAND(RegisterInstructionType("LaunchLazyJob")); - } // namespace vm } // namespace oneflow +#endif // ONEFLOW_CORE_EAGER_LAZY_JOB_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/eager/lazy_job_phy_instr_operand.cpp b/oneflow/core/eager/lazy_job_phy_instr_operand.cpp index 4eed1c2e3ea..ab9c2c1c375 100644 --- a/oneflow/core/eager/lazy_job_phy_instr_operand.cpp +++ b/oneflow/core/eager/lazy_job_phy_instr_operand.cpp @@ -13,42 +13,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
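
TensorStorage keeps its stream bookkeeping as Symbol<::oneflow::Stream> (fully qualified, presumably to avoid ambiguity with vm::Stream): the producer stream may be initialized exactly once, while the last-used stream is overwritten on every access. A small standalone analogue using std::optional:

#include <cassert>
#include <optional>
#include <string>

// Minimal analogue of the producer/last-used stream bookkeeping in TensorStorage.
class StorageStreams {
 public:
  // Set-once: the stream that produced the tensor's memory.
  void InitProducerStream(const std::string& stream) {
    assert(!producer_stream_.has_value() && "producer stream may only be initialized once");
    producer_stream_ = stream;
  }
  // Overwritten every time another stream touches the storage.
  void SetLastUsedStream(const std::string& stream) { last_used_stream_ = stream; }

  const std::optional<std::string>& producer_stream() const { return producer_stream_; }
  const std::optional<std::string>& last_used_stream() const { return last_used_stream_; }

 private:
  std::optional<std::string> producer_stream_;
  std::optional<std::string> last_used_stream_;
};

int main() {
  StorageStreams s;
  s.InitProducerStream("cuda:0/compute");
  s.SetLastUsedStream("cuda:0/compute");
  s.SetLastUsedStream("cuda:0/d2h");  // a later access from another stream just overwrites
  assert(*s.producer_stream() == "cuda:0/compute");
  assert(*s.last_used_stream() == "cuda:0/d2h");
  return 0;
}
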
*/ +#include "oneflow/core/common/decorator.h" #include "oneflow/core/eager/lazy_job_phy_instr_operand.h" #include "oneflow/core/common/container_util.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/stream.h" +#include "oneflow/core/vm/virtual_machine.h" namespace oneflow { namespace vm { -namespace { - -#ifdef WITH_CUDA -Maybe RawGetEagerNcclLocalDepObject(StreamRole stream_role) { - // NOTE(chengcheng): - // Lazy Job instruction need mutual exclusion nccl with Eager nccl. However, when the number of - // processes is more than the number of physical GPUs, the following processes will make an - // error when using local rank to create a EagerNcclLocalDepObject, but we only need an legal - // device so we use device 0. - const auto& device = JUST(Device::New("cpu", 0)); - const auto& stream = Stream::New(device, stream_role); - const auto& local_dep_object = stream->mut_transport_local_dep_object(); - CHECK_OR_RETURN(local_dep_object.has_value()); - return JUST(local_dep_object); -} - -static constexpr auto* GetEagerNcclLocalDepObject = - DECORATE(&RawGetEagerNcclLocalDepObject, ThreadLocalCopiable); -#endif // WITH_CUDA - -} // namespace - void LaunchLazyJobPhyInstrOperand::ForEachMutMirroredObject( const std::function& DoEach) const { for (const auto& eager_blob_object : *param_blob_objects_) { DoEach(CHECK_JUST(eager_blob_object->compute_local_dep_object())); } - DoEach(GetStaticGlobalTransportLocalDepObject()); + DoEach( + CHECK_JUST(GlobalMaybe())->FindOrCreateTransportLocalDepObject().Mutable()); } } // namespace vm diff --git a/oneflow/core/eager/opkernel_instruction_type.cpp b/oneflow/core/eager/op_call_instruction_type.cpp similarity index 72% rename from oneflow/core/eager/opkernel_instruction_type.cpp rename to oneflow/core/eager/op_call_instruction_type.cpp index 89f3c341fd4..20133f01731 100644 --- a/oneflow/core/eager/opkernel_instruction_type.cpp +++ b/oneflow/core/eager/op_call_instruction_type.cpp @@ -23,9 +23,8 @@ limitations under the License. #include "oneflow/core/eager/eager_blob_object.h" #include "oneflow/core/vm/stream.h" #include "oneflow/core/vm/thread_ctx.h" -#include "oneflow/core/vm/cuda_stream_type.h" -#include "oneflow/core/eager/opkernel_instruction_type.h" -#include "oneflow/core/eager/local_call_opkernel_phy_instr_operand.h" +#include "oneflow/core/eager/op_call_instruction_type.h" +#include "oneflow/core/eager/op_call_phy_instr_operand.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/framework/user_op_registry_manager.h" @@ -33,7 +32,7 @@ limitations under the License. #include "oneflow/core/register/ofblob.h" #include "oneflow/core/vm/symbol_storage.h" #include "oneflow/core/operator/op_conf_symbol.h" -#include "oneflow/user/kernels/stateful_local_opkernel.h" +#include "oneflow/user/kernels/stateful_opkernel.h" #include "oneflow/core/profiler/profiler.h" #include "oneflow/core/profiler/collection.h" #include "oneflow/core/common/cpp_attribute.h" @@ -41,12 +40,12 @@ limitations under the License. 
namespace oneflow { namespace vm { -struct LocalCallOpKernelUtil final { +struct OpCallInstructionUtil final { static inline Maybe Compute(const vm::InstructionMsg& instr_msg) { OF_PROFILER_RANGE_PUSH("ResetPrior"); - auto* operand = LocalCallOpKernelUtil::GetLocalCallOpKernelPhyInstrOperand(instr_msg); + auto* operand = OpCallInstructionUtil::GetCallPhyInstrOperand(instr_msg); operand->mut_opkernel()->composed_attrs_for_scheduler_thread()->ResetPrior(operand->attrs()); - DeviceCtx* device_ctx = instr_msg.phy_instr_stream()->device_ctx().get(); + DeviceCtx* device_ctx = instr_msg.stream().device_ctx().get(); OF_PROFILER_RANGE_POP(); OF_PROFILER_RANGE_PUSH("AllocateOutputBlobsMemory"); JUST(AllocateOutputBlobsMemory(operand, device_ctx)); @@ -70,14 +69,13 @@ struct LocalCallOpKernelUtil final { return Maybe::Ok(); } - static inline LocalCallOpKernelPhyInstrOperand* GetLocalCallOpKernelPhyInstrOperand( - const vm::InstructionMsg& instr_msg) { + static inline OpCallPhyInstrOperand* GetCallPhyInstrOperand(const vm::InstructionMsg& instr_msg) { auto* operand = CHECK_NOTNULL(instr_msg.phy_instr_operand().get()); - return CHECK_NOTNULL(dynamic_cast(operand)); + return CHECK_NOTNULL(dynamic_cast(operand)); } private: - static inline void InferTempStorageBlobDesc(LocalCallOpKernelPhyInstrOperand* operand) { + static inline void InferTempStorageBlobDesc(OpCallPhyInstrOperand* operand) { const auto& InferTmpSizeFn = operand->opkernel().GetInferTmpSizeFn(operand->user_opkernel()); auto* temp_eager_blob_object = operand->mut_opkernel()->mut_temp_blob_object(); CHECK(temp_eager_blob_object->data_type() == DataType::kChar); @@ -93,7 +91,7 @@ struct LocalCallOpKernelUtil final { op_infer_ctx->Update(nullptr, nullptr, nullptr); } - static inline void TryInitOpKernelStateAndCache(LocalCallOpKernelPhyInstrOperand* operand, + static inline void TryInitOpKernelStateAndCache(OpCallPhyInstrOperand* operand, DeviceCtx* device_ctx, user_op::OpKernelState** state, user_op::OpKernelCache** cache) { @@ -108,7 +106,7 @@ struct LocalCallOpKernelUtil final { operand->consistent_tensor_infer_result().get(), state, cache); } - static inline Maybe AllocateOutputBlobsMemory(LocalCallOpKernelPhyInstrOperand* operand, + static inline Maybe AllocateOutputBlobsMemory(OpCallPhyInstrOperand* operand, DeviceCtx* device_ctx) { for (const auto& blob_object : *operand->outputs()) { JUST(blob_object->TryAllocateBlobBodyMemory(device_ctx)); @@ -116,13 +114,13 @@ struct LocalCallOpKernelUtil final { return Maybe::Ok(); } - static inline Maybe TryAllocateTempStorageBlobMemory( - LocalCallOpKernelPhyInstrOperand* operand, DeviceCtx* device_ctx) { + static inline Maybe TryAllocateTempStorageBlobMemory(OpCallPhyInstrOperand* operand, + DeviceCtx* device_ctx) { return operand->mut_opkernel()->mut_temp_blob_object()->TryAllocateBlobBodyMemory(device_ctx); } - static inline void OpKernelCompute(LocalCallOpKernelPhyInstrOperand* operand, - DeviceCtx* device_ctx, user_op::OpKernelState* state, + static inline void OpKernelCompute(OpCallPhyInstrOperand* operand, DeviceCtx* device_ctx, + user_op::OpKernelState* state, const user_op::OpKernelCache* cache) { auto* opkernel = operand->mut_opkernel(); auto* compute_ctx = @@ -138,14 +136,14 @@ struct LocalCallOpKernelUtil final { : nullptr, [compute_ctx]() -> int64_t { const auto cal_memory_size = [compute_ctx](const one::ArgVec& args) -> int64_t { - return std::accumulate( - args.begin(), args.end(), static_cast(0), - [compute_ctx](int64_t memory_size, const auto& pair) { - const auto tensor = - 
compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second); - return memory_size - + tensor->shape().elem_cnt() * GetSizeOfDataType(tensor->data_type()); - }); + return std::accumulate(args.begin(), args.end(), static_cast(0), + [compute_ctx](int64_t memory_size, const auto& pair) { + const auto tensor = compute_ctx->Tensor4ArgNameAndIndex( + pair.first, pair.second); + return memory_size + + tensor->shape_view().elem_cnt() + * GetSizeOfDataType(tensor->data_type()); + }); }; return cal_memory_size(compute_ctx->inputs()) + cal_memory_size(compute_ctx->outputs()); }, @@ -161,30 +159,28 @@ struct LocalCallOpKernelUtil final { operand->user_opkernel()->Compute(compute_ctx, state, cache); } OF_PROFILER_RANGE_POP(); - // tensor tuples are not allowed to be hold by StatefulLocalOpKernel + // tensor tuples are not allowed to be hold by StatefulOpKernel opkernel->UpdateComputeContext(nullptr, nullptr, nullptr, nullptr); } - static inline Maybe DeallocateTempStorageBlobMemory( - LocalCallOpKernelPhyInstrOperand* operand, DeviceCtx* device_ctx) { + static inline Maybe DeallocateTempStorageBlobMemory(OpCallPhyInstrOperand* operand, + DeviceCtx* device_ctx) { return operand->mut_opkernel()->mut_temp_blob_object()->DeallocateBlobDataPtr(); } }; -void LocalCallOpKernelInstructionType::Compute(vm::Instruction* instruction) const { - CHECK_JUST(LocalCallOpKernelUtil::Compute(instruction->instr_msg())); +void OpCallInstructionType::Compute(vm::Instruction* instruction) const { + CHECK_JUST(OpCallInstructionUtil::Compute(instruction->instr_msg())); } -void LocalCallOpKernelInstructionType::ComputeInFuseMode(vm::InstructionMsg* instr_msg) const { - CHECK_JUST(LocalCallOpKernelUtil::Compute(*instr_msg)); +void OpCallInstructionType::ComputeInFuseMode(vm::InstructionMsg* instr_msg) const { + CHECK_JUST(OpCallInstructionUtil::Compute(*instr_msg)); } -std::string LocalCallOpKernelInstructionType::DebugOpTypeName( - const vm::InstructionMsg& instr_msg) const { +std::string OpCallInstructionType::DebugName(const vm::InstructionMsg& instr_msg) const { auto* operand = CHECK_NOTNULL(instr_msg.phy_instr_operand().get()); - return CHECK_NOTNULL(dynamic_cast(operand)) - ->opkernel() - .op_type_name(); + return CHECK_NOTNULL(dynamic_cast(operand))->opkernel().op_type_name() + + ":Call"; } } // namespace vm diff --git a/oneflow/core/eager/opkernel_instruction_type.h b/oneflow/core/eager/op_call_instruction_type.h similarity index 70% rename from oneflow/core/eager/opkernel_instruction_type.h rename to oneflow/core/eager/op_call_instruction_type.h index bc860a6df05..31aacb6fd7b 100644 --- a/oneflow/core/eager/opkernel_instruction_type.h +++ b/oneflow/core/eager/op_call_instruction_type.h @@ -13,10 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_CORE_EAGER_CALL_OPKERNEL_INSTRUCTION_H_ -#define ONEFLOW_CORE_EAGER_CALL_OPKERNEL_INSTRUCTION_H_ +#ifndef ONEFLOW_CORE_EAGER_OP_CALL_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_EAGER_OP_CALL_INSTRUCTION_TYPE_H_ -#include "oneflow/core/vm/instr_type_id.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/memory/memory_case.pb.h" @@ -24,19 +23,19 @@ limitations under the License. 
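
The profiler callback above folds the byte size of every input and output with std::accumulate over (arg_name, index) pairs, now reading shape_view() instead of shape(). A self-contained version of that accumulation over a plain tensor list:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

struct FakeTensor {
  std::vector<int64_t> dims;
  int64_t dtype_size;  // bytes per element
  int64_t ElemCnt() const {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
  }
};

// Mirrors cal_memory_size: fold the tensors' byte sizes into one running total.
int64_t TotalBytes(const std::vector<FakeTensor>& tensors) {
  return std::accumulate(tensors.begin(), tensors.end(), int64_t{0},
                         [](int64_t bytes, const FakeTensor& t) {
                           return bytes + t.ElemCnt() * t.dtype_size;
                         });
}

int main() {
  std::vector<FakeTensor> args{{{2, 3, 4}, 4}, {{128}, 2}};  // float32 (2,3,4) + float16 (128)
  std::printf("%lld bytes\n", static_cast<long long>(TotalBytes(args)));  // 96 + 256 = 352 bytes
  return 0;
}
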
namespace oneflow { namespace vm { -class LocalCallOpKernelInstructionType : public vm::InstructionType { +class OpCallInstructionType final : public vm::InstructionType { public: + OpCallInstructionType() = default; + ~OpCallInstructionType() = default; + void Compute(vm::Instruction* instruction) const override; void ComputeInFuseMode(vm::InstructionMsg* instr_msg) const override; InstructionFuseType fuse_type() const override { return kEnableInstructionFuseAtAnyPosition; } - std::string DebugOpTypeName(const vm::InstructionMsg& instr_msg) const override; + std::string DebugName(const vm::InstructionMsg& instr_msg) const override; protected: - LocalCallOpKernelInstructionType() = default; - virtual ~LocalCallOpKernelInstructionType() = default; - private: Maybe MaybeCompute(vm::Instruction* instruction) const; }; @@ -44,4 +43,4 @@ class LocalCallOpKernelInstructionType : public vm::InstructionType { } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_EAGER_CALL_OPKERNEL_INSTRUCTION_H_ +#endif // ONEFLOW_CORE_EAGER_OP_CALL_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/eager/local_call_opkernel_phy_instr_operand.cpp b/oneflow/core/eager/op_call_phy_instr_operand.cpp similarity index 78% rename from oneflow/core/eager/local_call_opkernel_phy_instr_operand.cpp rename to oneflow/core/eager/op_call_phy_instr_operand.cpp index 07250c580ae..cd553b59a54 100644 --- a/oneflow/core/eager/local_call_opkernel_phy_instr_operand.cpp +++ b/oneflow/core/eager/op_call_phy_instr_operand.cpp @@ -13,21 +13,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/eager/local_call_opkernel_phy_instr_operand.h" -#include "oneflow/user/kernels/stateful_local_opkernel.h" +#include "oneflow/core/eager/op_call_phy_instr_operand.h" +#include "oneflow/user/kernels/stateful_opkernel.h" #include "oneflow/core/eager/dev_vm_dep_object_consume_mode.h" #include "oneflow/core/framework/stream_is_comm_net_stream.h" +#include "oneflow/core/vm/stream.h" namespace oneflow { namespace vm { -Maybe LocalCallOpKernelPhyInstrOperand::Init() { +Maybe OpCallPhyInstrOperand::Init() { JUST(mut_opkernel()->ChooseOpKernel(&user_opkernel_, &need_temp_storage_, attrs(), inputs().get(), outputs().get(), consistent_tensor_infer_result().get())); return Maybe::Ok(); } -void LocalCallOpKernelPhyInstrOperand::ForEachConstMirroredObject( +void OpCallPhyInstrOperand::ForEachConstMirroredObject( const std::function& DoEach) const { const auto& input_list = inputs(); for (int64_t index : opkernel().input_tuple_indexes4const_ibns()) { @@ -36,10 +37,9 @@ void LocalCallOpKernelPhyInstrOperand::ForEachConstMirroredObject( } } -void LocalCallOpKernelPhyInstrOperand::InitStreamSequentialDependence() { - const auto& stream = opkernel().stream(); - auto* device_schedule_dep_object = stream->mut_schedule_local_dep_object(); - if (StreamRoleSwitch(stream->stream_role())) { +void OpCallPhyInstrOperand::InitStreamSequentialDependence() { + auto* device_schedule_dep_object = vm_stream_->schedule_local_dep_object().get(); + if (IsCommNetStream::Visit(vm_stream_->stream_role())) { // Sequantialize nccl instructions to avoid deadlock stream_sequential_dependence_ = device_schedule_dep_object; } else { @@ -53,11 +53,10 @@ void LocalCallOpKernelPhyInstrOperand::InitStreamSequentialDependence() { } } -void LocalCallOpKernelPhyInstrOperand::ForEachMutMirroredObject( +void 
OpCallPhyInstrOperand::ForEachMutMirroredObject( const std::function& DoEach) const { - const auto& stream = opkernel().stream(); - const auto& opt_transport_dep_object = stream->mut_transport_local_dep_object(); - if (opt_transport_dep_object.has_value()) { DoEach(CHECK_JUST(opt_transport_dep_object)); } + const auto& opt_transport_dep_object = vm_stream_->transport_local_dep_object(); + if (opt_transport_dep_object.has_value()) { DoEach(CHECK_JUST(opt_transport_dep_object)->get()); } const auto& input_list = inputs(); for (int64_t index : opkernel().input_tuple_indexes4mut_ibns()) { @@ -71,7 +70,7 @@ void LocalCallOpKernelPhyInstrOperand::ForEachMutMirroredObject( } } -void LocalCallOpKernelPhyInstrOperand::ForEachMut2MirroredObject( +void OpCallPhyInstrOperand::ForEachMut2MirroredObject( const std::function& DoEach) const { const auto& output_list = outputs(); for (int64_t index : opkernel().output_tuple_indexes4mut2_obns()) { diff --git a/oneflow/core/eager/local_call_opkernel_phy_instr_operand.h b/oneflow/core/eager/op_call_phy_instr_operand.h similarity index 78% rename from oneflow/core/eager/local_call_opkernel_phy_instr_operand.h rename to oneflow/core/eager/op_call_phy_instr_operand.h index 90cec6beb18..3a67d1f5995 100644 --- a/oneflow/core/eager/local_call_opkernel_phy_instr_operand.h +++ b/oneflow/core/eager/op_call_phy_instr_operand.h @@ -13,8 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_CORE_EAGER_LOCAL_CALL_OPKERNEL_PHY_INSTR_OPERAND_H_ -#define ONEFLOW_CORE_EAGER_LOCAL_CALL_OPKERNEL_PHY_INSTR_OPERAND_H_ +#ifndef ONEFLOW_CORE_EAGER_OP_CALL_PHY_INSTR_OPERAND_H_ +#define ONEFLOW_CORE_EAGER_OP_CALL_PHY_INSTR_OPERAND_H_ #include "oneflow/core/vm/phy_instr_operand.h" #include "oneflow/core/eager/dev_vm_dep_object_consume_mode.h" @@ -23,9 +23,14 @@ limitations under the License. #include "oneflow/core/framework/op_interpreter.h" namespace oneflow { + +namespace vm { +class Stream; +} + namespace one { -class StatefulLocalOpKernel; +class StatefulOpKernel; class ConsistentTensorInferResult; using EagerBlobObjectList = std::vector>; @@ -42,20 +47,20 @@ class OpKernel; namespace vm { -class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand { +class OpCallPhyInstrOperand final : public vm::PhyInstrOperand { public: - LocalCallOpKernelPhyInstrOperand(const LocalCallOpKernelPhyInstrOperand&) = delete; - LocalCallOpKernelPhyInstrOperand(LocalCallOpKernelPhyInstrOperand&&) = delete; - ~LocalCallOpKernelPhyInstrOperand() override = default; + OpCallPhyInstrOperand(const OpCallPhyInstrOperand&) = delete; + OpCallPhyInstrOperand(OpCallPhyInstrOperand&&) = delete; + ~OpCallPhyInstrOperand() override = default; template - static Maybe New(Args&&... args) { - auto* ptr = new LocalCallOpKernelPhyInstrOperand(std::forward(args)...); + static Maybe New(Args&&... 
args) { + auto* ptr = new OpCallPhyInstrOperand(std::forward(args)...); JUST(ptr->Init()); - return std::shared_ptr(ptr); + return std::shared_ptr(ptr); } - const one::StatefulLocalOpKernel& opkernel() const { return *opkernel_; } + const one::StatefulOpKernel& opkernel() const { return *opkernel_; } const one::EagerBlobObjectListPtr& inputs() const { return inputs_; } const one::EagerBlobObjectListPtr& outputs() const { return outputs_; } const AttrMap& attrs() const { return op_interp_ctx_.attrs; } @@ -64,7 +69,7 @@ class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand { return dev_vm_dep_object_consume_mode_; } - one::StatefulLocalOpKernel* mut_opkernel() { return opkernel_.get(); } + one::StatefulOpKernel* mut_opkernel() { return opkernel_.get(); } template Maybe ForEachOutputTensor(const DoEachT& DoEach) { @@ -90,13 +95,14 @@ class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand { } private: - LocalCallOpKernelPhyInstrOperand( - const std::shared_ptr& opkernel, + OpCallPhyInstrOperand( + vm::Stream* vm_stream, const std::shared_ptr& opkernel, const one::EagerBlobObjectListPtr& inputs, const one::EagerBlobObjectListPtr& outputs, const std::shared_ptr& consistent_tensor_infer_result, const one::OpExprInterpContext& op_interp_ctx_, const one::DevVmDepObjectConsumeMode dev_vm_dep_object_consume_mode) - : opkernel_(opkernel), + : vm_stream_(vm_stream), + opkernel_(opkernel), inputs_(inputs), outputs_(outputs), consistent_tensor_infer_result_(consistent_tensor_infer_result), @@ -113,7 +119,8 @@ class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand { Maybe Init(); void InitStreamSequentialDependence(); - std::shared_ptr opkernel_; + vm::Stream* vm_stream_; + std::shared_ptr opkernel_; one::EagerBlobObjectListPtr inputs_; one::EagerBlobObjectListPtr outputs_; std::shared_ptr consistent_tensor_infer_result_; @@ -128,4 +135,4 @@ class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand { } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_EAGER_LOCAL_CALL_OPKERNEL_PHY_INSTR_OPERAND_H_ +#endif // ONEFLOW_CORE_EAGER_OP_CALL_PHY_INSTR_OPERAND_H_ diff --git a/oneflow/core/eager/release_tensor_arg_phy_instr_operand.h b/oneflow/core/eager/release_tensor_arg_phy_instr_operand.h index 742847f4c1c..f958a087cde 100644 --- a/oneflow/core/eager/release_tensor_arg_phy_instr_operand.h +++ b/oneflow/core/eager/release_tensor_arg_phy_instr_operand.h @@ -26,6 +26,7 @@ limitations under the License. 
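
OpCallPhyInstrOperand keeps its constructor private and exposes a static New that constructs the operand and then runs the fallible Init() (kernel selection) before returning it, so callers never see a half-initialized operand. A standalone sketch of that factory pattern (exceptions stand in for OneFlow's Maybe):

#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>

class Operand {
 public:
  // Two-phase construction behind a factory: construct, then run the fallible Init().
  template<typename... Args>
  static std::shared_ptr<Operand> New(Args&&... args) {
    std::shared_ptr<Operand> ptr(new Operand(std::forward<Args>(args)...));
    ptr->Init();  // throws (standing in for Maybe<...>) if no kernel matches
    return ptr;
  }

  const std::string& chosen_kernel() const { return chosen_kernel_; }

 private:
  explicit Operand(std::string op_type) : op_type_(std::move(op_type)) {}

  void Init() {
    if (op_type_.empty()) { throw std::runtime_error("no kernel registered for empty op type"); }
    chosen_kernel_ = op_type_ + "_kernel";
  }

  std::string op_type_;
  std::string chosen_kernel_;
};

int main() {
  auto operand = Operand::New("relu");
  std::cout << operand->chosen_kernel() << "\n";  // relu_kernel
  return 0;
}
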
#include "oneflow/core/common/optional.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/stream.h" +#include "oneflow/core/vm/stream.h" namespace oneflow { @@ -36,11 +37,11 @@ class EagerBlobObject; class ReleaseTensorArgPhyInstrOperand : public PhyInstrOperand { public: ReleaseTensorArgPhyInstrOperand(const std::shared_ptr& eager_blob_object, - const Optional>& stream) + const Optional& stream) : eager_blob_object_(eager_blob_object), output_dependences_() { output_dependences_.push_back(CHECK_JUST(eager_blob_object->compute_local_dep_object())); if (stream.has_value()) { - stream_sequential_dependence_ = CHECK_JUST(stream)->mut_schedule_local_dep_object(); + stream_sequential_dependence_ = CHECK_JUST(stream)->schedule_local_dep_object().get(); } } ~ReleaseTensorArgPhyInstrOperand() override = default; diff --git a/oneflow/core/eager/release_tensor_instruction_type.cpp b/oneflow/core/eager/release_tensor_instruction_type.h similarity index 53% rename from oneflow/core/eager/release_tensor_instruction_type.cpp rename to oneflow/core/eager/release_tensor_instruction_type.h index 682b04587b6..427581a1d08 100644 --- a/oneflow/core/eager/release_tensor_instruction_type.cpp +++ b/oneflow/core/eager/release_tensor_instruction_type.h @@ -13,28 +13,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef ONEFLOW_CORE_EAGER_RELEASE_TENSOR_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_EAGER_RELEASE_TENSOR_INSTRUCTION_TYPE_H_ + #include "oneflow/core/vm/instruction.h" +#include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/eager/release_tensor_arg_phy_instr_operand.h" #include "oneflow/core/eager/eager_blob_object.h" -#include "oneflow/core/vm/cuda_stream_type.h" -#include "oneflow/core/vm/async_cuda_stream_type.h" -#include "oneflow/core/vm/cuda_copy_h2d_stream_type.h" -#include "oneflow/core/vm/cuda_copy_d2h_stream_type.h" -#include "oneflow/core/vm/cpu_stream_type.h" #include "oneflow/core/vm/cuda_optional_event_record_status_querier.h" +#include "oneflow/core/common/stream_role.h" +#include "oneflow/core/common/singleton_ptr.h" namespace oneflow { namespace vm { -template class ReleaseTensorInstructionType : public vm::InstructionType { public: ReleaseTensorInstructionType() = default; ~ReleaseTensorInstructionType() override = default; - using stream_type = StreamT; - InstructionFuseType fuse_type() const override { return kEnableInstructionFuseAtAnyPosition; } void Release(const vm::InstructionMsg& instr_msg) const { @@ -45,19 +43,16 @@ class ReleaseTensorInstructionType : public vm::InstructionType { CHECK_NOTNULL(ptr); CHECK_JUST(ptr->eager_blob_object()->DeallocateBlobDataPtr()); } + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { + return "ReleaseTensor"; + } void Compute(vm::Instruction* instruction) const override { Release(instruction->instr_msg()); } void ComputeInFuseMode(vm::InstructionMsg* instr_msg) const override { Release(*instr_msg); } }; -COMMAND( - vm::RegisterInstructionType>("cpu.ReleaseTensor")); -COMMAND(vm::RegisterInstructionType>( - "comm_net.ReleaseTensor")); - #ifdef WITH_CUDA -template -class CudaReleaseTensorInstructionType : public ReleaseTensorInstructionType { +class CudaReleaseTensorInstructionType : public ReleaseTensorInstructionType { public: CudaReleaseTensorInstructionType() = default; ~CudaReleaseTensorInstructionType() override = default; @@ -71,17 +66,51 @@ 
class CudaReleaseTensorInstructionType : public ReleaseTensorInstructionType>( - "cuda.ReleaseTensor")); -COMMAND(vm::RegisterInstructionType>( - "cuda_h2d.ReleaseTensor")); -COMMAND(vm::RegisterInstructionType>( - "cuda_d2h.ReleaseTensor")); -COMMAND(vm::RegisterInstructionType>( - "sync_launched_nccl.ReleaseTensor")); -COMMAND(vm::RegisterInstructionType>( - "async_launched_nccl.ReleaseTensor")); #endif } // namespace vm + +struct GetReleaseInstructionType : public StreamRoleVisitor { + static Maybe VisitCompute(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitHost2Device(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitDevice2Host(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitSyncedLaunchedCommNet(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitAsyncedLaunchedCommNet(DeviceType device_type) { + return GetInstructionType(device_type); + } + static Maybe VisitBarrier(DeviceType device_type) { + UNIMPLEMENTED_THEN_RETURN(); + } + static Maybe VisitCriticalSection(DeviceType device_type) { + UNIMPLEMENTED_THEN_RETURN(); + } + static Maybe VisitLazyJobLauncher(DeviceType device_type) { + UNIMPLEMENTED_THEN_RETURN(); + } + + private: + static Maybe GetInstructionType(DeviceType device_type) { + if (device_type == DeviceType::kCPU) { + return SingletonPtr(); + } else if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } +}; + } // namespace oneflow +#endif // ONEFLOW_CORE_EAGER_RELEASE_TENSOR_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/framework/dtype.cpp b/oneflow/core/framework/dtype.cpp index d3a16ba5f42..44ca536e521 100644 --- a/oneflow/core/framework/dtype.cpp +++ b/oneflow/core/framework/dtype.cpp @@ -66,13 +66,22 @@ Maybe DTypeMeta4DataType(DataType data_type) { {DataType::kFloat, DTypeMeta("oneflow.float32", true, true, false)}, {DataType::kDouble, DTypeMeta("oneflow.float64", true, true, false)}, {DataType::kInt8, DTypeMeta("oneflow.int8", true, false, false)}, + {DataType::kInt16, DTypeMeta("oneflow.int16", true, false, false)}, {DataType::kInt32, DTypeMeta("oneflow.int32", true, false, false)}, {DataType::kInt64, DTypeMeta("oneflow.int64", true, false, false)}, + {DataType::kInt128, DTypeMeta("oneflow.int128", true, false, false)}, {DataType::kUInt8, DTypeMeta("oneflow.uint8", false, false, false)}, + {DataType::kUInt16, DTypeMeta("oneflow.uint16", false, false, false)}, + {DataType::kUInt32, DTypeMeta("oneflow.uint32", false, false, false)}, + {DataType::kUInt64, DTypeMeta("oneflow.uint64", false, false, false)}, + {DataType::kUInt128, DTypeMeta("oneflow.uint128", false, false, false)}, {DataType::kOFRecord, DTypeMeta("oneflow.of_record", false, false, false)}, {DataType::kTensorBuffer, DTypeMeta("oneflow.tensor_buffer", false, false, false)}, {DataType::kBFloat16, DTypeMeta("oneflow.bfloat16", true, true, false)}, {DataType::kBool, DTypeMeta("oneflow.bool", false, false, false)}, + {DataType::kComplex32, DTypeMeta("oneflow.complex32", false, false, true)}, + {DataType::kComplex64, DTypeMeta("oneflow.complex64", false, false, true)}, + {DataType::kComplex128, DTypeMeta("oneflow.complex128", false, false, true)}, }; return MapAt(data_type2dtype_meta, data_type); }; diff --git a/oneflow/core/framework/instructions_builder.cpp b/oneflow/core/framework/instructions_builder.cpp index 
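
The new DTypeMeta rows follow the flag layout suggested by the surrounding entries, which reads as (name, signed, floating point, complex); e.g. oneflow.complex64 is (false, false, true). A tiny standalone table in the same spirit (the field names are inferred for illustration, not taken from dtype.cpp):

#include <cstdio>
#include <map>
#include <string>

// Flag layout inferred from the surrounding entries: (name, signed, floating point, complex).
struct DTypeMeta {
  std::string name;
  bool is_signed;
  bool is_floating_point;
  bool is_complex;
};

enum class DataType { kInt16, kUInt32, kComplex64 };

int main() {
  const std::map<DataType, DTypeMeta> meta{
      {DataType::kInt16, {"oneflow.int16", true, false, false}},
      {DataType::kUInt32, {"oneflow.uint32", false, false, false}},
      {DataType::kComplex64, {"oneflow.complex64", false, false, true}},
  };
  const DTypeMeta& m = meta.at(DataType::kComplex64);
  std::printf("%s signed=%d floating=%d complex=%d\n", m.name.c_str(), m.is_signed,
              m.is_floating_point, m.is_complex);
  return 0;
}
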
6d2121bb5b2..f3b15dcd15c 100644 --- a/oneflow/core/framework/instructions_builder.cpp +++ b/oneflow/core/framework/instructions_builder.cpp @@ -26,21 +26,25 @@ limitations under the License. #include "oneflow/core/common/container_util.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/common/blocking_counter.h" +#include "oneflow/core/common/singleton_ptr.h" #include "oneflow/core/rpc/include/global_process_ctx.h" #include "oneflow/core/vm/barrier_phy_instr_operand.h" #include "oneflow/core/vm/access_blob_arg_cb_phy_instr_operand.h" #include "oneflow/core/vm/consume_local_dep_object_phy_instr_operand.h" -#include "oneflow/core/eager/release_tensor_arg_phy_instr_operand.h" +#include "oneflow/core/eager/release_tensor_instruction_type.h" +#include "oneflow/core/eager/blob_instruction_type.h" +#include "oneflow/core/eager/op_call_instruction_type.h" +#include "oneflow/core/vm/barrier_instruction_type.h" #include "oneflow/core/vm/virtual_machine.h" #include "oneflow/core/vm/vm_util.h" #include "oneflow/core/framework/consistent_tensor_infer_cache.h" #include "oneflow/core/eager/local_dep_object.h" +#include "oneflow/core/eager/critical_section_instruction_type.h" +#include "oneflow/core/eager/lazy_job_instruction_type.h" #include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/stream.h" #include "oneflow/core/framework/stream_need_soft_sync.h" -#include "oneflow/core/framework/stream_get_call_instruction_name.h" -#include "oneflow/core/framework/stream_get_release_instruction_name.h" #include "oneflow/core/framework/stream_is_comm_net_stream.h" #include "oneflow/core/job/env_desc.h" #include "oneflow/core/profiler/profiler.h" @@ -57,24 +61,29 @@ Maybe> RawGetCriticalSectionStream() { static constexpr auto* GetCriticalSectionStream = DECORATE(&RawGetCriticalSectionStream, ThreadLocal); +Maybe> RawGetLazyJobLauncherStream() { + return Stream::New(JUST(Device::New("cpu")), StreamRole::kLazyJobLauncher); +} + +static constexpr auto* GetLazyJobLauncherStream = + DECORATE(&RawGetLazyJobLauncherStream, ThreadLocal); + } // namespace template Maybe InstructionsBuilder::MakeCriticalSectionBegin( - const std::shared_ptr& phy_instr_operand) { + vm::Stream* vm_stream, const std::shared_ptr& phy_instr_operand) { auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), "CriticalSectionBegin", - std::shared_ptr(), phy_instr_operand); + vm_stream, SingletonPtr(), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); return Maybe::Ok(); } template Maybe InstructionsBuilder::MakeCriticalSectionEnd( - const std::shared_ptr& phy_instr_operand) { + vm::Stream* vm_stream, const std::shared_ptr& phy_instr_operand) { auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), "CriticalSectionEnd", - std::shared_ptr(), phy_instr_operand); + vm_stream, SingletonPtr(), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); return Maybe::Ok(); } @@ -138,10 +147,13 @@ Maybe InstructionsBuilder::LaunchLazyJob(const one::EagerBlobObjectListPtr const auto& event_record = std::make_shared(); CHECK_OR_RETURN(input_op_name2end_event_record->emplace(op_name, event_record).second); } + + auto stream = JUST(GetCriticalSectionStream()); + auto* vm_stream = JUST(Global::Get()->GetVmStream(stream)); const auto& phy_instr_operand = std::make_shared( - nn_graph, inputs, input_op_name2end_event_record); - JUST(MakeCriticalSectionBegin(phy_instr_operand)); + nn_graph, inputs, 
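instructions_builder.cpp now obtains its special-purpose streams through small Raw... factories wrapped with DECORATE(..., ThreadLocal), so each thread builds the Stream symbol once and reuses it on later calls. A minimal restatement of that pattern, assuming Maybe<Symbol<Stream>> as the stripped return type:

namespace {

Maybe<Symbol<Stream>> RawGetLazyJobLauncherStream() {
  // A CPU device paired with the kLazyJobLauncher role identifies this stream.
  return Stream::New(JUST(Device::New("cpu")), StreamRole::kLazyJobLauncher);
}

// DECORATE memoizes the result per thread; subsequent calls are cheap lookups.
static constexpr auto* GetLazyJobLauncherStream =
    DECORATE(&RawGetLazyJobLauncherStream, ThreadLocal);

}  // namespace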
input_op_name2end_event_record, vm_stream); + JUST(MakeCriticalSectionBegin(vm_stream, phy_instr_operand)); } const auto& output_op_name2end_event_record = std::make_shared>>(); @@ -150,34 +162,39 @@ Maybe InstructionsBuilder::LaunchLazyJob(const one::EagerBlobObjectListPtr const auto& event_record = std::make_shared(); CHECK_OR_RETURN(output_op_name2end_event_record->emplace(op_name, event_record).second); } + auto stream = JUST(GetCriticalSectionStream()); + auto* vm_stream = JUST(Global::Get()->GetVmStream(stream)); const auto& phy_instr_operand = std::make_shared( - nn_graph, outputs, output_op_name2end_event_record); - JUST(MakeCriticalSectionBegin(phy_instr_operand)); + nn_graph, outputs, output_op_name2end_event_record, vm_stream); + JUST(MakeCriticalSectionBegin(vm_stream, phy_instr_operand)); } { const auto& phy_instr_operand = std::make_shared(nn_graph, parameters); + auto stream = JUST(GetLazyJobLauncherStream()); + auto* vm_stream = JUST(Global::Get()->GetVmStream(stream)); auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), "LaunchLazyJob", - std::shared_ptr(), phy_instr_operand); + vm_stream, SingletonPtr(), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); } + auto stream = JUST(GetCriticalSectionStream()); + auto* vm_stream = JUST(Global::Get()->GetVmStream(stream)); for (int i = 0; i < nn_graph->inputs_op_names().size(); ++i) { const auto& eager_blob_object = inputs->at(i); const auto& op_name = nn_graph->inputs_op_names().at(i); const auto& event_record = JUST(MapAt(*input_op_name2end_event_record, op_name)); const auto& phy_instr_operand = std::make_shared( - eager_blob_object, event_record); - JUST(MakeCriticalSectionEnd(phy_instr_operand)); + eager_blob_object, event_record, vm_stream); + JUST(MakeCriticalSectionEnd(vm_stream, phy_instr_operand)); } for (int i = 0; i < nn_graph->outputs_op_names().size(); ++i) { const auto& eager_blob_object = outputs->at(i); const auto& op_name = nn_graph->outputs_op_names().at(i); const auto& event_record = JUST(MapAt(*output_op_name2end_event_record, op_name)); const auto& phy_instr_operand = std::make_shared( - eager_blob_object, event_record); - JUST(MakeCriticalSectionEnd(phy_instr_operand)); + eager_blob_object, event_record, vm_stream); + JUST(MakeCriticalSectionEnd(vm_stream, phy_instr_operand)); } } return Maybe::Ok(); @@ -191,26 +208,29 @@ Maybe InstructionsBuilder::SoftSyncNNGraphBuffers( return Maybe::Ok(); } -Maybe InstructionsBuilder::CreateSymbolId() { return JUST(id_generator_->NewSymbolId()); } +namespace { + +int64_t NewSymbolId() { + static std::atomic cnt(0); + return cnt.fetch_add(1, std::memory_order_relaxed); +} + +} // namespace Maybe InstructionsBuilder::GetJobConfSymbol(const JobConfigProto& job_conf) { - return Global>::Get()->FindOrCreate( - job_conf, [&] { return this->CreateSymbolId(); }); + return Global>::Get()->FindOrCreate(job_conf, &NewSymbolId); } Maybe InstructionsBuilder::GetParallelDescSymbol(const ParallelConf& parallel_conf) { - return Global>::Get()->FindOrCreate( - parallel_conf, [&] { return this->CreateSymbolId(); }); + return Global>::Get()->FindOrCreate(parallel_conf, &NewSymbolId); } Maybe InstructionsBuilder::GetScopeSymbol(const ScopeProto& scope_proto) { - return Global>::Get()->FindOrCreate( - scope_proto, [&] { return this->CreateSymbolId(); }); + return Global>::Get()->FindOrCreate(scope_proto, &NewSymbolId); } Maybe InstructionsBuilder::GetOpConfSymbol(const OperatorConf& op_conf) { - return Global>::Get()->FindOrCreate( - op_conf, [&] { 
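Each critical-section instruction is now pinned to an explicit vm::Stream* resolved from the global VirtualMachine instead of being dispatched by instruction name. The lookup repeated throughout LaunchLazyJob boils down to the following two lines (Global<VirtualMachine> and Symbol<Stream> are reconstructed from the stripped template arguments):

// `stream` is a Symbol<Stream>; GetVmStream maps it to the VM's backing vm::Stream.
auto stream = JUST(GetCriticalSectionStream());
vm::Stream* vm_stream = JUST(Global<VirtualMachine>::Get()->GetVmStream(stream));
// vm_stream is then passed both into the phy-instr operand and into
// MakeCriticalSectionBegin/End, so operand and instruction agree on the stream.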
return this->CreateSymbolId(); }); + return Global>::Get()->FindOrCreate(op_conf, &NewSymbolId); } Maybe InstructionsBuilder::BuildInitialScope( @@ -337,32 +357,27 @@ Maybe InstructionsBuilder::BuildScopeByProtoStrSetter( return GetScopeSymbol(*scope_proto); } -Maybe InstructionsBuilder::LocalCallOpKernel( - const std::shared_ptr& opkernel, - const one::EagerBlobObjectListPtr& input_eager_blob_objects, - const one::EagerBlobObjectListPtr& output_eager_blob_objects, - const one::OpExprInterpContext& ctx, Symbol stream) { - return LocalCallOpKernel(opkernel, input_eager_blob_objects, output_eager_blob_objects, nullptr, - ctx, stream); +Maybe InstructionsBuilder::Call(const std::shared_ptr& opkernel, + const one::EagerBlobObjectListPtr& input_eager_blob_objects, + const one::EagerBlobObjectListPtr& output_eager_blob_objects, + const one::OpExprInterpContext& ctx, Symbol stream) { + return Call(opkernel, input_eager_blob_objects, output_eager_blob_objects, nullptr, ctx, stream); } -Maybe InstructionsBuilder::LocalCallOpKernel( - const std::shared_ptr& opkernel, +Maybe InstructionsBuilder::Call( + const std::shared_ptr& opkernel, const one::EagerBlobObjectListPtr& input_eager_blob_objects, const one::EagerBlobObjectListPtr& output_eager_blob_objects, const std::shared_ptr& consistent_tensor_infer_result, const one::OpExprInterpContext& ctx, Symbol stream) { - const auto& parallel_desc_sym = JUST(Placement4Device(stream->device())).shared_from_symbol(); JUST(SoftSyncStream(output_eager_blob_objects, stream)); JUST(SoftSyncStream(input_eager_blob_objects, stream)); - auto phy_instr_operand = JUST(vm::LocalCallOpKernelPhyInstrOperand::New( - opkernel, input_eager_blob_objects, output_eager_blob_objects, consistent_tensor_infer_result, - ctx, *one::CurrentDevVmDepObjectConsumeMode())); - const auto& instruction_name = JUST(StreamRoleSwitch( - stream->stream_role(), stream->device()->enum_type())); + auto* vm_stream = JUST(Global::Get()->GetVmStream(stream)); + auto phy_instr_operand = JUST(vm::OpCallPhyInstrOperand::New( + vm_stream, opkernel, input_eager_blob_objects, output_eager_blob_objects, + consistent_tensor_infer_result, ctx, *one::CurrentDevVmDepObjectConsumeMode())); auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), instruction_name, parallel_desc_sym, - phy_instr_operand); + vm_stream, SingletonPtr(), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); for (const auto& output : *output_eager_blob_objects) { if (!output->producer_stream().has_value()) { JUST(output->init_producer_stream(stream)); } @@ -372,14 +387,13 @@ Maybe InstructionsBuilder::LocalCallOpKernel( } Maybe InstructionsBuilder::ReleaseTensor( - const std::shared_ptr& eager_blob_object, - const std::shared_ptr& parallel_desc) { - if (pthread_fork::IsForkedSubProcess() && parallel_desc - && parallel_desc->device_type() != DeviceType::kCPU) { - return Maybe::Ok(); - } + const std::shared_ptr& eager_blob_object) { const auto& last_used_stream = JUST(eager_blob_object->last_used_stream()); const auto& producer_stream = JUST(eager_blob_object->producer_stream()); + if (pthread_fork::IsForkedSubProcess() + && producer_stream->device()->enum_type() != DeviceType::kCPU) { + return Maybe::Ok(); + } if (last_used_stream != producer_stream) { JUST(SoftSyncStream({JUST(eager_blob_object->compute_local_dep_object())}, "mut", last_used_stream)); @@ -387,23 +401,26 @@ Maybe InstructionsBuilder::ReleaseTensor( Optional> stream{}; if (*one::CurrentDevVmDepObjectConsumeMode() == 
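Symbol ids no longer go through the removed vm::IdGenerator; a file-local atomic counter suffices because the ids only need to be process-unique. The replacement, shown with the stripped types filled in as an assumption:

#include <atomic>

namespace {

int64_t NewSymbolId() {
  // Relaxed ordering is enough: callers only need distinct values, not ordering.
  static std::atomic<int64_t> cnt(0);
  return cnt.fetch_add(1, std::memory_order_relaxed);
}

}  // namespace

// FindOrCreate only invokes NewSymbolId when the conf has not been interned yet, e.g.
//   return Global<symbol::Storage<Scope>>::Get()->FindOrCreate(scope_proto, &NewSymbolId);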
one::DevVmDepObjectConsumeMode::NONE) { stream = Optional>(NullOpt); - } else if (StreamRoleSwitch(last_used_stream->stream_role())) { + } else if (IsCommNetStream::Visit(last_used_stream->stream_role())) { // Disable inter-device instruction sequential for tensor used by communicative stream. // It's not acceptable for us that cuda compute stream is blocked by cuda nccl stream. stream = Optional>(NullOpt); - } else if (StreamRoleSwitch(producer_stream->stream_role())) { + } else if (IsCommNetStream::Visit(producer_stream->stream_role())) { // Disable inter-device instruction sequential for tensor produced by communicative stream. stream = Optional>(NullOpt); } else { stream = producer_stream; } + auto vm_stream = stream.map([](Symbol stream) -> vm::Stream* { + return CHECK_JUST(Global::Get()->GetVmStream(stream)); + }); const auto& phy_instr_operand = - std::make_shared(eager_blob_object, stream); + std::make_shared(eager_blob_object, vm_stream); + StreamRole stream_role = producer_stream->stream_role(); DeviceType device_type = producer_stream->device()->enum_type(); - const auto& instruction_name = JUST( - StreamRoleSwitch(producer_stream->stream_role(), device_type)); auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), instruction_name, parallel_desc, phy_instr_operand); + JUST(Global::Get()->GetVmStream(producer_stream)), + JUST(GetReleaseInstructionType::Visit(stream_role, device_type)), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); return Maybe::Ok(); } @@ -435,39 +452,22 @@ Maybe InstructionsBuilder::SoftSyncStream( Maybe InstructionsBuilder::SoftSyncStream( std::vector>&& compute_local_dep_objects, - const std::string& modifier, Symbol stream) { - DeviceType device_type = stream->device()->enum_type(); - if (!StreamRoleSwitch(stream->stream_role(), device_type)) { + const std::string& modifier, Symbol last_used_stream) { + DeviceType device_type = last_used_stream->device()->enum_type(); + if (!NeedSoftSync::Visit(last_used_stream->stream_role(), device_type)) { return Maybe::Ok(); } OF_PROFILER_RANGE_GUARD("SoftStream"); - const auto& parallel_desc = JUST(Placement4Device(stream->device())).shared_from_symbol(); const auto& phy_instr_operand = std::make_shared( std::move(compute_local_dep_objects), modifier); + StreamRole stream_role = last_used_stream->stream_role(); auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), parallel_desc->device_tag() + ".RecordEvent", - parallel_desc, phy_instr_operand); + JUST(Global::Get()->GetVmStream(last_used_stream)), + JUST(GetRecordEventInstructionType::Visit(stream_role, device_type)), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); return Maybe::Ok(); } -namespace { - -const std::shared_ptr& GetParallelDesc( - const std::shared_ptr tensor) { - const auto& device = CHECK_JUST(tensor->device()); - const auto& placement = CHECK_JUST(Placement4Device(device)); - return placement.shared_from_symbol(); -} - -const std::shared_ptr& GetParallelDesc( - const one::EagerMirroredTensorImpl* tensor) { - const auto& placement = CHECK_JUST(Placement4Device(tensor->device())); - return placement.shared_from_symbol(); -} - -} // namespace - template Maybe InstructionsBuilder::SyncAccessBlobByCallback( const T tensor, const std::shared_ptr& btb, @@ -520,17 +520,41 @@ template Maybe InstructionsBuilder::SyncAccessBlobByCallback( const one::EagerMirroredTensorImpl* tensor, const std::shared_ptr& btb, const std::function& Callback, const std::string& modifier); 
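To summarize the release-sequencing rules implemented by the rewritten ReleaseTensor above, the choice of the optional dependence stream reads as a small decision table; this is a restatement for clarity, not new behavior:

// Which stream (if any) orders the ReleaseTensor instruction:
//   consume mode is NONE                      -> no stream (NullOpt)
//   last_used_stream is a comm-net stream     -> no stream (compute must not wait on nccl)
//   producer_stream is a comm-net stream      -> no stream
//   otherwise                                 -> producer_stream
// The Optional<Symbol<Stream>> is then mapped to the VM's vm::Stream*:
auto vm_stream = stream.map([](Symbol<Stream> s) -> vm::Stream* {
  return CHECK_JUST(Global<VirtualMachine>::Get()->GetVmStream(s));
});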
+namespace { + +Maybe> GetDevice(const std::shared_ptr& tensor) { + return tensor->device(); // return Maybe> +} + +Maybe> GetDevice(const one::EagerMirroredTensorImpl* tensor) { + return tensor->device(); // return const Symbol& +} + +} // namespace + template Maybe InstructionsBuilder::AccessBlobByCallback(const T tensor, const std::function& callback, const std::string& modifier) { - const auto& parallel_desc = GetParallelDesc(tensor); const std::shared_ptr& eager_blob_object = JUST(tensor->eager_blob_object()); const auto& phy_instr_operand = std::make_shared(eager_blob_object, callback, modifier); + Symbol device = JUST(GetDevice(tensor)); + Symbol stream = JUST(GetDefaultStreamByDevice(device)); + // Do not use producer_stream or last_used_stream. + // Bug case when using producer_stream or last_used_stream: + // + // ```python + // tensor = oneflow.ones((1024, 1024, 1024), device='cuda').cpu() + // ndarray = tensor.numpy() # share memory + // + // ``` + // `ndarray` may not be ones because instruction AccessBlobByCallback is prescheduled before + // oneflow.ones actually finished. auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), - parallel_desc->device_tag() + ".AccessBlobByCallback", parallel_desc, phy_instr_operand); + // Never replace `stream` with producer_stream or last_used_stream. + JUST(Global::Get()->GetVmStream(stream)), + SingletonPtr(), phy_instr_operand); instruction_list_->EmplaceBack(std::move(instruction)); return Maybe::Ok(); } @@ -543,29 +567,38 @@ template Maybe InstructionsBuilder::AccessBlobByCallback( const one::EagerMirroredTensorImpl* tensor, const std::function& callback, const std::string& modifier); -Maybe InstructionsBuilder::ComputeRankFrontSeqCallback( - const std::function& callback) { - const auto& phy_instr_operand = std::make_shared(callback); +namespace { + +Maybe> GetBarrierStream() { + auto device = JUST(Device::New("cpu")); + return Stream::New(device, StreamRole::kBarrier); +} + +} // namespace + +Maybe InstructionsBuilder::GlobalSync() { + const auto& phy_instr_operand = std::make_shared([]() {}); + auto stream = JUST(GetBarrierStream()); auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), "ComputeRankFrontSeqCallback", - std::shared_ptr(), phy_instr_operand); + JUST(Global::Get()->GetVmStream(stream)), + SingletonPtr(), phy_instr_operand); instruction_list_->PushBack(instruction.Mutable()); return Maybe::Ok(); } -Maybe InstructionsBuilder::ComputeGlobalFrontSeqBarrier() { - const auto& phy_instr_operand = std::make_shared([] {}); +Maybe InstructionsBuilder::Barrier(const std::function& Callback) { + const auto& phy_instr_operand = std::make_shared(Callback); + auto stream = JUST(GetBarrierStream()); auto instruction = intrusive::make_shared( - Global::Get()->mut_vm(), "ComputeGlobalFrontSeqBarrier", - std::shared_ptr(), phy_instr_operand); + JUST(Global::Get()->GetVmStream(stream)), + SingletonPtr(), phy_instr_operand); instruction_list_->PushBack(instruction.Mutable()); return Maybe::Ok(); } Maybe PhysicalRun(const std::function(InstructionsBuilder*)>& Build) { vm::InstructionMsgList instruction_list; - InstructionsBuilder instructions_builder(std::make_shared(), - &instruction_list); + InstructionsBuilder instructions_builder(&instruction_list); JUST(Build(&instructions_builder)); JUST(vm::Run(instructions_builder.mut_instruction_list())); return Maybe::Ok(); diff --git a/oneflow/core/framework/instructions_builder.h b/oneflow/core/framework/instructions_builder.h index 8bf70c203b8..ddbb017d986 100644 
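With ComputeRankFrontSeqCallback and ComputeGlobalFrontSeqBarrier folded into GlobalSync() and Barrier(callback), code that previously relied on the named barrier instructions now goes through PhysicalRun. A hedged usage sketch; the SyncAllStreams helper is hypothetical and only illustrates the call shape:

#include "oneflow/core/framework/instructions_builder.h"

Maybe<void> SyncAllStreams() {  // hypothetical helper, not part of this patch
  // Builds a single barrier instruction on the cpu/kBarrier stream and submits it.
  JUST(PhysicalRun([](InstructionsBuilder* builder) -> Maybe<void> {
    return builder->Barrier([]() { /* invoked when the barrier instruction executes */ });
  }));
  return Maybe<void>::Ok();
}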
--- a/oneflow/core/framework/instructions_builder.h +++ b/oneflow/core/framework/instructions_builder.h @@ -16,10 +16,9 @@ limitations under the License. #ifndef ONEFLOW_CORE_FRAMEWORK_INSTRUCTIONS_BUILDER_H_ #define ONEFLOW_CORE_FRAMEWORK_INSTRUCTIONS_BUILDER_H_ -#include "oneflow/core/eager/local_call_opkernel_phy_instr_operand.h" +#include "oneflow/core/eager/op_call_phy_instr_operand.h" #include "oneflow/core/eager/lazy_job_phy_instr_operand.h" #include "oneflow/core/vm/instruction.h" -#include "oneflow/core/vm/id_generator.h" #include "oneflow/core/job/job_desc.h" #include "oneflow/core/job/parallel_desc.h" #include "oneflow/core/job/scope.h" @@ -33,7 +32,7 @@ limitations under the License. namespace oneflow { namespace one { -class StatefulLocalOpKernel; +class StatefulOpKernel; class TensorTuple; class MirroredTensor; class ConsistentTensorInferResult; @@ -47,12 +46,10 @@ class InstructionsBuilder : public std::enable_shared_from_this& id_generator, - vm::InstructionMsgList* instruction_list) - : id_generator_(id_generator), instruction_list_(instruction_list) {} + explicit InstructionsBuilder(vm::InstructionMsgList* instruction_list) + : instruction_list_(instruction_list) {} ~InstructionsBuilder() { instruction_list_->Clear(); } - const std::shared_ptr& id_generator() const { return id_generator_; } const vm::InstructionMsgList& instruction_list() const { return *instruction_list_; } vm::InstructionMsgList* mut_instruction_list() { return instruction_list_; } @@ -67,8 +64,6 @@ class InstructionsBuilder : public std::enable_shared_from_this SoftSyncNNGraphBuffers(const one::EagerBlobObjectListPtr& eager_blob_objects, const std::shared_ptr& nn_graph); - Maybe CreateSymbolId(); - Maybe GetJobConfSymbol(const JobConfigProto& job_conf); Maybe GetParallelDescSymbol(const ParallelConf& parallel_conf); @@ -77,8 +72,7 @@ class InstructionsBuilder : public std::enable_shared_from_this GetOpConfSymbol(const OperatorConf& op_conf); - Maybe ReleaseTensor(const std::shared_ptr& eager_blob_object, - const std::shared_ptr& parallel_desc); + Maybe ReleaseTensor(const std::shared_ptr& eager_blob_object); template Maybe SyncAccessBlobByCallback(const T tensor, const std::shared_ptr& btb, @@ -89,9 +83,8 @@ class InstructionsBuilder : public std::enable_shared_from_this AccessBlobByCallback(const T tensor, const std::function& callback, const std::string& modifier); - Maybe ComputeRankFrontSeqCallback(const std::function& callback); - - Maybe ComputeGlobalFrontSeqBarrier(); + Maybe GlobalSync(); + Maybe Barrier(const std::function& callback); Maybe BuildInitialScope(int64_t session_id, const JobConfigProto& job_conf, const std::string& device_tag, @@ -122,13 +115,13 @@ class InstructionsBuilder : public std::enable_shared_from_this& scope, const std::function& StrSetter); - Maybe LocalCallOpKernel(const std::shared_ptr& opkernel, - const one::EagerBlobObjectListPtr& input_eager_blob_objects, - const one::EagerBlobObjectListPtr& output_eager_blob_objects, - const one::OpExprInterpContext& ctx, Symbol stream); + Maybe Call(const std::shared_ptr& opkernel, + const one::EagerBlobObjectListPtr& input_eager_blob_objects, + const one::EagerBlobObjectListPtr& output_eager_blob_objects, + const one::OpExprInterpContext& ctx, Symbol stream); - Maybe LocalCallOpKernel( - const std::shared_ptr& opkernel, + Maybe Call( + const std::shared_ptr& opkernel, const one::EagerBlobObjectListPtr& input_eager_blob_objects, const one::EagerBlobObjectListPtr& output_eager_blob_objects, const std::shared_ptr& 
consistent_tensor_infer_result, @@ -141,16 +134,15 @@ class InstructionsBuilder : public std::enable_shared_from_this>&& compute_local_dep_objects, const std::string& modifier, Symbol stream); - vm::IdGenerator* mut_id_generator() { return id_generator_.get(); } - private: template - Maybe MakeCriticalSectionBegin(const std::shared_ptr& phy_instr_operand); + Maybe MakeCriticalSectionBegin(vm::Stream* vm_stream, + const std::shared_ptr& phy_instr_operand); template - Maybe MakeCriticalSectionEnd(const std::shared_ptr& phy_instr_operand); + Maybe MakeCriticalSectionEnd(vm::Stream* vm_stream, + const std::shared_ptr& phy_instr_operand); - std::shared_ptr id_generator_; vm::InstructionMsgList* instruction_list_; }; diff --git a/oneflow/core/framework/op_expr.cpp b/oneflow/core/framework/op_expr.cpp index 916c049728e..27e4f65b55a 100644 --- a/oneflow/core/framework/op_expr.cpp +++ b/oneflow/core/framework/op_expr.cpp @@ -24,7 +24,7 @@ limitations under the License. #include "oneflow/core/framework/user_op_registry_manager.h" #include "oneflow/core/framework/consistent_tensor_infer_cache.h" #include "oneflow/core/operator/op_conf.pb.h" -#include "oneflow/user/kernels/stateful_local_opkernel.h" +#include "oneflow/user/kernels/stateful_opkernel.h" namespace oneflow { namespace one { @@ -122,7 +122,7 @@ Maybe BuiltinOpExprImpl::BuildOpConf(OperatorConf* op_conf, return Maybe::Ok(); } -Maybe UserOpExpr::MutKernel4Stream(Symbol stream) const { +Maybe UserOpExpr::MutKernel4Stream(Symbol stream) const { const auto& it = stream2kernel_.find(stream); if (it != stream2kernel_.end()) { return it->second; } @@ -130,8 +130,8 @@ Maybe UserOpExpr::MutKernel4Stream(Symbol stream) JUST(BuildOpConf(op_conf.get(), {})); op_conf->set_device_tag(stream->device()->type()); auto parallel_desc = JUST(Placement4Device(stream->device())).shared_from_symbol(); - const auto& opkernel = JUST(StatefulLocalOpKernel::New( - op_conf, stream, base_attrs(), parallel_desc, input_arg_tuple(), output_arg_tuple())); + const auto& opkernel = JUST(StatefulOpKernel::New(op_conf, stream, base_attrs(), parallel_desc, + input_arg_tuple(), output_arg_tuple())); stream2kernel_.emplace(stream, opkernel); return opkernel; } diff --git a/oneflow/core/framework/op_expr.h b/oneflow/core/framework/op_expr.h index 5f76213a687..3806724c408 100644 --- a/oneflow/core/framework/op_expr.h +++ b/oneflow/core/framework/op_expr.h @@ -125,7 +125,7 @@ class BuiltinOpExprImpl : public BuiltinOpExpr { mutable std::shared_ptr op_grad_func_; }; -class StatefulLocalOpKernel; +class StatefulOpKernel; class ConsistentTensorInferCache; class UserOpExpr final : public BuiltinOpExprImpl { @@ -139,7 +139,7 @@ class UserOpExpr final : public BuiltinOpExprImpl { const AttrMap& base_attrs() const { return base_attrs_; } - Maybe MutKernel4Stream(Symbol stream) const; + Maybe MutKernel4Stream(Symbol stream) const; bool has_device_and_stream_infer_fn() const { return static_cast(device_and_stream_infer_fn_); @@ -172,7 +172,7 @@ class UserOpExpr final : public BuiltinOpExprImpl { user_op::TensorDescInferFn tensor_desc_infer_fn_; user_op::DataTypeInferFn dtype_infer_fn_; user_op::DeviceAndStreamInferFn device_and_stream_infer_fn_; - mutable HashMap, std::shared_ptr> stream2kernel_; + mutable HashMap, std::shared_ptr> stream2kernel_; std::shared_ptr consistent_tensor_infer_cache_; }; diff --git a/oneflow/core/framework/op_interpreter.h b/oneflow/core/framework/op_interpreter.h index c8df6da0563..6236a41161e 100644 --- a/oneflow/core/framework/op_interpreter.h +++ 
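The StatefulLocalOpKernel -> StatefulOpKernel rename above keeps the per-stream kernel cache in UserOpExpr intact; only the value type changes. Filling in the stripped template arguments as an assumption, the cached lookup reads roughly:

// mutable HashMap<Symbol<Stream>, std::shared_ptr<StatefulOpKernel>> stream2kernel_;
Maybe<StatefulOpKernel> UserOpExpr::MutKernel4Stream(Symbol<Stream> stream) const {
  const auto& it = stream2kernel_.find(stream);
  if (it != stream2kernel_.end()) { return it->second; }  // one kernel per distinct stream
  // ... build op_conf and parallel_desc for stream->device(), then:
  const auto& opkernel = JUST(StatefulOpKernel::New(op_conf, stream, base_attrs(), parallel_desc,
                                                    input_arg_tuple(), output_arg_tuple()));
  stream2kernel_.emplace(stream, opkernel);
  return opkernel;
}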
b/oneflow/core/framework/op_interpreter.h @@ -33,6 +33,8 @@ namespace one { struct OpExprInterpContext { OpExprInterpContext(const AttrMap& attrs_arg) : attrs(attrs_arg) {} + OpExprInterpContext(const AttrMap& attrs_arg, const bool inplace) + : attrs(attrs_arg), inplace(inplace) {} OpExprInterpContext(const AttrMap& attrs_arg, Symbol device_arg) : attrs(attrs_arg), device(device_arg) {} OpExprInterpContext(const AttrMap& attrs_arg, Symbol device_arg, const bool pin_memory) @@ -56,6 +58,7 @@ struct OpExprInterpContext { Optional> parallel_desc; // for consistent op Optional> nd_sbp; // for consistent op Optional pin_memory; // for pin_memory related op + Optional inplace; // for inplace operation op std::shared_ptr state; }; diff --git a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp index c72f1a764ac..4c71d4f7300 100644 --- a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp @@ -29,7 +29,7 @@ limitations under the License. #include "oneflow/core/operator/operator.h" #include "oneflow/core/autograd/autograd_mode.h" #include "oneflow/core/boxing/eager_boxing_interpreter_mgr.h" -#include "oneflow/user/kernels/stateful_local_opkernel.h" +#include "oneflow/user/kernels/stateful_opkernel.h" #include "oneflow/core/framework/consistency_check.h" #include "oneflow/core/framework/tensor_rpc_util.h" #include "oneflow/core/framework/tensor_consistent_id.h" @@ -50,7 +50,7 @@ Maybe> GetParallelDesc(const TensorTuple& inputs, } std::string GetDynamicOpConsistentFailedDebugString(const UserOpExpr& user_op_expr, - const StatefulLocalOpKernel& kernel) { + const StatefulOpKernel& kernel) { CHECK(!kernel.output_tuple_indexes4mut2_obns().empty()); std::string plentysuffix = kernel.output_tuple_indexes4mut2_obns().size() == 1 ? "s" : ""; std::stringstream ss; @@ -147,7 +147,7 @@ Maybe Interpret(const UserOpExpr& user_op_expr, const TensorTuple& inputs, if (unlikely(JUST(CachedIsAllZeroSizeTensorMeta(output_tensor_metas)))) { return Maybe::Ok(); } - // Run instruction LocalCallOpKernel + // Run instruction Call const auto& kernel = JUST(user_op_expr.MutKernel4Stream(result->stream())); CHECK_EQ_OR_RETURN(kernel->output_tuple_indexes4mut2_obns().size(), 0) << Error::UnimplementedError() @@ -179,8 +179,8 @@ Maybe Interpret(const UserOpExpr& user_op_expr, const TensorTuple& inputs, output_eager_blob_objects->at(i) = JUST(local_tensor->eager_blob_object()); } JUST(PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { - return builder->LocalCallOpKernel(kernel, input_eager_blob_objects, output_eager_blob_objects, - result, ctx, result->stream()); + return builder->Call(kernel, input_eager_blob_objects, output_eager_blob_objects, result, ctx, + result->stream()); })); return Maybe::Ok(); } diff --git a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp index 8034dbfefb4..39353714be1 100644 --- a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp @@ -29,7 +29,7 @@ limitations under the License. 
#include "oneflow/core/common/stride.h" #include "oneflow/core/memory/memory_case_util.h" #include "oneflow/core/operator/operator.h" -#include "oneflow/user/kernels/stateful_local_opkernel.h" +#include "oneflow/user/kernels/stateful_opkernel.h" #include "oneflow/core/vm/vm_util.h" #include "oneflow/core/autograd/autograd_mode.h" #include "oneflow/core/framework/placement_sbp_util.h" @@ -119,7 +119,7 @@ Maybe NaiveInterpret(const UserOpExpr& user_op_expr, const TensorTuple& in // Infer devices if (!user_op_expr.has_device_and_stream_infer_fn()) { - stream = GetDefaultStreamByDevice(default_device); + stream = JUST(GetDefaultStreamByDevice(default_device)); for (int i = 0; i < outputs->size(); i++) { auto* tensor_impl = JUST(TensorImpl4Tensor(outputs->at(i))); *JUST(tensor_impl->mut_device()) = default_device; @@ -175,8 +175,7 @@ Maybe NaiveInterpret(const UserOpExpr& user_op_expr, const TensorTuple& in } JUST(PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { - return builder->LocalCallOpKernel(kernel, input_eager_blob_objects, output_eager_blob_objects, - ctx, stream); + return builder->Call(kernel, input_eager_blob_objects, output_eager_blob_objects, ctx, stream); })); return Maybe::Ok(); } diff --git a/oneflow/core/framework/op_interpreter/op_interpreter.cpp b/oneflow/core/framework/op_interpreter/op_interpreter.cpp index 1c0d2ded729..6dea92f954c 100644 --- a/oneflow/core/framework/op_interpreter/op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/op_interpreter.cpp @@ -90,6 +90,7 @@ Maybe AutogradInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& std::any_of(inputs.begin(), inputs.end(), [](const std::shared_ptr& tensor) { return tensor->requires_grad(); }); } + // NOTE: if this op not support stride, then need to tensor->contiguous() #define HANDLE_NON_CONTIGUOUS_INPUT(tensor_tuple_ptr) \ TensorTuple tmp_inputs; \ @@ -104,6 +105,8 @@ Maybe AutogradInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& { autograd::AutoGradMode mode(false); + const bool inplace = ctx.inplace.value_or(false); + if (inplace) { *outputs = *inputs_ptr; } JUST(internal_->Apply(op_expr, *inputs_ptr, outputs, ctx)); } // Lazy mode will construct backward compute graph in passes, so disable autograd if lazy mode. @@ -152,6 +155,7 @@ Maybe AutogradInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& requires_grad && IsSupportRequireGradDataType(output->dtype()->data_type()))); } } + if (requires_grad && !LazyMode::is_enabled()) { // Capture inputs and outputs after `AddBackwardFuncPtr` because of that grad function // node has been attached to them. 
diff --git a/oneflow/core/framework/placement_sbp_util.cpp b/oneflow/core/framework/placement_sbp_util.cpp index 2b0ba8dc42d..dd4cb6b6ebd 100644 --- a/oneflow/core/framework/placement_sbp_util.cpp +++ b/oneflow/core/framework/placement_sbp_util.cpp @@ -40,10 +40,10 @@ namespace { using IndexVector = DimVector; Maybe GetIndexesFromOffset(const Stride& strides, int64_t offset, IndexVector* indexes) { - indexes->resize(strides.NumAxes()); - for (int i = 0; i < strides.NumAxes(); ++i) { - indexes->at(i) = offset / strides.At(i); - offset = offset % strides.At(i); + indexes->resize(strides.size()); + for (int i = 0; i < strides.size(); ++i) { + indexes->at(i) = offset / strides.at(i); + offset = offset % strides.at(i); } CHECK_EQ_OR_RETURN(offset, 0); return Maybe::Ok(); @@ -51,10 +51,10 @@ Maybe GetIndexesFromOffset(const Stride& strides, int64_t offset, IndexVec Maybe GetOffsetFromIndexes(const Stride& strides, const IndexVector& indexes, int64_t* offset) { - CHECK_EQ_OR_RETURN(strides.NumAxes(), indexes.size()) + CHECK_EQ_OR_RETURN(strides.size(), indexes.size()) << Error::RuntimeError() << "Expected size of strides to match that of indexes"; *offset = 0; - for (int i = 0; i < strides.NumAxes(); ++i) { *offset += indexes.at(i) * strides.At(i); } + for (int i = 0; i < strides.size(); ++i) { *offset += indexes.at(i) * strides.at(i); } return Maybe::Ok(); } @@ -124,7 +124,7 @@ Maybe> CalcSubParallelDesc4Axis(Symbol parall int64_t index = CalcIndex4Axis(parallel_id, hierarchy_strides, axis); - int64_t stride = hierarchy_strides.At(axis); + int64_t stride = hierarchy_strides.at(axis); int64_t start_parallel_id = parallel_id - index * stride; ParallelConf parallel_conf; @@ -708,13 +708,12 @@ Maybe RawCheckIsNdSbpBoxingAcyclicWithDecompose(Symbol in, } // namespace int64_t CalcIndex4Axis(int64_t offset, const Stride& stride, int axis) { - CHECK_LT(axis, stride.NumAxes()) - << "Expected axis (" << axis << ") to be less than size of stride (" << stride.NumAxes() - << ")"; + CHECK_LT(axis, stride.size()) << "Expected axis (" << axis << ") to be less than size of stride (" + << stride.size() << ")"; if (axis == 0) { - return offset / stride.At(0); + return offset / stride.at(0); } else { - return offset % stride.At(axis - 1) / stride.At(axis); + return offset % stride.at(axis - 1) / stride.at(axis); } } diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 96f49089006..7b1d1129e93 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -16,7 +16,6 @@ limitations under the License. 
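The Stride accessor change above (NumAxes()/At(i) -> size()/at(i)) does not alter the arithmetic, so a quick worked example of CalcIndex4Axis may help when checking those hunks. For a hierarchy (4, 2, 3) the row-major strides are (6, 3, 1); then for offset = 17:

// axis 0: 17 / 6      = 2
// axis 1: 17 % 6 / 3  = 5 / 3 = 1
// axis 2: 17 % 3 / 1  = 2
// offset 17 decomposes into indexes (2, 1, 2), and
// 2*6 + 1*3 + 2*1 == 17 reproduces GetOffsetFromIndexes.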
#include "oneflow/core/framework/sbp_infer_util.h" #include "oneflow/core/auto_parallel/boxing_collector.h" -#include "oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h" #include "oneflow/core/boxing/eager_boxing_interpreter_mgr.h" #include "oneflow/core/common/util.h" #include "oneflow/core/job/lazy_mode.h" @@ -47,10 +46,11 @@ bool CheckNdSbp(const NdSbp& nd_sbp) { double Penalty4PartialInConsumer(double logical_blob_size, int32_t producer_parallel_num, int32_t consumer_parallel_num) { - static const int64_t PartialInConsumerType = ParseIntegerFromEnv("PartialInConsumerTag", 2); - if (PartialInConsumerType == PartialInConsumerTag::kSlight) { + static const int64_t penalty4partial_in_consumer_tag = + ParseIntegerFromEnv("ONEFLOW_PENALTY_FOR_PARTIAL_IN_CONSUMER_POLICY", 2); + if (penalty4partial_in_consumer_tag == Penalty4PartialInConsumerTag::kSlight) { return 1.0; - } else if (PartialInConsumerType == PartialInConsumerTag::kMiddle) { + } else if (penalty4partial_in_consumer_tag == Penalty4PartialInConsumerTag::kMiddle) { return 4 * logical_blob_size * (producer_parallel_num + consumer_parallel_num); } else { return kUnsupportedBoxing; @@ -381,8 +381,103 @@ Maybe GetComputeCopyCostFunc() { } } +void CollaborativeParallelDimReduce(const ParallelDesc& in_parallel_desc, + const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, + const NdSbp& out_nd_sbp, ParallelDesc* reduced_in_parallel_desc, + ParallelDesc* reduced_out_parallel_desc, + NdSbp* reduced_in_nd_sbp, NdSbp* reduced_out_nd_sbp) { + const auto& in_hierarchy = in_parallel_desc.hierarchy(); + const auto& out_hierarchy = out_parallel_desc.hierarchy(); + CHECK_EQ(in_hierarchy->NumAxes(), out_hierarchy->NumAxes()); + + DimVector reduced_in_hierarchy; + DimVector reduced_out_hierarchy; + FOR_RANGE(int64_t, i, 0, in_hierarchy->NumAxes()) { + if (in_hierarchy->At(i) != 1 || out_hierarchy->At(i) != 1) { + if (reduced_in_nd_sbp->sbp_parallel().empty() + || (in_nd_sbp.sbp_parallel(i) + != reduced_in_nd_sbp->sbp_parallel(reduced_in_nd_sbp->sbp_parallel_size() - 1) + || out_nd_sbp.sbp_parallel(i) + != reduced_out_nd_sbp->sbp_parallel(reduced_out_nd_sbp->sbp_parallel_size() + - 1))) { + reduced_in_hierarchy.emplace_back(in_hierarchy->At(i)); + *reduced_in_nd_sbp->add_sbp_parallel() = in_nd_sbp.sbp_parallel(i); + + reduced_out_hierarchy.emplace_back(out_hierarchy->At(i)); + *reduced_out_nd_sbp->add_sbp_parallel() = out_nd_sbp.sbp_parallel(i); + } else { + reduced_in_hierarchy.back() *= in_hierarchy->At(i); + reduced_out_hierarchy.back() *= out_hierarchy->At(i); + } + } + } + if (reduced_in_hierarchy.empty()) { + reduced_in_hierarchy.emplace_back(in_hierarchy->At(0)); + *reduced_in_nd_sbp->add_sbp_parallel() = in_nd_sbp.sbp_parallel(0); + + reduced_out_hierarchy.emplace_back(out_hierarchy->At(0)); + *reduced_out_nd_sbp->add_sbp_parallel() = out_nd_sbp.sbp_parallel(0); + } + + ParallelConf reduced_in_parallel_conf = in_parallel_desc.parallel_conf(); + Shape(reduced_in_hierarchy).ToProto(reduced_in_parallel_conf.mutable_hierarchy()); + *reduced_in_parallel_desc = ParallelDesc(reduced_in_parallel_conf); + + ParallelConf reduced_out_parallel_conf = out_parallel_desc.parallel_conf(); + Shape(reduced_out_hierarchy).ToProto(reduced_out_parallel_conf.mutable_hierarchy()); + *reduced_out_parallel_desc = ParallelDesc(reduced_out_parallel_conf); +} + } // namespace +void NdSbpDimReduce(const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, + ParallelDesc* reduced_parallel_desc, NdSbp* reduced_nd_sbp) { + const auto& hierarchy = 
parallel_desc.hierarchy(); + DimVector reduced_hierarchy; + FOR_RANGE(int64_t, i, 0, hierarchy->NumAxes()) { + if (hierarchy->At(i) != 1) { + if (reduced_nd_sbp->sbp_parallel().empty() + || (nd_sbp.sbp_parallel(i) + != reduced_nd_sbp->sbp_parallel(reduced_nd_sbp->sbp_parallel_size() - 1))) { + reduced_hierarchy.emplace_back(hierarchy->At(i)); + *reduced_nd_sbp->add_sbp_parallel() = nd_sbp.sbp_parallel(i); + } else { + reduced_hierarchy.back() *= hierarchy->At(i); + } + } + } + // [1, 1, ..., 1]: Any --> [1]: (B) + if (reduced_hierarchy.empty()) { + reduced_hierarchy.emplace_back(hierarchy->At(0)); + reduced_nd_sbp->add_sbp_parallel()->mutable_broadcast_parallel(); + } + ParallelConf reduced_parallel_conf = parallel_desc.parallel_conf(); + Shape(reduced_hierarchy).ToProto(reduced_parallel_conf.mutable_hierarchy()); + *reduced_parallel_desc = ParallelDesc(reduced_parallel_conf); +} + +void InOutParallelDimReduce(const ParallelDesc& in_parallel_desc, + const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, + const NdSbp& out_nd_sbp, ParallelDesc* reduced_in_parallel_desc, + ParallelDesc* reduced_out_parallel_desc, NdSbp* reduced_in_nd_sbp, + NdSbp* reduced_out_nd_sbp) { + const int64_t in_hierarchy_axes = in_parallel_desc.hierarchy()->NumAxes(); + const int64_t out_hierarchy_axes = out_parallel_desc.hierarchy()->NumAxes(); + if (in_hierarchy_axes == 1 && out_hierarchy_axes == 1) { + *reduced_in_parallel_desc = in_parallel_desc; + *reduced_out_parallel_desc = out_parallel_desc; + *reduced_in_nd_sbp = in_nd_sbp; + *reduced_out_nd_sbp = out_nd_sbp; + } else if (in_hierarchy_axes != out_hierarchy_axes) { + NdSbpDimReduce(in_parallel_desc, in_nd_sbp, reduced_in_parallel_desc, reduced_in_nd_sbp); + NdSbpDimReduce(out_parallel_desc, out_nd_sbp, reduced_out_parallel_desc, reduced_out_nd_sbp); + } else { + CollaborativeParallelDimReduce(in_parallel_desc, out_parallel_desc, in_nd_sbp, out_nd_sbp, + reduced_in_parallel_desc, reduced_out_parallel_desc, + reduced_in_nd_sbp, reduced_out_nd_sbp); + } +} + Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel, const NdSbp& consumer_sbp_parallel, const BlobDesc& logical_blob_desc, @@ -626,7 +721,6 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, // Decide the priority to infer sbp double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, const NdSbp& consumer_sbp_parallel, - const BlobDesc& logical_blob_desc, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, bool requires_same_sbp) { ParallelDesc reduced_in_parallel_desc = producer_parallel_desc; @@ -650,9 +744,9 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, } } else { // This blob supports boxing - if (reduced_in_nd_sbp == reduced_out_nd_sbp - && *reduced_in_parallel_desc.hierarchy() == *reduced_out_parallel_desc.hierarchy()) { + if (reduced_in_nd_sbp == reduced_out_nd_sbp) { // Highest priority: this blob have the same sbp on both the producer and consumer + // Not just [0-3] -> [4-7], but also cpu:[0] -> cuda:[0-3] return 0.0; } else { // Normal priority: transfer occurs diff --git a/oneflow/core/framework/sbp_infer_util.h b/oneflow/core/framework/sbp_infer_util.h index e9c30cc4c8c..42c01e29e57 100644 --- a/oneflow/core/framework/sbp_infer_util.h +++ b/oneflow/core/framework/sbp_infer_util.h @@ -27,12 +27,21 @@ enum SbpInferRuleTag : int { kMinCost = 3 // Lowest cost }; -enum PartialInConsumerTag : int { +enum Penalty4PartialInConsumerTag : int { kSlight = 1, // Slight penalty 
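As a concrete check on the dim-reduce helpers added above (NdSbpDimReduce drops hierarchy axes of size 1 and merges adjacent axes whose SBP repeats):

// hierarchy [2, 2] with nd_sbp (S(0), S(0))  ->  hierarchy [4], nd_sbp (S(0))
// hierarchy [4, 1] with nd_sbp (S(0), B)     ->  hierarchy [4], nd_sbp (S(0))   (size-1 axis dropped)
// hierarchy [1, 1] with any nd_sbp           ->  hierarchy [1], nd_sbp (B)      (the "[1, 1, ..., 1]: Any --> [1]: (B)" case)
// InOutParallelDimReduce copies through when both sides already have a single axis,
// reduces each side independently when the axis counts differ, and only runs
// CollaborativeParallelDimReduce for equal multi-axis hierarchies.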
kMiddle = 2, // Make sure we do not select P in the consumer kStrict = 3 // Not allow a transfer to P }; +void NdSbpDimReduce(const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, + ParallelDesc* reduced_parallel_desc, NdSbp* reduced_nd_sbp); + +void InOutParallelDimReduce(const ParallelDesc& in_parallel_desc, + const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, + const NdSbp& out_nd_sbp, ParallelDesc* reduced_in_parallel_desc, + ParallelDesc* reduced_out_parallel_desc, NdSbp* reduced_in_nd_sbp, + NdSbp* reduced_out_nd_sbp); + double GetValidMaxCopyCost(); double GetTransferCost(); @@ -84,7 +93,6 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, // 2.0: Penality, the same as infinity double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, const NdSbp& consumer_sbp_parallel, - const BlobDesc& logical_blob_desc, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, bool requires_same_sbp); diff --git a/oneflow/core/framework/stream.cpp b/oneflow/core/framework/stream.cpp index c10bf0cf4fa..ba9facf5b6f 100644 --- a/oneflow/core/framework/stream.cpp +++ b/oneflow/core/framework/stream.cpp @@ -17,49 +17,37 @@ limitations under the License. #include "oneflow/core/framework/stream_is_comm_net_stream.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/common/static_global.h" +#include "oneflow/core/common/global.h" #include "oneflow/core/job/parallel_desc.h" -#include "oneflow/core/vm/vm_object.h" -#include "oneflow/core/intrusive/intrusive.h" +#include "oneflow/core/framework/stream_mgr.h" namespace oneflow { -namespace { - -intrusive::shared_ptr RawGetStaticGlobalTransportLocalDepObject() { - return intrusive::make_shared(); -} +Stream::Stream(Symbol device, StreamRole stream_role) + : device_(device), stream_role_(stream_role), unique_stream_id_(-1) {} -intrusive::shared_ptr RawNewComputeDepObject(Symbol, StreamRole) { - return intrusive::make_shared(); +Maybe Stream::Init(size_t unique_stream_id) { + unique_stream_id_ = unique_stream_id; + return Maybe::Ok(); } -} // namespace - -LocalDepObject* GetStaticGlobalTransportLocalDepObject() { - static constexpr auto* GetLocalDepObject = - DECORATE(&RawGetStaticGlobalTransportLocalDepObject, StaticGlobalCopiable); - return GetLocalDepObject().Mutable(); +/*static*/ Maybe> Stream::RawNew(Symbol device, StreamRole stream_role) { + std::shared_ptr stream(new Stream(device, stream_role)); + return JUST(GlobalMaybe()) + ->AddStreamSymbol(*stream, [&](size_t unique_stream_id) -> Maybe> { + JUST(stream->Init(unique_stream_id)); + return SymbolOf(*stream); + }); } -Stream::Stream(Symbol device, StreamRole stream_role) - : device_(device), - stream_role_(stream_role), - schedule_local_dep_object_(nullptr), - transport_local_dep_object_(NullOpt) { - static constexpr auto* GetComputeDep = DECORATE(&RawNewComputeDepObject, StaticGlobalCopiable); - schedule_local_dep_object_ = GetComputeDep(device, stream_role).Mutable(); - if (StreamRoleSwitch(stream_role)) { - transport_local_dep_object_ = GetStaticGlobalTransportLocalDepObject(); - } +/*static*/ Maybe> Stream::New(Symbol device, StreamRole stream_role) { + constexpr auto* Make = DECORATE(&Stream::RawNew, ThreadLocal); + return Make(device, stream_role); } namespace { -Symbol RawNewStream(Symbol device, StreamRole stream_role) { - return SymbolOf(Stream(device, stream_role)); -} - -Symbol RawGetDefaultStreamByDevice(Symbol device) { +Maybe> RawGetDefaultStreamByDevice(Symbol device) { return 
Stream::New(device, StreamRole::kCompute); } @@ -69,8 +57,6 @@ Maybe> RawGetDefaultStreamByPlacement(Symbol parall } // namespace -decltype(Stream::New) Stream::New = DECORATE(&RawNewStream, ThreadLocal); - decltype(GetDefaultStreamByDevice) GetDefaultStreamByDevice = DECORATE(&RawGetDefaultStreamByDevice, ThreadLocal); diff --git a/oneflow/core/framework/stream.h b/oneflow/core/framework/stream.h index 52af85eb9d5..e851eb1e8e6 100644 --- a/oneflow/core/framework/stream.h +++ b/oneflow/core/framework/stream.h @@ -25,11 +25,6 @@ limitations under the License. namespace oneflow { -namespace vm { -class MirroredObject; -} -using LocalDepObject = vm::MirroredObject; - class Stream final { public: Stream(const Stream&) = default; @@ -41,29 +36,25 @@ class Stream final { } bool operator!=(const Stream& that) const { return !(*this == that); } - Stream(Symbol device, StreamRole stream_role); - - static Symbol (*New)(Symbol device, StreamRole stream_role); + static Maybe> New(Symbol device, StreamRole stream_role); Symbol device() const { return device_; } StreamRole stream_role() const { return stream_role_; } - - LocalDepObject* mut_schedule_local_dep_object() const { return schedule_local_dep_object_; } - const Optional& mut_transport_local_dep_object() const { - return transport_local_dep_object_; - } + size_t unique_stream_id() const { return unique_stream_id_; } private: + Stream(Symbol device, StreamRole stream_role); + + static Maybe> RawNew(Symbol device, StreamRole stream_role); + + Maybe Init(size_t unique_stream_id); + Symbol device_; StreamRole stream_role_; - - LocalDepObject* schedule_local_dep_object_; - Optional transport_local_dep_object_; + size_t unique_stream_id_; }; -LocalDepObject* GetStaticGlobalTransportLocalDepObject(); - -extern Symbol (*GetDefaultStreamByDevice)(Symbol); +extern Maybe> (*GetDefaultStreamByDevice)(Symbol); class ParallelDesc; extern Maybe> (*GetDefaultStreamByPlacement)(Symbol); diff --git a/oneflow/core/framework/stream_get_call_instruction_name.h b/oneflow/core/framework/stream_get_call_instruction_name.h deleted file mode 100644 index 774a3e2aaff..00000000000 --- a/oneflow/core/framework/stream_get_call_instruction_name.h +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_FRAMEWORK_STREAM_GET_CALL_INSTRUCTION_NAME_H_ -#define ONEFLOW_CORE_FRAMEWORK_STREAM_GET_CALL_INSTRUCTION_NAME_H_ - -#include -#include -#include "oneflow/core/common/stream_role.h" -#include "oneflow/core/common/device_type.h" -#include "oneflow/core/common/maybe.h" -#include "oneflow/core/framework/to_string.h" - -namespace oneflow { - -struct GetCallInstructionName { - static Maybe Case(StreamRoleCase, - DeviceType device_type) { // NOLINT - static constexpr auto* Get = DECORATE(&Call::Invalid, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::Compute, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::Host2Device, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::Device2Host, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::SyncedLaunchedCommNet, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::AsyncedLaunchedCommNet, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::CriticalSection, ThreadLocal); - return *JUST(Get(device_type)); - } - - private: - struct Call { - static Maybe Invalid(DeviceType device_type) { // NOLINT - UNIMPLEMENTED_THEN_RETURN(); - } - static Maybe Compute(DeviceType device_type) { - return *JUST(DeviceTag4DeviceType(device_type)) + ".LocalCallOpKernel"; - } - static Maybe Host2Device(DeviceType device_type) { - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("cuda_h2d.LocalCallOpKernel"); - } - static Maybe Device2Host(DeviceType device_type) { - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("cuda_d2h.LocalCallOpKernel"); - } - static Maybe SyncedLaunchedCommNet(DeviceType device_type) { - if (device_type == kCPU) { return std::string("cpu.LocalCallOpKernel"); } - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("cuda.LocalCallOpKernel"); - } - static Maybe AsyncedLaunchedCommNet(DeviceType device_type) { - if (device_type == kCPU) { return std::string("cpu.LocalCallOpKernel"); } - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("async.cuda.LocalCallOpKernel"); - } - static Maybe CriticalSection(DeviceType device_type) { - UNIMPLEMENTED_THEN_RETURN(); - } - }; -}; - -} // namespace oneflow - -#endif // ONEFLOW_CORE_FRAMEWORK_STREAM_GET_CALL_INSTRUCTION_NAME_H_ diff --git a/oneflow/core/framework/stream_get_release_instruction_name.h b/oneflow/core/framework/stream_get_release_instruction_name.h deleted file mode 100644 index 262da8c29cc..00000000000 --- a/oneflow/core/framework/stream_get_release_instruction_name.h +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_FRAMEWORK_STREAM_GET_RELEASE_INSTRUCTION_NAME_H_ -#define ONEFLOW_CORE_FRAMEWORK_STREAM_GET_RELEASE_INSTRUCTION_NAME_H_ - -#include -#include -#include "oneflow/core/common/stream_role.h" -#include "oneflow/core/common/device_type.h" -#include "oneflow/core/common/maybe.h" -#include "oneflow/core/framework/to_string.h" - -namespace oneflow { - -struct GetReleaseInstructionName { - static Maybe Case(StreamRoleCase, - DeviceType device_type) { // NOLINT - static constexpr auto* Get = DECORATE(&Call::Invalid, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::Compute, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::Host2Device, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::Device2Host, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::SyncedLaunchedCommNet, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::AsyncedLaunchedCommNet, ThreadLocal); - return *JUST(Get(device_type)); - } - static Maybe Case(StreamRoleCase, - DeviceType device_type) { - static constexpr auto* Get = DECORATE(&Call::CriticalSection, ThreadLocal); - return *JUST(Get(device_type)); - } - - private: - struct Call { - static Maybe Invalid(DeviceType device_type) { // NOLINT - UNIMPLEMENTED_THEN_RETURN(); - } - static Maybe Compute(DeviceType device_type) { - return *JUST(DeviceTag4DeviceType(device_type)) + ".ReleaseTensor"; - } - static Maybe Host2Device(DeviceType device_type) { - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("cuda_h2d.ReleaseTensor"); - } - static Maybe Device2Host(DeviceType device_type) { - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("cuda_d2h.ReleaseTensor"); - } - static Maybe SyncedLaunchedCommNet(DeviceType device_type) { - if (device_type == kCPU) { return std::string("comm_net.ReleaseTensor"); } - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("sync_launched_nccl.ReleaseTensor"); - } - static Maybe AsyncedLaunchedCommNet(DeviceType device_type) { - if (device_type == kCPU) { return std::string("comm_net.ReleaseTensor"); } - CHECK_EQ_OR_RETURN(device_type, kCUDA); - return std::string("async_launched_nccl.ReleaseTensor"); - } - static Maybe CriticalSection(DeviceType device_type) { - UNIMPLEMENTED_THEN_RETURN(); - } - }; -}; - -} // namespace oneflow - -#endif // ONEFLOW_CORE_FRAMEWORK_STREAM_GET_RELEASE_INSTRUCTION_NAME_H_ diff --git a/oneflow/core/framework/stream_get_stream_role_name.h b/oneflow/core/framework/stream_get_stream_role_name.h new file mode 100644 index 00000000000..b87148b2d6d --- /dev/null +++ b/oneflow/core/framework/stream_get_stream_role_name.h @@ -0,0 
+1,40 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_STREAM_GET_STREAM_ROLE_NAME_H_ +#define ONEFLOW_CORE_FRAMEWORK_STREAM_GET_STREAM_ROLE_NAME_H_ + +#include +#include +#include "oneflow/core/common/stream_role.h" +#include "oneflow/core/common/device_type.h" +#include "oneflow/core/framework/to_string.h" + +namespace oneflow { + +struct GetStreamRoleName : public StreamRoleVisitor { + static const char* VisitCompute() { return "compute"; } + static const char* VisitHost2Device() { return "h2d"; } + static const char* VisitDevice2Host() { return "d2h"; } + static const char* VisitSyncedLaunchedCommNet() { return "synced_launched_comm_net"; } + static const char* VisitAsyncedLaunchedCommNet() { return "asynced_launched_comm_net"; } + static const char* VisitBarrier() { return "barrier"; } + static const char* VisitCriticalSection() { return "critical_section"; } + static const char* VisitLazyJobLauncher() { return "lazy_job_launcher"; } +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_STREAM_GET_STREAM_ROLE_NAME_H_ diff --git a/oneflow/core/framework/stream_is_comm_net_stream.h b/oneflow/core/framework/stream_is_comm_net_stream.h index c60906c7ff1..ccc231948f1 100644 --- a/oneflow/core/framework/stream_is_comm_net_stream.h +++ b/oneflow/core/framework/stream_is_comm_net_stream.h @@ -21,16 +21,15 @@ limitations under the License. namespace oneflow { -struct IsCommNetStream { - static bool Case(StreamRoleCase) { // NOLINT - LOG(FATAL); - } - static bool Case(StreamRoleCase) { return false; } - static bool Case(StreamRoleCase) { return false; } - static bool Case(StreamRoleCase) { return false; } - static bool Case(StreamRoleCase) { return true; } - static bool Case(StreamRoleCase) { return true; } - static bool Case(StreamRoleCase) { return false; } +struct IsCommNetStream final : public StreamRoleVisitor { + static bool VisitCompute() { return false; } + static bool VisitHost2Device() { return false; } + static bool VisitDevice2Host() { return false; } + static bool VisitSyncedLaunchedCommNet() { return true; } + static bool VisitAsyncedLaunchedCommNet() { return true; } + static bool VisitBarrier() { return false; } + static bool VisitCriticalSection() { return false; } + static bool VisitLazyJobLauncher() { return false; } }; } // namespace oneflow diff --git a/oneflow/core/framework/stream_mgr.cpp b/oneflow/core/framework/stream_mgr.cpp new file mode 100644 index 00000000000..4c1e44ec85e --- /dev/null +++ b/oneflow/core/framework/stream_mgr.cpp @@ -0,0 +1,61 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
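The switch from the old StreamRoleCase overload style to visitor structs deriving from StreamRoleVisitor means every newly added StreamRole value forces each visitor to grow a matching Visit... method, so exhaustiveness is checked at compile time. A minimal sketch of how such a visitor is consumed, assuming StreamRoleVisitor provides the static Visit(stream_role, args...) dispatcher that the rest of this series relies on:

#include <glog/logging.h>
#include "oneflow/core/framework/stream_get_stream_role_name.h"
#include "oneflow/core/framework/stream_is_comm_net_stream.h"

void LogStreamRole(StreamRole role) {  // hypothetical helper for illustration
  // Each call resolves to exactly one static Visit* method per enumerator.
  const char* name = GetStreamRoleName::Visit(role);
  bool is_comm_net = IsCommNetStream::Visit(role);
  LOG(INFO) << "stream role " << name << (is_comm_net ? " (comm-net)" : "");
}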
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/stream_mgr.h"
+#include "oneflow/core/common/container_util.h"
+#include "oneflow/core/common/global.h"
+#include "oneflow/core/common/util.h"
+
+namespace oneflow {
+
+Maybe> StreamMgr::AddStreamSymbol(
+    const Stream& stream,
+    const std::function>(size_t unique_stream_id)>& CreateStreamSymbol) {
+  Symbol stream_symbol;
+  std::unique_lock lock(mutex_);
+  if (stream2unique_stream_id_.count(stream) > 0) {
+    size_t unique_stream_id = stream2unique_stream_id_[stream];
+    auto existed_stream_symbol = JUST(VectorAt(unique_stream_id2stream_symbol_, unique_stream_id));
+    stream_symbol = JUST(CreateStreamSymbol(unique_stream_id));
+    CHECK_OR_RETURN(existed_stream_symbol == stream_symbol)
+        << "the result of the current call to CreateStreamSymbol does not match the result of "
+           "the previous call to CreateStreamSymbol";
+  } else {
+    size_t unique_stream_id = unique_stream_id2stream_symbol_.size();
+    stream2unique_stream_id_[stream] = unique_stream_id;
+    stream_symbol = JUST(CreateStreamSymbol(unique_stream_id));
+    unique_stream_id2stream_symbol_.push_back(stream_symbol);
+    CHECK_OR_RETURN(unique_stream_id2stream_symbol_[unique_stream_id] == stream)
+        << "the result of CreateStreamSymbol is not the symbol of `stream`";
+    CHECK_EQ_OR_RETURN(unique_stream_id2stream_symbol_[unique_stream_id]->unique_stream_id(),
+                       unique_stream_id)
+        << "unique_stream_id is wrongly initialized";
+  }
+  return stream_symbol;
+}
+
+size_t StreamMgr::UniqueStreamSize() const {
+  std::unique_lock lock(mutex_);
+  return unique_stream_id2stream_symbol_.size();
+}
+
+Maybe> StreamMgr::GetStreamSymbol(size_t unique_stream_id) const {
+  std::unique_lock lock(mutex_);
+  return JUST(VectorAt(unique_stream_id2stream_symbol_, unique_stream_id));
+}
+
+COMMAND(Global::SetAllocated(new StreamMgr()));
+
+}  // namespace oneflow
diff --git a/oneflow/core/framework/stream_mgr.h b/oneflow/core/framework/stream_mgr.h
new file mode 100644
index 00000000000..a38ee2b183e
--- /dev/null
+++ b/oneflow/core/framework/stream_mgr.h
@@ -0,0 +1,48 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_STREAM_MGR_H_ +#define ONEFLOW_CORE_FRAMEWORK_STREAM_MGR_H_ + +#include +#include +#include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/optional.h" +#include "oneflow/core/framework/stream.h" + +namespace oneflow { + +class StreamMgr final { + public: + StreamMgr() = default; + ~StreamMgr() = default; + + Maybe> AddStreamSymbol( + const Stream& stream, + const std::function>(size_t unique_stream_id)>& CreateStreamSymbol); + + size_t UniqueStreamSize() const; + + Maybe> GetStreamSymbol(size_t unique_stream_id) const; + + private: + mutable std::mutex mutex_; + std::vector> unique_stream_id2stream_symbol_; + std::unordered_map stream2unique_stream_id_; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_STREAM_MGR_H_ diff --git a/oneflow/core/framework/stream_need_soft_sync.h b/oneflow/core/framework/stream_need_soft_sync.h index d783c8f4d2c..35dcb71fd30 100644 --- a/oneflow/core/framework/stream_need_soft_sync.h +++ b/oneflow/core/framework/stream_need_soft_sync.h @@ -22,22 +22,15 @@ limitations under the License. namespace oneflow { -struct NeedSoftSync { - static bool Case(StreamRoleCase, DeviceType) { // NOLINT - LOG(FATAL); - } - static bool Case(StreamRoleCase, DeviceType device_type) { - return device_type != kCPU; - } - static bool Case(StreamRoleCase, DeviceType) { return false; } - static bool Case(StreamRoleCase, DeviceType) { return false; } - static bool Case(StreamRoleCase, DeviceType device_type) { - return device_type != kCPU; - } - static bool Case(StreamRoleCase, DeviceType) { - return false; - } - static bool Case(StreamRoleCase, DeviceType) { return false; } +struct NeedSoftSync : public StreamRoleVisitor { + static bool VisitCompute(DeviceType device_type) { return device_type != kCPU; } + static bool VisitHost2Device(DeviceType) { return false; } + static bool VisitDevice2Host(DeviceType) { return false; } + static bool VisitSyncedLaunchedCommNet(DeviceType device_type) { return device_type != kCPU; } + static bool VisitAsyncedLaunchedCommNet(DeviceType) { return false; } + static bool VisitBarrier(DeviceType) { return false; } + static bool VisitCriticalSection(DeviceType) { return false; } + static bool VisitLazyJobLauncher(DeviceType) { return false; } }; } // namespace oneflow diff --git a/oneflow/core/framework/stream_on_independent_thread.h b/oneflow/core/framework/stream_on_independent_thread.h new file mode 100644 index 00000000000..54795a6f746 --- /dev/null +++ b/oneflow/core/framework/stream_on_independent_thread.h @@ -0,0 +1,37 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_STREAM_ON_INDEPENDENT_THREAD_H_ +#define ONEFLOW_CORE_FRAMEWORK_STREAM_ON_INDEPENDENT_THREAD_H_ + +#include +#include "oneflow/core/common/stream_role.h" + +namespace oneflow { + +struct StreamOnIndependentThread : public StreamRoleVisitor { + static bool VisitCompute() { return false; } + static bool VisitHost2Device() { return false; } + static bool VisitDevice2Host() { return false; } + static bool VisitSyncedLaunchedCommNet() { return false; } + static bool VisitAsyncedLaunchedCommNet() { return false; } + static bool VisitBarrier() { return false; } + static bool VisitCriticalSection() { return true; } + static bool VisitLazyJobLauncher() { return true; } +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_STREAM_ON_INDEPENDENT_THREAD_H_ diff --git a/oneflow/core/framework/tensor.cpp b/oneflow/core/framework/tensor.cpp index e1817ef9836..9383d40055d 100644 --- a/oneflow/core/framework/tensor.cpp +++ b/oneflow/core/framework/tensor.cpp @@ -87,7 +87,7 @@ Maybe MirroredTensor::clone() const { const auto& device_type = JUST(this->device())->type(); int64_t device_id = JUST(this->device())->device_id(); std::shared_ptr input = std::const_pointer_cast(shared_from_this()); - const bool pin_memory = JUST(JUST(input->AsMirroredTensor())->eager_blob_object())->pin_memory(); + const bool pin_memory = JUST(JUST(input->AsMirroredTensor())->is_pinned()); return JUST(functional::Copy(input, device_type, device_id, /*pin_memory=*/pin_memory)); } diff --git a/oneflow/core/framework/tensor.h b/oneflow/core/framework/tensor.h index b12ee18907b..faaa90b5b2e 100644 --- a/oneflow/core/framework/tensor.h +++ b/oneflow/core/framework/tensor.h @@ -60,6 +60,7 @@ class Tensor : public std::enable_shared_from_this { virtual bool is_lazy() const = 0; virtual bool is_eager() const { return !is_lazy(); } virtual bool is_contiguous() const = 0; + virtual Maybe is_pinned() const = 0; virtual const TensorMeta& tensor_meta() const = 0; virtual Maybe data() = 0; virtual std::shared_ptr pin_memory() const = 0; @@ -204,6 +205,7 @@ class StaticZerosTensor final : public Tensor { PRINT_BUG_PROMPT_AND_ABORT(); return true; } + Maybe is_pinned() const override { RETURN_ERROR_WITH_BUG_PROMPT(); } std::shared_ptr grad_fn_node() const override { PRINT_BUG_PROMPT_AND_ABORT(); return nullptr; @@ -360,6 +362,7 @@ class ProxyTensor : public TensorIf { virtual bool is_leaf() const override { return tensor_->is_leaf(); } virtual bool retain_grad() const override { return tensor_->retain_grad(); } virtual bool is_contiguous() const override { return tensor_->is_contiguous(); } + virtual Maybe is_pinned() const override { return tensor_->is_pinned(); } virtual Maybe acc_grad() const override { return tensor_->acc_grad(); } virtual Maybe current_grad() const override { return tensor_->current_grad(); } virtual Maybe detach() const override { return tensor_->detach(); } @@ -488,6 +491,7 @@ class MirroredTensor final : public TensorIf { bool is_leaf() const override { return impl_->is_leaf(); } bool retain_grad() const override { return impl_->retain_grad(); } bool is_contiguous() const override { return impl_->is_contiguous(); } + Maybe is_pinned() const override { return impl_->is_pinned(); }; // Setters for autograd Maybe set_acc_grad(const std::shared_ptr& grad) override { @@ -606,6 +610,9 @@ class ConsistentTensor final : public TensorIf { bool is_leaf() const override { return impl_->is_leaf(); } bool retain_grad() const override { return impl_->retain_grad(); } bool is_contiguous() 
const override { return impl_->is_contiguous(); } + Maybe is_pinned() const override { + OF_RUNTIME_ERROR() << "Global tensor has no is_pinned method"; + } // Setters for autograd Maybe set_acc_grad(const std::shared_ptr& grad) override { diff --git a/oneflow/core/framework/tensor_consistent_id.cpp b/oneflow/core/framework/tensor_consistent_id.cpp index bcaf69e4142..f004f81c464 100644 --- a/oneflow/core/framework/tensor_consistent_id.cpp +++ b/oneflow/core/framework/tensor_consistent_id.cpp @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/common/decorator.h" #include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_tuple.h" #include "oneflow/core/framework/transport_token.h" diff --git a/oneflow/core/framework/tensor_impl.cpp b/oneflow/core/framework/tensor_impl.cpp index 8b0c074efc7..832fc8b4d8d 100644 --- a/oneflow/core/framework/tensor_impl.cpp +++ b/oneflow/core/framework/tensor_impl.cpp @@ -83,12 +83,11 @@ EagerMirroredTensorImpl::EagerMirroredTensorImpl( Maybe EagerMirroredTensorImpl::UpdateTensorStorage() { const auto& eager_blob_object = eager_blob_object_; tensor_storage_ = std::make_shared(eager_blob_object->tensor_storage()); - const auto& parallel_desc = JUST(Placement4Device(this->device())).shared_from_symbol(); tensor_storage_->set_releaser_hook( - [eager_blob_object, parallel_desc](const std::shared_ptr&) { + [eager_blob_object](const std::shared_ptr&) { CHECK_JUST(PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { if (eager_blob_object->producer_stream().has_value()) { - JUST(builder->ReleaseTensor(eager_blob_object, parallel_desc)); + JUST(builder->ReleaseTensor(eager_blob_object)); } return Maybe::Ok(); })); @@ -122,6 +121,11 @@ Maybe EagerMirroredTensorImpl::InitEagerBlobObject( return Maybe::Ok(); } +Maybe EagerMirroredTensorImpl::is_pinned() const { + if (!eager_blob_object_) { return false; } + return eager_blob_object_->pin_memory(); +} + Maybe EagerMirroredTensorImpl::set_eager_blob_object( std::shared_ptr eager_blob_object) { eager_blob_object_ = eager_blob_object; diff --git a/oneflow/core/framework/tensor_impl.h b/oneflow/core/framework/tensor_impl.h index 3ddfefd28a8..d204f20689a 100644 --- a/oneflow/core/framework/tensor_impl.h +++ b/oneflow/core/framework/tensor_impl.h @@ -64,6 +64,7 @@ class TensorImpl { virtual Maybe has_eager_blob_object() const = 0; virtual Maybe storage_offset() const { OF_UNIMPLEMENTED(); } virtual bool is_contiguous() const = 0; + virtual Maybe is_pinned() const { OF_UNIMPLEMENTED(); } // Getters for autograd Maybe acc_grad() const; @@ -201,6 +202,7 @@ class LazyMirroredTensorImpl final : public MirroredTensorImpl { // but should return real status while stride/view mechanism is ready in lazy-mirrored mode return true; } + Maybe is_pinned() const override { RETURN_ERROR_WITH_BUG_PROMPT(); } // Getters valid only for EagerMirroredTensorImpl Maybe eager_blob_object() const override { RETURN_ERROR_WITH_BUG_PROMPT(); } @@ -229,6 +231,7 @@ class EagerMirroredTensorImpl final : public MirroredTensorImpl { Maybe detach() const override; bool is_lazy() const override { return false; } bool is_contiguous() const override { return tensor_meta_->is_contiguous(); } + Maybe is_pinned() const override; // Getters valid only for EagerMirroredTensorImpl Maybe eager_blob_object() const override { diff --git a/oneflow/core/framework/tensor_meta.cpp 
b/oneflow/core/framework/tensor_meta.cpp index 523077c6aae..ede1e574023 100644 --- a/oneflow/core/framework/tensor_meta.cpp +++ b/oneflow/core/framework/tensor_meta.cpp @@ -72,7 +72,7 @@ bool IsContiguous(const Shape& shape, const Stride& stride) { // https://stackoverflow.com/questions/31681324/identify-contiguous-segments-of-a-non-contiguous-numpy-array if (shape.At(i) == 0) { return true; } if (contig_if_nonempty && shape.At(i) != 1) { - if (stride.At(i) != expected_stride) { contig_if_nonempty = false; } + if (stride.at(i) != expected_stride) { contig_if_nonempty = false; } expected_stride *= shape.At(i); } } diff --git a/oneflow/core/framework/tensor_methods.cpp b/oneflow/core/framework/tensor_methods.cpp index 6f6cf271660..cc7b7aa08dc 100644 --- a/oneflow/core/framework/tensor_methods.cpp +++ b/oneflow/core/framework/tensor_methods.cpp @@ -75,7 +75,7 @@ Maybe BasicView(const std::shared_ptr& input, const Shape& targe auto tensor_impl = std::make_shared( tensor_meta, JUST(input->tensor_storage()), requires_grad, /*is_leaf=*/!requires_grad); - const bool pin_memory = JUST(JUST(input->AsMirroredTensor())->eager_blob_object())->pin_memory(); + const bool pin_memory = JUST(JUST(input->AsMirroredTensor())->is_pinned()); JUST(tensor_impl->InitEagerBlobObject(JUST(blob_object->compute_local_dep_object()), /*pin_memory=*/pin_memory)); @@ -134,7 +134,7 @@ Maybe Slice(const std::shared_ptr& input, const std::vectorAsMirroredTensor())->storage_offset()); for (int i = 0; i < ndim; ++i) { int64_t step = std::min(steps[i], shape->At(i)); @@ -147,20 +147,20 @@ Maybe Slice(const std::shared_ptr& input, const std::vectorAt(i); - storage_offset += start * strides->At(i); + target_strides[i] = step * strides->at(i); + storage_offset += start * strides->at(i); } - auto output = JUST(BasicView(input, Shape(target_dims), Stride(target_strides), storage_offset)); + auto output = JUST(BasicView(input, Shape(target_dims), target_strides, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { + const Shape in_shape = *input->shape(); auto backward_fn = std::make_shared(); backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, bool create_graph) -> Maybe { autograd::AutoGradMode mode(create_graph); CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg) in_grads->resize(1); - (*in_grads)[0] = JUST(functional::SliceGrad( - JUST(VectorAt(out_grads, 0)), Shape(input->shape()->dim_vec()), starts, ends, steps)); + (*in_grads)[0] = JUST(functional::SliceGrad(out_grads[0], in_shape, starts, ends, steps)); return Maybe::Ok(); }; backward_fn->status = []() { return true; }; @@ -177,23 +177,23 @@ Maybe Unsqueeze(const std::shared_ptr& input, const int32_t& exp const auto& ndim = shape->NumAxes(); DimVector target_dim_vec(ndim + 1); - DimVector target_stride_vec(ndim + 1); + Stride target_stride_vec(ndim + 1); { int cnt = 0; for (int i = 0; i < ndim; i++) { if (i == expand_dim) { cnt++; } target_dim_vec[cnt] = shape->At(i); - target_stride_vec[cnt] = strides->At(i); + target_stride_vec[cnt] = strides->at(i); cnt++; } target_dim_vec[expand_dim] = 1; - target_stride_vec[expand_dim] = expand_dim < ndim ? strides->At(expand_dim) : 1; + target_stride_vec[expand_dim] = expand_dim < ndim ? 
strides->at(expand_dim) : 1; } int64_t storage_offset = JUST(JUST(input->AsMirroredTensor())->storage_offset()); std::shared_ptr output = - JUST(BasicView(input, Shape(target_dim_vec), Stride(target_stride_vec), storage_offset)); + JUST(BasicView(input, Shape(target_dim_vec), target_stride_vec, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); @@ -222,14 +222,14 @@ Maybe Squeeze(const std::shared_ptr& input, const int target_ndim = ndim - squeeze_dims.size(); DimVector target_dim_vec(target_ndim); - DimVector target_stride_vec(target_ndim); + Stride target_stride_vec(target_ndim); { int cnt = 0; for (int i = 0; i < ndim; i++) { if (find(squeeze_dims.begin(), squeeze_dims.end(), i) == squeeze_dims.end()) { target_dim_vec[cnt] = shape->At(i); - target_stride_vec[cnt] = strides->At(i); + target_stride_vec[cnt] = strides->at(i); cnt++; } } @@ -237,7 +237,7 @@ Maybe Squeeze(const std::shared_ptr& input, int64_t storage_offset = JUST(JUST(input->AsMirroredTensor())->storage_offset()); std::shared_ptr output = - JUST(BasicView(input, Shape(target_dim_vec), Stride(target_stride_vec), storage_offset)); + JUST(BasicView(input, Shape(target_dim_vec), target_stride_vec, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); @@ -266,13 +266,13 @@ Maybe Expand(const std::shared_ptr& input, const std::vectorAt(ndim - 1 - i); + target_stride_vec[target_ndim - 1 - i] = strides->at(ndim - 1 - i); } else if (in_shape[ndim - 1 - i] == 1) { // TODO (bowen): what if dim is 1, should stride be set to 0? target_dim_vec[target_ndim - 1 - i] = expand_shape[target_ndim - 1 - i]; @@ -286,7 +286,7 @@ Maybe Expand(const std::shared_ptr& input, const std::vectorToString(); } target_dim_vec[target_ndim - 1 - i] = in_shape[ndim - 1 - i]; - target_stride_vec[target_ndim - 1 - i] = strides->At(ndim - 1 - i); + target_stride_vec[target_ndim - 1 - i] = strides->at(ndim - 1 - i); } } else { if (expand_shape[target_ndim - 1 - i] == -1) { @@ -300,7 +300,7 @@ Maybe Expand(const std::shared_ptr& input, const std::vectorAsMirroredTensor())->storage_offset()); std::shared_ptr output = - JUST(BasicView(input, Shape(target_dim_vec), Stride(target_stride_vec), storage_offset)); + JUST(BasicView(input, Shape(target_dim_vec), target_stride_vec, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); @@ -334,13 +334,13 @@ Maybe Narrow(const std::shared_ptr& input, const int64_t& dim, c int64_t storage_offset = JUST(JUST(input->AsMirroredTensor())->storage_offset()); Shape target_shape(dim_vec); - DimVector stride_vec(ndim); + Stride stride(ndim); for (int i = 0; i < ndim; ++i) { - stride_vec[i] = strides->At(i); - if (dim == i) { storage_offset += start * strides->At(i); } + stride[i] = strides->at(i); + if (dim == i) { storage_offset += start * strides->at(i); } } - auto output = JUST(BasicView(input, target_shape, Stride(stride_vec), storage_offset)); + auto output = JUST(BasicView(input, target_shape, stride, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, @@ -363,13 +363,12 @@ Maybe Narrow(const std::shared_ptr& input, const int64_t& dim, c } Maybe AsStrided(const std::shared_ptr& input, const std::vector& size, - const std::vector& stride, const int32_t& storage_offset) { + const 
std::vector& stride_vec, const int32_t& storage_offset) { DimVector dim_vec; dim_vec.insert(dim_vec.end(), size.begin(), size.end()); Shape target_shape(dim_vec); - DimVector stride_vec(stride.size()); - for (int i = 0; i < stride.size(); ++i) { stride_vec[i] = stride[i]; } - auto output = JUST(view::BasicView(input, target_shape, Stride(stride_vec), storage_offset)); + Stride stride(stride_vec.begin(), stride_vec.end()); + auto output = JUST(view::BasicView(input, target_shape, stride, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, @@ -381,7 +380,7 @@ Maybe AsStrided(const std::shared_ptr& input, const std::ve JUST(input->device()), /*pin_memory=*/false)); in_grads->resize(1); (*in_grads)[0] = - JUST(functional::AsStridedGrad(out_grads[0], like, size, stride, storage_offset)); + JUST(functional::AsStridedGrad(out_grads[0], like, size, stride_vec, storage_offset)); return Maybe::Ok(); }; backward_fn->status = []() { return true; }; @@ -404,13 +403,13 @@ Maybe Transpose(const std::shared_ptr& input, const std::vector< for (auto i = 0; i < positive_perm.size(); i++) { JUST(maybe_wrap_dim(positive_perm[i], ndim)); } DimVector target_dims(ndim); - DimVector stride_vec(ndim); + Stride stride(ndim); for (int i = 0; i < ndim; ++i) { target_dims[i] = shape->At(permute[i]); - stride_vec[i] = strides->At(permute[i]); + stride[i] = strides->at(permute[i]); } - auto output = JUST(BasicView(input, Shape(target_dims), Stride(stride_vec), storage_offset)); + auto output = JUST(BasicView(input, Shape(target_dims), stride, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, @@ -451,20 +450,20 @@ Maybe UnfoldTensor(const std::shared_ptr& input, const int32_t& CHECK_GT_OR_RETURN(step, 0) << "attibute step should be > 0, but got " << size; DimVector out_shape(ndim + 1); - DimVector out_stride(ndim + 1); + Stride out_stride(ndim + 1); out_shape[ndim] = size; - out_stride[ndim] = ndim == 0 ? 1 : stride->At(dimension); + out_stride[ndim] = ndim == 0 ? 
1 : stride->at(dimension); for (int64_t d = 0; d < ndim; ++d) { const int64_t in_size_at_d = shape->At(d); if (d == dimension) { out_shape.at(d) = (in_size_at_d - size) / step + 1; - out_stride.at(d) = step * stride->At(d); + out_stride.at(d) = step * stride->at(d); } else { out_shape.at(d) = in_size_at_d; - out_stride.at(d) = stride->At(d); + out_stride.at(d) = stride->at(d); } } - auto output = JUST(BasicView(input, Shape(out_shape), Stride(out_stride), storage_offset)); + auto output = JUST(BasicView(input, Shape(out_shape), out_stride, storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { auto backward_fn = std::make_shared(); @@ -504,24 +503,24 @@ Maybe Diagonal(const std::shared_ptr& input, const int32_t offse if (diag_size == 0) { // skip } else if (offset >= 0) { - storage_offset += offset * stride->At(dim2); + storage_offset += offset * stride->at(dim2); } else { - storage_offset -= offset * stride->At(dim1); + storage_offset -= offset * stride->at(dim1); } CHECK_GE_OR_RETURN(ndim, 2) << "input tensor's ndim should be >= 2, but got " << ndim; // infer output shape and stride DimVector out_shape(shape->dim_vec()); - DimVector out_stride(stride->StrideVec()); + Stride out_stride(*stride); out_shape.erase(out_shape.begin() + std::max(dim1, dim2)); out_stride.erase(out_stride.begin() + std::max(dim1, dim2)); out_shape.erase(out_shape.begin() + std::min(dim1, dim2)); out_stride.erase(out_stride.begin() + std::min(dim1, dim2)); out_shape.emplace_back(diag_size); - out_stride.emplace_back(stride->At(dim1) + stride->At(dim2)); + out_stride.emplace_back(stride->at(dim1) + stride->at(dim2)); // generate view tensor - auto output = JUST(BasicView(input, Shape(out_shape), Stride(out_stride), storage_offset)); + auto output = JUST(BasicView(input, Shape(out_shape), out_stride, storage_offset)); // autograd if (autograd::GradMode::is_enabled() && input->requires_grad()) { std::vector input_index{dim1, dim2}; diff --git a/oneflow/core/framework/user_op_hob.h b/oneflow/core/framework/user_op_hob.h index 2fbba415358..390f81899d1 100644 --- a/oneflow/core/framework/user_op_hob.h +++ b/oneflow/core/framework/user_op_hob.h @@ -46,6 +46,7 @@ ALWAYS_INLINE inline auto HobDataType(const std::string& tensor_name, int tensor return hob::make_custom( string_stream.str(), [tensor_name, tensor_idx](const KernelRegContext& ctx) -> DataType { const user_op::TensorDesc* desc = ctx.TensorDesc4ArgNameAndIndex(tensor_name, tensor_idx); + CHECK(desc != nullptr) << "key `" << tensor_name << "_" << tensor_idx << "` not found."; return desc->data_type(); }); } diff --git a/oneflow/core/framework/user_op_tensor.h b/oneflow/core/framework/user_op_tensor.h index b77f9ec06cb..cce7d5ee5c5 100644 --- a/oneflow/core/framework/user_op_tensor.h +++ b/oneflow/core/framework/user_op_tensor.h @@ -38,8 +38,8 @@ class Tensor { ~Tensor() = default; #pragma GCC diagnostic pop - virtual ShapeView shape() const = 0; - virtual MutShapeView mut_shape() = 0; + virtual ShapeView shape_view() const = 0; + virtual MutShapeView mut_shape_view() = 0; virtual const Stride& stride() const = 0; virtual DataType data_type() const = 0; virtual const MemoryCase& mem_case() const = 0; diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index fe62eb5f858..37c663d676f 100755 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -968,6 +968,11 @@ Double alpha=1.0) => MatMul" bind_python: True +- name: "mv" + signature: + 
"Tensor (Tensor input, Tensor vec) => Mv" + bind_python: True + - name: "fused_mlp" signature: "Tensor (Tensor x, TensorTuple weights, TensorTuple biases, Bool skip_final_activation) => FusedMLP" @@ -1027,11 +1032,11 @@ bind_python: False - name: "nll_loss" - signature: "Tensor(Tensor input, Tensor target, Tensor weight=None, Int64 ignore_index, String reduction) => NllLoss" + signature: "Tensor(Tensor input, Tensor target, Tensor weight=None, Int64 ignore_index, String reduction) => NLLLoss" bind_python: True -- name: "nll_loss_grad" - signature: "Tensor(Tensor dy, Tensor input, Tensor target, Tensor weight=None, Tensor total_target, Int64 ignore_index) => NllLossGrad" +- name: "nll_grad" + signature: "Tensor(Tensor out_grad, Tensor input, Tensor target, Tensor weight=None, Int64 ignore_index) => NLLGrad" bind_python: False - name: "binary_cross_entropy_loss" @@ -1297,14 +1302,6 @@ signature: "Tensor (Tensor x, Int64 start, Int64 end) => SliceView1dContiguous" bind_python: True -- name: "slice" - signature: "Tensor (Tensor x, Int64List start, Int64List stop, Int64List step, Bool enable_view_slice=None) => Slice" - bind_python: True - -- name: "slice_grad" - signature: "Tensor (Tensor dy, Shape like, Int64List start, Int64List stop, Int64List step) => SliceGrad" - bind_python: False - - name: "narrow" signature: "Tensor (Tensor input, Int64 dim, Int64 start, Int64 length) => Narrow" bind_python: True @@ -1313,17 +1310,17 @@ signature: "Tensor (Tensor dy, Tensor like, Int64 dim, Int64 start, Int64 length) => NarrowGrad" bind_python: False -- name: "slice_update" - signature: "Tensor (Tensor x, Tensor update, Int64List start, Int64List stop, Int64List step, *, Bool inplace=False) => SliceUpdate" +- name: "slice" + signature: "Tensor (Tensor x, Int64List start, Int64List stop, Int64List step, Bool enable_view_slice=None) => Slice" bind_python: True -- name: "logical_slice" - signature: "Tensor (Tensor x, Int64List start, Int64List stop, Int64List step, Bool enable_view_slice=None) => LogicalSlice" +- name: "slice_update" + signature: "Tensor (Tensor ref, Tensor value, Int64List start, Int64List stop, Int64List step, Bool inplace=False) => SliceUpdate" bind_python: True -- name: "logical_slice_assign" - signature: "Tensor (Tensor ref, Tensor value, Int64List start, Int64List stop, Int64List step) => LogicalSliceAssign" - bind_python: True +- name: "slice_grad" + signature: "Tensor (Tensor dy, Shape like_shape, Int64List start, Int64List stop, Int64List step) => SliceGrad" + bind_python: False - name: "copy" signature: "Tensor (Tensor x, String device_type, Int64 device_id, Bool pin_memory=False) => Copy" diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index baf634f1679..b0fefaf0fae 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -1227,59 +1227,6 @@ class InplaceToContiguousFunctor { std::shared_ptr assign_op_; }; -class SliceBaseFunctor { - public: - SliceBaseFunctor() = default; - virtual ~SliceBaseFunctor() = default; - Maybe operator()(const std::shared_ptr& x, const std::vector& start, - const std::vector& stop, const std::vector& step, - const Optional& enable_view_slice) const { - if (view::IsViewApplicable(x) && enable_view_slice.value_or(false)) { - return view::Slice(x, start, stop, step); - } - - MutableAttrMap attrs; - JUST(attrs.SetAttr>("start", start)); - JUST(attrs.SetAttr>("stop", stop)); - JUST(attrs.SetAttr>("step", step)); - return 
OpInterpUtil::Dispatch(*op_, {x}, attrs); - } - - protected: - std::shared_ptr op_; -}; - -class SliceGradBaseFunctor { - public: - SliceGradBaseFunctor() = default; - virtual ~SliceGradBaseFunctor() = default; - Maybe operator()(const std::shared_ptr& dy, const Shape& like, - const std::vector& start, const std::vector& stop, - const std::vector& step) const { - MutableAttrMap attrs; - JUST(attrs.SetAttr("like_shape", like)); - JUST(attrs.SetAttr>("start", start)); - JUST(attrs.SetAttr>("stop", stop)); - JUST(attrs.SetAttr>("step", step)); - return OpInterpUtil::Dispatch(*op_, {dy}, attrs); - } - - protected: - std::shared_ptr op_; -}; - -class SliceFunctor : public SliceBaseFunctor { - public: - SliceFunctor() { op_ = CHECK_JUST(one::OpBuilder("slice").Input("x").Output("y").Build()); } -}; - -class SliceGradFunctor : public SliceGradBaseFunctor { - public: - SliceGradFunctor() { - op_ = CHECK_JUST(one::OpBuilder("slice_grad").Input("dy").Output("dx").Build()); - } -}; - class NarrowFunctor { public: NarrowFunctor() { op_ = CHECK_JUST(one::OpBuilder("narrow").Input("in").Output("out").Build()); } @@ -1333,45 +1280,35 @@ class NarrowGradFunctor { std::shared_ptr op_; }; -class LogicalSliceFunctor : public SliceBaseFunctor { +class SliceFunctor { public: - LogicalSliceFunctor() { - op_ = CHECK_JUST(one::OpBuilder("logical_slice").Input("x").Output("y").Build()); - } -}; + SliceFunctor() { op_ = CHECK_JUST(one::OpBuilder("slice").Input("x").Output("y").Build()); } + Maybe operator()(const std::shared_ptr& x, const std::vector& start, + const std::vector& stop, const std::vector& step, + const Optional& enable_view_slice) const { + if (view::IsViewApplicable(x) && enable_view_slice.value_or(false)) { + return view::Slice(x, start, stop, step); + } -class LogicalSliceAssignFunctor { - public: - LogicalSliceAssignFunctor() { - op_ = CHECK_JUST( - one::OpBuilder("logical_slice_assign").Input("ref").Input("value").Output("y").Build()); - } - Maybe operator()(const std::shared_ptr& ref, - const std::shared_ptr& value, - const std::vector& start, const std::vector& stop, - const std::vector& step) const { MutableAttrMap attrs; JUST(attrs.SetAttr>("start", start)); JUST(attrs.SetAttr>("stop", stop)); JUST(attrs.SetAttr>("step", step)); - auto outputs = std::make_shared(1); - JUST(CheckInplaceValid(ref)); - JUST(VectorAt(*outputs, 0)) = ref; - JUST(OpInterpUtil::Dispatch(*op_, {ref, value}, outputs.get(), attrs)); - return JUST(VectorAt(*outputs, 0)); + return OpInterpUtil::Dispatch(*op_, {x}, attrs); } - private: + protected: std::shared_ptr op_; }; class SliceUpdateFunctor { public: SliceUpdateFunctor() { - op_ = CHECK_JUST(one::OpBuilder("slice_update").Input("x").Input("update").Output("y").Build()); + op_ = + CHECK_JUST(one::OpBuilder("slice_update").Input("ref").Input("value").Output("y").Build()); } - Maybe operator()(const std::shared_ptr& x, - const std::shared_ptr& update, + Maybe operator()(const std::shared_ptr& ref, + const std::shared_ptr& value, const std::vector& start, const std::vector& stop, const std::vector& step, bool inplace) const { MutableAttrMap attrs; @@ -1380,13 +1317,13 @@ class SliceUpdateFunctor { JUST(attrs.SetAttr>("step", step)); if (inplace) { - JUST(CheckInplaceValid(x)); auto outputs = std::make_shared(1); - (*outputs)[0] = x; - JUST(OpInterpUtil::Dispatch(*op_, {x, update}, outputs.get(), attrs)); - return outputs->at(0); + JUST(CheckInplaceValid(ref)); + JUST(VectorAt(*outputs, 0)) = ref; + JUST(OpInterpUtil::Dispatch(*op_, {ref, value}, outputs.get(), attrs)); 
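+      // `(*outputs)[0]` was pre-bound to `ref` above, so the dispatched slice_update writes into
+      // `ref` and the value returned below is the tensor that was updated in place.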
+ return JUST(VectorAt(*outputs, 0)); } else { - return OpInterpUtil::Dispatch(*op_, {x, update}, attrs); + return OpInterpUtil::Dispatch(*op_, {ref, value}, attrs); } } @@ -1394,6 +1331,26 @@ class SliceUpdateFunctor { std::shared_ptr op_; }; +class SliceGradFunctor { + public: + SliceGradFunctor() { + op_ = CHECK_JUST(one::OpBuilder("slice_grad").Input("dy").Output("dx").Build()); + } + Maybe operator()(const std::shared_ptr& dy, const Shape& like_shape, + const std::vector& start, const std::vector& stop, + const std::vector& step) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("like_shape", like_shape)); + JUST(attrs.SetAttr>("start", start)); + JUST(attrs.SetAttr>("stop", stop)); + JUST(attrs.SetAttr>("step", step)); + return OpInterpUtil::Dispatch(*op_, {dy}, attrs); + } + + protected: + std::shared_ptr op_; +}; + class UpsampleGradFunctor { public: UpsampleGradFunctor() { @@ -2030,7 +1987,7 @@ class TensorGetItemFunctor { if (is_identity) { result = expand_input; } else { - result = JUST(Slice(expand_input, start, end, step, /*enable_view_slice=*/false)); + result = JUST(Slice(expand_input, start, end, step, /*enable_view_slice=*/true)); } Shape shape(DimVector(target_dims.begin(), target_dims.end())); @@ -2133,17 +2090,7 @@ class TensorSetItemFunctor { if (slice_shape != *(value_tensor->shape())) { value_tensor = JUST(Reshape(value_tensor, slice_shape)); } - bool requires_grad = - (x->requires_grad() || value_tensor->requires_grad()) && autograd::GradMode::is_enabled(); - if (x->is_local()) { - if (requires_grad) { - JUST(SliceUpdate(x, value_tensor, start, end, step, /*inplace=*/true)); - } else { - JUST(LogicalSliceAssign(x, value_tensor, start, end, step)); - } - } else { - JUST(LogicalSliceAssign(x, value_tensor, start, end, step)); - } + JUST(SliceUpdate(x, value_tensor, start, end, step, /*inplace=*/true)); } return Maybe::Ok(); } @@ -2998,20 +2945,8 @@ class RepeatInterLeaveTensorFunctor { std::shared_ptr cumsum = JUST(Cumsum(repeats, 0, DType::Int32())); const int64_t& output_size_value = std::accumulate(repeats_value.begin(), repeats_value.end(), 0); - std::shared_ptr res; - if (output_size_value > 0) { - res = JUST(IndexSelect(input, dim_, - JUST(RepeatInterLeaveIndex(repeats, cumsum, output_size_value)))); - } else { - // Deal with 0-size Tensor. 
- DimVector new_input_shape(input_shape->dim_vec().begin(), input_shape->dim_vec().end()); - new_input_shape[dim_] = 0; - std::shared_ptr new_input = - JUST(Constant(Shape{new_input_shape}, Scalar(0), input->dtype(), JUST(input->device()))); - res = JUST(IndexSelect(new_input, dim_, - JUST(RepeatInterLeaveIndex(repeats, cumsum, output_size_value)))); - } - return res; + return JUST( + IndexSelect(input, dim_, JUST(RepeatInterLeaveIndex(repeats, cumsum, output_size_value)))); } }; @@ -3077,14 +3012,15 @@ class ReshapeLikeFunctor { class PinMemoryFunctor { public: PinMemoryFunctor() { - op_ = CHECK_JUST(one::OpBuilder("slice_update").Input("x").Input("update").Output("y").Build()); + op_ = + CHECK_JUST(one::OpBuilder("slice_update").Input("ref").Input("value").Output("y").Build()); } Maybe operator()(const std::shared_ptr& input) const { // TODO:(zhaoluyang) support consistent tensor.pin_memory() CHECK_OR_RETURN(input->is_local() && !(LazyMode::is_enabled())) << Error::RuntimeError() << "Tensor.pin_memory() only support local tensor for now!"; // if tensor already pinned, then just return - if (JUST(JUST(input->AsMirroredTensor())->eager_blob_object())->pin_memory()) { return input; } + if (JUST(JUST(input->AsMirroredTensor())->is_pinned())) { return input; } auto shape = input->shape(); auto device = JUST(input->device()); const bool requires_grad = input->requires_grad(); @@ -3162,13 +3098,11 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("View"); m.add_functor("ToContiguous"); m.add_functor("InplaceToContiguous"); - m.add_functor("Slice"); - m.add_functor("SliceGrad"); m.add_functor("Narrow"); m.add_functor("NarrowGrad"); - m.add_functor("LogicalSliceAssign"); - m.add_functor("LogicalSlice"); m.add_functor("SliceUpdate"); + m.add_functor("Slice"); + m.add_functor("SliceGrad"); m.add_functor("SliceView1dContiguous"); m.add_functor("Copy"); m.add_functor("Flip"); diff --git a/oneflow/core/functional/impl/common.cpp b/oneflow/core/functional/impl/common.cpp index 11cf67a2ab9..79ddc6bad62 100644 --- a/oneflow/core/functional/impl/common.cpp +++ b/oneflow/core/functional/impl/common.cpp @@ -95,22 +95,20 @@ Optional ComputeStride(const Shape& shape, const Stride& stride, * Description: in some case, view operate is not allowed, so need to check it's validation, * the check refers to torch(aten/src/ATen/native/TensorShape.cpp) *************************************************/ - if (stride.NumAxes() == 0) { + if (stride.size() == 0) { // for scalar input tensor - DimVector newstride(target_shape.NumAxes(), 1); - return Stride(newstride); + return Stride(target_shape.NumAxes(), 1); } int64_t elem_count = shape.elem_cnt(); int64_t ndim = shape.NumAxes(); int64_t tgt_ndim = target_shape.NumAxes(); DimVector shape_vec = shape.dim_vec(); DimVector tgt_shape_vec = target_shape.dim_vec(); - DimVector stride_vec = stride.StrideVec(); if (elem_count == 0) { return NullOpt; } int64_t view_d = tgt_ndim - 1; - int64_t chunk_base_stride = stride_vec.back(); - DimVector newstride(tgt_ndim); + int64_t chunk_base_stride = stride.back(); + Stride target_stride(tgt_ndim); // stride for each subspace in the chunk // numel in current chunk int64_t tensor_numel = 1; @@ -120,22 +118,21 @@ Optional ComputeStride(const Shape& shape, const Stride& stride, // if end of tensor size chunk, check view if ((tensor_d == 0) || (shape_vec[tensor_d - 1] != 1 - && stride_vec[tensor_d - 1] != tensor_numel * chunk_base_stride)) { + && stride[tensor_d - 1] != tensor_numel * chunk_base_stride)) { while (view_d >= 0 && (view_numel < 
tensor_numel || tgt_shape_vec[view_d] == 1)) {
-        newstride[view_d] = view_numel * chunk_base_stride;
+        target_stride[view_d] = view_numel * chunk_base_stride;
         view_numel *= tgt_shape_vec[view_d];
         view_d--;
       }
       if (view_numel != tensor_numel) { return NullOpt; }
       if (tensor_d > 0) {
-        chunk_base_stride = stride_vec[tensor_d - 1];
+        chunk_base_stride = stride[tensor_d - 1];
         tensor_numel = 1;
         view_numel = 1;
       }
     }
   }
   if (view_d != -1) { return NullOpt; }
-  Stride target_stride(newstride);
   return target_stride;
 }
diff --git a/oneflow/core/functional/impl/consistent_cast.cpp b/oneflow/core/functional/impl/consistent_cast.cpp
index a01ef9d93a5..2af4f1b0d0e 100644
--- a/oneflow/core/functional/impl/consistent_cast.cpp
+++ b/oneflow/core/functional/impl/consistent_cast.cpp
@@ -242,7 +242,7 @@ Maybe GetConcatenatedShapeAndCheckDtype(
     if (nd_sbp->sbp_parallel(i).has_split_parallel()) {
       int64_t concat_axis = nd_sbp->sbp_parallel(i).split_parallel().axis();
       int64_t group_size = parallel_hierarchy->Count(0, i);
-      int64_t stride = parallel_stride.At(i);
+      int64_t stride = parallel_stride.at(i);
       for (int group_id = 0; group_id < group_size; ++group_id) {
         int64_t parallel_num_in_group = parallel_hierarchy->At(i);
         for (int64_t stride_id = 0; stride_id < stride; ++stride_id) {
@@ -470,7 +470,7 @@ class LocalToConsistentFunctor {
     CHECK_OR_RETURN(x->is_local())
         << Error::RuntimeError()
         << "Expected local tensor for local_to_global but got global tensor!";
-    std::shared_ptr input = x;
+    std::shared_ptr input = x->contiguous();
     // copy to right device first if input's device type is wrong
     if (JUST(input->device())->type() != parallel_desc->device_tag()) {
       VLOG(2) << "The device_type of the input tensor is different from placement, now copy it to "
diff --git a/oneflow/core/functional/impl/math_functor.cpp b/oneflow/core/functional/impl/math_functor.cpp
index ab22624934a..71da3290b01 100644
--- a/oneflow/core/functional/impl/math_functor.cpp
+++ b/oneflow/core/functional/impl/math_functor.cpp
@@ -119,9 +119,16 @@ class ScalarMathBaseFunctor {
     if (inplace) {
       JUST(CheckInplaceCastValid(x, casted_vec[0]));
       JUST(CheckInplaceValid(x));
+
       std::shared_ptr outputs = std::make_shared(1);
-      outputs->at(0) = x;
-      JUST(OpInterpUtil::Dispatch(*op_, {x}, outputs.get(), attrs));
+      (*outputs)[0] = x;
+      // TODO:(zhaoluyang)
+      // If the op needs an inplace operation and the input tensor is non-contiguous,
+      // the interpreter will do an input->contiguous() operation to get the correct result,
+      // so the output tensor and the input will not actually be updated in place. When the
+      // scalar_math op/kernel supports strided tensors as input, the problem above will be solved!
+      JUST(OpInterpUtil::Dispatch(*op_, {x}, outputs.get(),
+                                  OpExprInterpContext(attrs, /*inplace=*/true)));
       return outputs->at(0);
     } else {
       return OpInterpUtil::Dispatch(*op_, casted_vec, attrs);
@@ -1654,7 +1661,7 @@ class SelectFunctor {
     int32_t pos_index = index >= 0 ?
index : index + size; std::vector sizes(input->shape()->dim_vec().begin(), input->shape()->dim_vec().end()); - const auto& stride = JUST(input->stride())->StrideVec(); + const auto& stride = *JUST(input->stride()); std::vector strides(stride.begin(), stride.end()); auto storage_offset = JUST(input->storage_offset()) + pos_index * strides[pos_dim]; @@ -2130,7 +2137,7 @@ class TensorSplitVecFunctor { output[i] = JUST(Slice(input, start, stop, step, /*enable_view_slice=*/false)); start[pos_dim] = end_idx; } - stop[pos_dim] = input->shape()->At(ndim - 1); + stop[pos_dim] = input->shape()->At(pos_dim); output[num_indices] = JUST(Slice(input, start, stop, step, /*enable_view_slice=*/false)); return output; diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index a453cdb4dfe..fcb86c707cc 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -1099,23 +1099,25 @@ class BinaryCrossEntropyWithLogitsLossFunctor : public LossFunctorBase { std::shared_ptr op_weight_pos_; }; -class NllLossFunctor { +class NLLLossFunctor { public: - NllLossFunctor() { + NLLLossFunctor() { op_ = CHECK_JUST(one::OpBuilder("nll") .Input("input") .Input("target") - .Output("out") - .Output("total_weight") + .Output("output") + .Output("out_weight") .Build()); + op_weight_ = CHECK_JUST(one::OpBuilder("nll") .Input("input") .Input("target") .Input("weight") - .Output("out") - .Output("total_weight") + .Output("output") + .Output("out_weight") .Build()); } + Maybe operator()(const std::shared_ptr& input, const std::shared_ptr& target, const Optional& weight, const int64_t& ignore_index, @@ -1124,42 +1126,65 @@ class NllLossFunctor { << Error::RuntimeError() << "Reduction should be none, sum or mean."; const auto& input_shape = input->shape(); + const int64_t K = input_shape->NumAxes(); + CHECK_GE_OR_RETURN(K, 2) << Error::RuntimeError() << "Expected 2 or more dimensions"; + const int64_t N = input_shape->At(0); + const int64_t C = input_shape->At(1); + const auto& target_shape = target->shape(); - CHECK_LE_OR_RETURN(input_shape->NumAxes(), 5) - << Error::RuntimeError() << "The number of input's axis should be less equal to 5. "; - CHECK_EQ_OR_RETURN(input_shape->NumAxes() - 1, target_shape->NumAxes()) - << Error::RuntimeError() - << "The number of input's axis should be equal to the number of target's axis - 1. 
"; + CHECK_EQ_OR_RETURN(target_shape->NumAxes(), K - 1) + << Error::RuntimeError() << "Expected target dimensions (" << K - 1 + << ") to match input dimensions (" << K << "), got " << target_shape->NumAxes(); + CHECK_EQ_OR_RETURN(target_shape->At(0), N) + << Error::RuntimeError() << "Expected input batch_size (" << N + << ") to match target batch_size (" << target_shape->At(0) << ")"; + + std::shared_ptr input_; + std::shared_ptr target_; + if (K > 2) { + DimVector idea_target_dim_vec; + idea_target_dim_vec.push_back(N); + for (int64_t i = 2; i < K; ++i) { idea_target_dim_vec.push_back(input_shape->At(i)); } + Shape idea_target_shape(idea_target_dim_vec); + CHECK_EQ_OR_RETURN(*target_shape, idea_target_shape) + << Error::RuntimeError() << "Expected target shape " << idea_target_shape.ToString() + << ", got " << target_shape->ToString(); + + std::vector perm(input_shape->dim_vec().size(), 0); + perm[perm.size() - 1] = 1; + for (size_t i = 1; i < perm.size() - 1; ++i) { perm[i] = i + 1; } + + input_ = JUST(sequence_function(functional::Transpose) + .then(std::bind(functional::Reshape, std::placeholders::_1, Shape({-1, C}))) + .call(input, perm)); + target_ = JUST(functional::Flatten(target, 0, K - 2)); + } else { + input_ = input; + target_ = target; + } MutableAttrMap attrs; JUST(attrs.SetAttr("ignore_index", ignore_index)); - std::vector input_perm(input_shape->dim_vec().size(), 0); - input_perm[input_perm.size() - 1] = 1; - for (size_t i = 1; i < input_perm.size() - 1; ++i) { input_perm[i] = i + 1; } - - const auto input_ = JUST(sequence_function(functional::Transpose) - .then(std::bind(functional::Reshape, std::placeholders::_1, - Shape({-1, input_shape->At(1)}))) - .call(input, input_perm)); - auto target_ = JUST(functional::Flatten(target, 0, target_shape->NumAxes() - 1)); - - std::shared_ptr kernel_result; - std::shared_ptr result; + std::shared_ptr nll_result; if (weight) { - kernel_result = JUST( + nll_result = JUST( OpInterpUtil::Dispatch(*op_weight_, {input_, target_, JUST(weight)}, attrs)); } else { - kernel_result = JUST(OpInterpUtil::Dispatch(*op_, {input_, target_}, attrs)); + nll_result = JUST(OpInterpUtil::Dispatch(*op_, {input_, target_}, attrs)); } - result = JUST(functional::Reshape(kernel_result->at(0), *target_shape)); - if (reduction == "none") { return result; } + auto output = JUST(VectorAt(*nll_result, 0)); + + if (K > 2) { output = JUST(functional::Reshape(output, *target_shape)); } - result = JUST(functional::ReduceSum(result, {}, false)); + if (reduction == "none") { return output; } - if (reduction == "sum") { return result; } + auto sum = JUST(functional::ReduceSum(output, {}, false)); - return functional::Div(result, kernel_result->at(1)); + if (reduction == "sum") { return sum; } + + auto total_weight = JUST(functional::ReduceSum(JUST(VectorAt(*nll_result, 1)), {}, false)); + return functional::Div(sum, total_weight); } private: @@ -1171,18 +1196,20 @@ class CrossEntropyFunctor { public: CrossEntropyFunctor() { op_log_softmax_ = CHECK_JUST(one::OpBuilder("log_softmax").Input("in").Output("prob").Build()); + op_nll_ = CHECK_JUST(one::OpBuilder("nll") .Input("input") .Input("target") - .Output("out") - .Output("total_weight") + .Output("output") + .Output("out_weight") .Build()); + op_nll_weight_ = CHECK_JUST(one::OpBuilder("nll") .Input("input") .Input("target") .Input("weight") - .Output("out") - .Output("total_weight") + .Output("output") + .Output("out_weight") .Build()); } Maybe operator()(const std::shared_ptr& input, @@ -1193,8 +1220,6 @@ class 
CrossEntropyFunctor { << Error::RuntimeError() << "Reduction should be none, sum or mean."; const auto& input_shape = input->shape(); const auto& target_shape = target->shape(); - MutableAttrMap attrs; - JUST(attrs.SetAttr("ignore_index", ignore_index)); std::vector input_perm(input_shape->dim_vec().size(), 0); input_perm[input_perm.size() - 1] = 1; @@ -1210,21 +1235,26 @@ class CrossEntropyFunctor { const auto target_ = JUST(functional::Flatten(target, 0, target->shape()->NumAxes() - 1)); - std::shared_ptr kernel_result; - std::shared_ptr result; + MutableAttrMap attrs; + JUST(attrs.SetAttr("ignore_index", ignore_index)); + + std::shared_ptr nll_result; if (weight) { - kernel_result = JUST(OpInterpUtil::Dispatch( + nll_result = JUST(OpInterpUtil::Dispatch( *op_nll_weight_, {input_, target_, JUST(weight)}, attrs)); } else { - kernel_result = JUST(OpInterpUtil::Dispatch(*op_nll_, {input_, target_}, attrs)); + nll_result = JUST(OpInterpUtil::Dispatch(*op_nll_, {input_, target_}, attrs)); } - result = JUST(functional::Reshape((*kernel_result)[0], *target_shape)); - if (reduction == "none") { return result; } - result = JUST(functional::ReduceSum(result, {}, false)); - if (reduction == "sum") { return result; } + auto output = JUST(VectorAt(*nll_result, 0)); + output = JUST(functional::Reshape(output, *target_shape)); + if (reduction == "none") { return output; } + + auto sum = JUST(functional::ReduceSum(output, {}, false)); + if (reduction == "sum") { return sum; } - return functional::Div(result, kernel_result->at(1)); + auto total_weight = JUST(functional::ReduceSum(JUST(VectorAt(*nll_result, 1)), {}, false)); + return functional::Div(sum, total_weight); } private: @@ -3310,6 +3340,29 @@ class RocAucScoreFunctor { std::shared_ptr op_; }; +class MvFunctor { + public: + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& vec) const { + const auto& input_shape = input->shape(); + const auto& vec_shape = vec->shape(); + CHECK_OR_RETURN(input_shape->NumAxes() == 2 && vec_shape->NumAxes() == 1) + << Error::RuntimeError() << "vector + matrix @ vector expected, got " + << "1, " << input_shape->NumAxes() << ", " << vec_shape->NumAxes(); + CHECK_EQ_OR_RETURN(input_shape->at(1), vec_shape->at(0)) + << Error::RuntimeError() << "size mismatch, got " << std::to_string(input_shape->at(0)) + << ", " << std::to_string(input_shape->at(0)) << "x" << std::to_string(input_shape->at(1)) + << ", " << std::to_string(vec_shape->at(0)); + // TODO(zhongshsh): speedup + const std::shared_ptr reshape_vec = + JUST(Reshape(vec, Shape(DimVector{vec_shape->at(0), 1}))); + std::shared_ptr out = JUST(MatMul(input, reshape_vec, false, false, 1.0)); + std::shared_ptr reshape_out = JUST(Squeeze( + JUST(Reshape(out, Shape(DimVector{1, input_shape->at(0)}))), std::vector({0}))); + return reshape_out; + } +}; + } // namespace impl ONEFLOW_FUNCTION_LIBRARY(m) { @@ -3323,6 +3376,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("EmbeddingReNorm"); m.add_functor("Embedding"); m.add_functor("MatMul"); + m.add_functor("Mv"); m.add_functor("BatchMatMul"); m.add_functor("TensorDot"); m.add_functor("TensorDotIntDims"); @@ -3340,7 +3394,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("L1Loss"); m.add_functor("MseLoss"); m.add_functor("KLDivLoss"); - m.add_functor("NllLoss"); + m.add_functor("NLLLoss"); m.add_functor("BinaryCrossEntropyLoss"); m.add_functor("BinaryCrossEntropyWithLogitsLoss"); m.add_functor("SparseCrossEntropy"); diff --git a/oneflow/core/functional/impl/nn_grad_functor.cpp 
b/oneflow/core/functional/impl/nn_grad_functor.cpp index 8e43b83ddb1..5689710ac2b 100644 --- a/oneflow/core/functional/impl/nn_grad_functor.cpp +++ b/oneflow/core/functional/impl/nn_grad_functor.cpp @@ -363,39 +363,37 @@ class KLDivLossGradFunctor { std::shared_ptr op_; }; -class NllLossGradFunctor { +class NLLGradFunctor { public: - NllLossGradFunctor() { + NLLGradFunctor() { op_ = CHECK_JUST(one::OpBuilder("nll_grad") + .Input("out_grad") .Input("input") .Input("target") - .Input("total_weight") - .Input("dy") - .Output("dx") + .Output("in_grad") .Build()); + op_weight_ = CHECK_JUST(one::OpBuilder("nll_grad") + .Input("out_grad") .Input("input") .Input("target") - .Input("total_weight") .Input("weight") - .Input("dy") - .Output("dx") + .Output("in_grad") .Build()); } - Maybe operator()(const std::shared_ptr& dy, + + Maybe operator()(const std::shared_ptr& out_grad, const std::shared_ptr& input, const std::shared_ptr& target, - const Optional& weight, - const std::shared_ptr& total_weight, - const int64_t ignore_index) const { + const Optional& weight, const int64_t ignore_index) const { MutableAttrMap attrs; JUST(attrs.SetAttr("ignore_index", ignore_index)); if (weight) { - return OpInterpUtil::Dispatch( - *op_weight_, {input, target, total_weight, JUST(weight), dy}, attrs); + return OpInterpUtil::Dispatch(*op_weight_, + {out_grad, input, target, JUST(weight)}, attrs); } else { - return OpInterpUtil::Dispatch(*op_, {input, target, total_weight, dy}, attrs); + return OpInterpUtil::Dispatch(*op_, {out_grad, input, target}, attrs); } } @@ -1120,7 +1118,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("TFPoolNdGrad"); m.add_functor("AdaptivePoolNdGrad"); m.add_functor("KLDivLossGrad"); - m.add_functor("NllLossGrad"); + m.add_functor("NLLGrad"); m.add_functor("BinaryCrossEntropyLossGrad"); m.add_functor( "BinaryCrossEntropyWithLogitsLossGrad"); diff --git a/oneflow/core/functional/tensor_index.cpp b/oneflow/core/functional/tensor_index.cpp index c4f4a81ddf7..b73564164ab 100644 --- a/oneflow/core/functional/tensor_index.cpp +++ b/oneflow/core/functional/tensor_index.cpp @@ -75,8 +75,8 @@ Maybe ExpandMaskIndex(const std::shared_ptr& index) { JUST(SyncAccessTensorWithTimeOut(size_tensor, callback, "const")); for (int i = 0; i < index->ndim(); ++i) { - auto item = JUST( - functional::Slice((*res)[0], {0, i}, {size, i + 1}, {1, 1}, /*enable_view_slice=*/false)); + auto item = JUST(functional::Slice((*res)[0], {0, i}, {size, i + 1}, {1, 1}, + /*enable_view_slice=*/false)); item = JUST(functional::Reshape(item, {size})); indices->emplace_back(item); } @@ -377,7 +377,7 @@ Maybe ApplySelectIndexing(const std::shared_ptr& input, int32_t pos_index = index >= 0 ? index : index + size; std::vector sizes(input->shape()->dim_vec().begin() + 1, input->shape()->dim_vec().end()); - const auto& stride = JUST(input->stride())->StrideVec(); + const auto& stride = *JUST(input->stride()); const int32_t storage_offset = JUST(input->storage_offset()) + pos_index * stride[pos_dim]; std::vector strides(stride.begin() + 1, stride.end()); diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index 0845a1b5b02..7205cf2217f 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -25,6 +25,7 @@ limitations under the License. 
#include "oneflow/core/graph/boxing/b21_sub_task_graph_builder.h" #include "oneflow/core/graph/boxing/one_to_one_sub_task_graph_builder.h" #include "oneflow/core/graph/boxing/sub_task_graph_builder_util.h" +#include "oneflow/core/framework/sbp_infer_util.h" #include "oneflow/core/job/sbp_parallel.h" #include "oneflow/core/graph/nccl_send_recv_boxing_task_node.h" #include "oneflow/core/job/nd_sbp_util.h" @@ -34,78 +35,6 @@ namespace oneflow { namespace { -void ParallelDimReduce(const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, - ParallelDesc* reduced_parallel_desc, NdSbp* reduced_nd_sbp) { - const auto& hierarchy = parallel_desc.hierarchy(); - DimVector reduced_hierarchy; - FOR_RANGE(int64_t, i, 0, hierarchy->NumAxes()) { - if (hierarchy->At(i) != 1) { - if (reduced_nd_sbp->sbp_parallel().empty() - || (nd_sbp.sbp_parallel(i) - != reduced_nd_sbp->sbp_parallel(reduced_nd_sbp->sbp_parallel_size() - 1))) { - reduced_hierarchy.emplace_back(hierarchy->At(i)); - *reduced_nd_sbp->add_sbp_parallel() = nd_sbp.sbp_parallel(i); - } else { - reduced_hierarchy.back() *= hierarchy->At(i); - } - } - } - if (reduced_hierarchy.empty()) { - reduced_hierarchy.emplace_back(hierarchy->At(0)); - *reduced_nd_sbp->add_sbp_parallel() = nd_sbp.sbp_parallel(0); - } - ParallelConf reduced_parallel_conf = parallel_desc.parallel_conf(); - Shape(reduced_hierarchy).ToProto(reduced_parallel_conf.mutable_hierarchy()); - *reduced_parallel_desc = ParallelDesc(reduced_parallel_conf); -} - -void CollaborativeParallelDimReduce(const ParallelDesc& in_parallel_desc, - const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, - const NdSbp& out_nd_sbp, ParallelDesc* reduced_in_parallel_desc, - ParallelDesc* reduced_out_parallel_desc, - NdSbp* reduced_in_nd_sbp, NdSbp* reduced_out_nd_sbp) { - const auto& in_hierarchy = in_parallel_desc.hierarchy(); - const auto& out_hierarchy = out_parallel_desc.hierarchy(); - CHECK_EQ(in_hierarchy->NumAxes(), out_hierarchy->NumAxes()); - - DimVector reduced_in_hierarchy; - DimVector reduced_out_hierarchy; - FOR_RANGE(int64_t, i, 0, in_hierarchy->NumAxes()) { - if (in_hierarchy->At(i) != 1 || out_hierarchy->At(i) != 1) { - if (reduced_in_nd_sbp->sbp_parallel().empty() - || (in_nd_sbp.sbp_parallel(i) - != reduced_in_nd_sbp->sbp_parallel(reduced_in_nd_sbp->sbp_parallel_size() - 1) - || out_nd_sbp.sbp_parallel(i) - != reduced_out_nd_sbp->sbp_parallel(reduced_out_nd_sbp->sbp_parallel_size() - - 1))) { - reduced_in_hierarchy.emplace_back(in_hierarchy->At(i)); - *reduced_in_nd_sbp->add_sbp_parallel() = in_nd_sbp.sbp_parallel(i); - - reduced_out_hierarchy.emplace_back(out_hierarchy->At(i)); - *reduced_out_nd_sbp->add_sbp_parallel() = out_nd_sbp.sbp_parallel(i); - } else { - reduced_in_hierarchy.back() *= in_hierarchy->At(i); - reduced_out_hierarchy.back() *= out_hierarchy->At(i); - } - } - } - if (reduced_in_hierarchy.empty()) { - reduced_in_hierarchy.emplace_back(in_hierarchy->At(0)); - *reduced_in_nd_sbp->add_sbp_parallel() = in_nd_sbp.sbp_parallel(0); - - reduced_out_hierarchy.emplace_back(out_hierarchy->At(0)); - *reduced_out_nd_sbp->add_sbp_parallel() = out_nd_sbp.sbp_parallel(0); - } - - ParallelConf reduced_in_parallel_conf = in_parallel_desc.parallel_conf(); - Shape(reduced_in_hierarchy).ToProto(reduced_in_parallel_conf.mutable_hierarchy()); - *reduced_in_parallel_desc = ParallelDesc(reduced_in_parallel_conf); - - ParallelConf reduced_out_parallel_conf = out_parallel_desc.parallel_conf(); - Shape(reduced_out_hierarchy).ToProto(reduced_out_parallel_conf.mutable_hierarchy()); - 
*reduced_out_parallel_desc = ParallelDesc(reduced_out_parallel_conf); -} - std::shared_ptr Make1DSubTskGphBuilder() { std::vector> builders; builders.emplace_back(new OneToOneSubTskGphBuilder()); @@ -143,28 +72,6 @@ void MergeParallelConf(const ParallelDesc& parallel_desc_0, const ParallelDesc& } // namespace -void InOutParallelDimReduce(const ParallelDesc& in_parallel_desc, - const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, - const NdSbp& out_nd_sbp, ParallelDesc* reduced_in_parallel_desc, - ParallelDesc* reduced_out_parallel_desc, NdSbp* reduced_in_nd_sbp, - NdSbp* reduced_out_nd_sbp) { - const int64_t in_hierarchy_axes = in_parallel_desc.hierarchy()->NumAxes(); - const int64_t out_hierarchy_axes = out_parallel_desc.hierarchy()->NumAxes(); - if (in_hierarchy_axes == 1 && out_hierarchy_axes == 1) { - *reduced_in_parallel_desc = in_parallel_desc; - *reduced_out_parallel_desc = out_parallel_desc; - *reduced_in_nd_sbp = in_nd_sbp; - *reduced_out_nd_sbp = out_nd_sbp; - } else if (in_hierarchy_axes != out_hierarchy_axes) { - ParallelDimReduce(in_parallel_desc, in_nd_sbp, reduced_in_parallel_desc, reduced_in_nd_sbp); - ParallelDimReduce(out_parallel_desc, out_nd_sbp, reduced_out_parallel_desc, reduced_out_nd_sbp); - } else { - CollaborativeParallelDimReduce(in_parallel_desc, out_parallel_desc, in_nd_sbp, out_nd_sbp, - reduced_in_parallel_desc, reduced_out_parallel_desc, - reduced_in_nd_sbp, reduced_out_nd_sbp); - } -} - class FlatSubTskGphBuilder final : public HierarchicalSubTskGphBuilder { public: OF_DISALLOW_COPY_AND_MOVE(FlatSubTskGphBuilder); diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h index d3fffe33baf..e57323d3d0c 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h @@ -41,12 +41,6 @@ class DispatchHierarchicalSubTskGphBuilder final : public HierarchicalSubTskGphB std::unique_ptr impl_; }; -void InOutParallelDimReduce(const ParallelDesc& in_parallel_desc, - const ParallelDesc& out_parallel_desc, const NdSbp& in_nd_sbp, - const NdSbp& out_nd_sbp, ParallelDesc* reduced_in_parallel_desc, - ParallelDesc* reduced_out_parallel_desc, NdSbp* reduced_in_nd_sbp, - NdSbp* reduced_out_nd_sbp); - } // namespace oneflow #endif // ONEFLOW_CORE_GRAPH_BOXING_HIERARCHICAL_SUB_TASK_GRAPH_BUILDER_IMPL_H_ diff --git a/oneflow/core/graph/graph.h b/oneflow/core/graph/graph.h index a72f728c1d8..b9f62e01696 100644 --- a/oneflow/core/graph/graph.h +++ b/oneflow/core/graph/graph.h @@ -34,7 +34,13 @@ class Graph { // For Each void ForEachNode(std::function NodeHandler) const; Maybe MaybeForEachNode(std::function(NodeType*)> NodeHandler) const; + // In case you want to change the topological structure during the node handler. + // For example, adding/deleting a node or an edge. + // Still, it might have bugs even if you use TopoForEachNodeDynamic. 
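  // Editor's note (added annotation, not part of the original patch): the "Dynamic" family
  // keeps the previous starts-based traversal (renamed to TopoForEachNodeDynamicWithErrorCaptured
  // below), while the plain TopoForEachNode overloads are rerouted to new implementations added
  // further down in this file. A minimal usage sketch, assuming a populated Graph<MyNode, MyEdge>
  // g, where MyNode/MyEdge and PruneIfUseless are placeholders, not names from this patch:
  //
  //   g.TopoForEachNodeDynamic([](MyNode* node) { PruneIfUseless(node); });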
+ void TopoForEachNodeDynamic(std::function NodeHandler) const; void TopoForEachNode(std::function NodeHandler) const; + Maybe TopoForEachNodeDynamicWithErrorCaptured( + std::function(NodeType*)> NodeHandler) const; Maybe TopoForEachNodeWithErrorCaptured( std::function(NodeType*)> NodeHandler) const; void ReverseTopoForEachNode(std::function NodeHandler) const; @@ -53,18 +59,40 @@ class Graph { const std::function&)>& ForEachNext, const std::function& Handler) const; + void TopoForEachNodeDynamic( + const std::list& starts, + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function& Handler) const; + void TopoForEachNode( const std::list& starts, const std::function&)>& ForEachInNode, const std::function&)>& ForEachOutNode, const std::function& Handler) const; + void TopoForEachNode( + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function& Handler) const; + + Maybe TopoForEachNodeDynamicWithErrorCaptured( + const std::list& starts, + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function(NodeType*)>& Handler) const; + Maybe TopoForEachNodeWithErrorCaptured( const std::list& starts, const std::function&)>& ForEachInNode, const std::function&)>& ForEachOutNode, const std::function(NodeType*)>& Handler) const; + Maybe TopoForEachNodeWithErrorCaptured( + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function(NodeType*)>& Handler) const; + void DfsTopoForEachNode( const std::list& starts, const std::function&)>& ForEachInNode, @@ -211,16 +239,33 @@ NodeType* Graph::SoleSinkNode() const { return sink_nodes_list.front(); } +template +void Graph::TopoForEachNodeDynamic( + std::function NodeHandler) const { + TopoForEachNodeDynamic(source_nodes(), &NodeType::ForEachNodeOnInEdge, + &NodeType::ForEachNodeOnOutEdge, NodeHandler); +} + template void Graph::TopoForEachNode(std::function NodeHandler) const { - TopoForEachNode(source_nodes(), &NodeType::ForEachNodeOnInEdge, &NodeType::ForEachNodeOnOutEdge, - NodeHandler); + CHECK_JUST(TopoForEachNodeWithErrorCaptured(&NodeType::ForEachNodeOnInEdge, + &NodeType::ForEachNodeOnOutEdge, [&](NodeType* node) { + NodeHandler(node); + return Maybe::Ok(); + })); +} + +template +Maybe Graph::TopoForEachNodeDynamicWithErrorCaptured( + std::function(NodeType*)> NodeHandler) const { + return TopoForEachNodeDynamicWithErrorCaptured(source_nodes(), &NodeType::ForEachNodeOnInEdge, + &NodeType::ForEachNodeOnOutEdge, NodeHandler); } template Maybe Graph::TopoForEachNodeWithErrorCaptured( std::function(NodeType*)> NodeHandler) const { - return TopoForEachNodeWithErrorCaptured(source_nodes(), &NodeType::ForEachNodeOnInEdge, + return TopoForEachNodeWithErrorCaptured(&NodeType::ForEachNodeOnInEdge, &NodeType::ForEachNodeOnOutEdge, NodeHandler); } @@ -229,15 +274,14 @@ void Graph::SortedTopoForEachNode( std::function LessThan, std::function NodeHandler) const { ForEachNode([&](NodeType* node) { node->SortInOutEdges(LessThan); }); - TopoForEachNode(source_nodes(), &NodeType::ForEachNodeOnSortedInEdge, - &NodeType::ForEachNodeOnSortedOutEdge, NodeHandler); + TopoForEachNode(&NodeType::ForEachNodeOnSortedInEdge, &NodeType::ForEachNodeOnSortedOutEdge, + NodeHandler); } template void Graph::ReverseTopoForEachNode( std::function NodeHandler) const { - TopoForEachNode(sink_nodes(), &NodeType::ForEachNodeOnOutEdge, &NodeType::ForEachNodeOnInEdge, - NodeHandler); + 
TopoForEachNode(&NodeType::ForEachNodeOnOutEdge, &NodeType::ForEachNodeOnInEdge, NodeHandler); } template @@ -493,6 +537,19 @@ std::unique_ptr> Graph::FindFirstNontrivi return std::unique_ptr>(); } +template +void Graph::TopoForEachNodeDynamic( + const std::list& starts, + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function& Handler) const { + CHECK_JUST(TopoForEachNodeDynamicWithErrorCaptured(starts, ForEachInNode, ForEachOutNode, + [&](NodeType* node) { + Handler(node); + return Maybe::Ok(); + })); +} + template void Graph::TopoForEachNode( const std::list& starts, @@ -507,7 +564,18 @@ void Graph::TopoForEachNode( } template -Maybe Graph::TopoForEachNodeWithErrorCaptured( +void Graph::TopoForEachNode( + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function& Handler) const { + CHECK_JUST(TopoForEachNodeWithErrorCaptured(ForEachInNode, ForEachOutNode, [&](NodeType* node) { + Handler(node); + return Maybe::Ok(); + })); +} + +template +Maybe Graph::TopoForEachNodeDynamicWithErrorCaptured( const std::list& starts, const std::function&)>& ForEachInNode, const std::function&)>& ForEachOutNode, @@ -537,6 +605,64 @@ Maybe Graph::TopoForEachNodeWithErrorCaptured( return Maybe::Ok(); } +template +Maybe Graph::TopoForEachNodeWithErrorCaptured( + const std::list& starts, + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function(NodeType*)>& Handler) const { + HashMap counter_in; + std::queue queue; + for (NodeType* start : starts) { + queue.push(start); + counter_in[start] = 0; + ForEachInNode(start, [&](NodeType*) { LOG(FATAL) << "not a source"; }); + } + while (!queue.empty()) { + NodeType* cur_node = queue.front(); + queue.pop(); + JUST(Handler(cur_node)); + ForEachOutNode(cur_node, [&](NodeType* out) { + auto it = counter_in.find(out); + // Move the initialization here + if (it == counter_in.end()) { + int32_t count = 0; + ForEachInNode(out, [&](NodeType* out_in) { count++; }); + counter_in[out] = count; + it = counter_in.find(out); + } + it->second--; + if (it->second == 0) { queue.push(out); } + }); + } + return Maybe::Ok(); +} + +template +Maybe Graph::TopoForEachNodeWithErrorCaptured( + const std::function&)>& ForEachInNode, + const std::function&)>& ForEachOutNode, + const std::function(NodeType*)>& Handler) const { + HashMap counter_in; + std::queue queue; + ForEachNode([&](NodeType* node) { + int32_t count = 0; + ForEachInNode(node, [&](NodeType*) { count++; }); + counter_in[node] = count; + if (count == 0) { queue.push(node); } + }); + while (!queue.empty()) { + NodeType* cur_node = queue.front(); + queue.pop(); + JUST(Handler(cur_node)); + ForEachOutNode(cur_node, [&](NodeType* out) { + --counter_in[out]; + if (counter_in[out] == 0) { queue.push(out); } + }); + } + return Maybe::Ok(); +} + template void Graph::DfsTopoForEachNodeSortByDistanceToSink( const std::list& starts, @@ -546,7 +672,7 @@ void Graph::DfsTopoForEachNodeSortByDistanceToSink( HashMap node2distance_to_sink; { std::list nodes; - TopoForEachNode(starts, ForEachInNode, ForEachOutNode, + TopoForEachNode(ForEachInNode, ForEachOutNode, [&](NodeType* node) { nodes.emplace_back(node); }); std::list sinks; for (NodeType* node : nodes) { @@ -554,7 +680,7 @@ void Graph::DfsTopoForEachNodeSortByDistanceToSink( ForEachOutNode(node, [&](NodeType* out_node) { is_sink = false; }); if (is_sink) { sinks.emplace_back(node); } } - TopoForEachNode(sinks, ForEachOutNode, ForEachInNode, 
[&](NodeType* node) { + TopoForEachNode(ForEachOutNode, ForEachInNode, [&](NodeType* node) { int64_t distance_to_sink = -1; ForEachOutNode(node, [&](NodeType* out_node) { distance_to_sink = std::max(distance_to_sink, node2distance_to_sink[out_node]); @@ -649,12 +775,12 @@ Graph::MakePredicatorIsReachable( std::shared_ptr id2ancestor(new Id2Ancestor(node_num())); int64_t id = 0; node2id->reserve(node_num()); - TopoForEachNode(starts, ForEachInNode, ForEachOutNode, [&](NodeType* node) { + TopoForEachNode(ForEachInNode, ForEachOutNode, [&](NodeType* node) { node2id->emplace(node, id); id2ancestor->at(id).Resize(node_num()); id += 1; }); - TopoForEachNode(starts, ForEachInNode, ForEachOutNode, [&](NodeType* node) { + TopoForEachNode(ForEachInNode, ForEachOutNode, [&](NodeType* node) { const int64_t node_id = node2id->at(node); auto& ancestor_bitset_vec = id2ancestor->at(node_id); ForEachInNode(node, [&](NodeType* in_node) { diff --git a/oneflow/core/graph/op_graph.cpp b/oneflow/core/graph/op_graph.cpp index 4bd88e55f5f..45e5eba9166 100644 --- a/oneflow/core/graph/op_graph.cpp +++ b/oneflow/core/graph/op_graph.cpp @@ -472,8 +472,7 @@ void OpGraph::TopoForEachNodeWithCtrlEdge(const std::function& No const std::function& Handler) { ForEachDataAndCtrlOutNode(node, Handler); }; - TopoForEachNode(DataOrCtrlSourceNodes(), OpGraphForEachInDataAndCtrlNode, - OpGraphForEachOutDataAndCtrlNode, NodeHandler); + TopoForEachNode(OpGraphForEachInDataAndCtrlNode, OpGraphForEachOutDataAndCtrlNode, NodeHandler); } std::function diff --git a/oneflow/core/graph/straighten_nodes.cpp b/oneflow/core/graph/straighten_nodes.cpp new file mode 100644 index 00000000000..1e708e19df0 --- /dev/null +++ b/oneflow/core/graph/straighten_nodes.cpp @@ -0,0 +1,485 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/graph/straighten_nodes.h" +#include "oneflow/core/graph/op_graph.h" +#include "oneflow/core/graph/task_node.h" +#include "oneflow/core/job/job_desc.h" +#include "oneflow/core/common/protobuf.h" +#include "oneflow/core/job/task.pb.h" + +namespace oneflow { + +namespace { + +enum TaskClassifier : int { + kWaitingTransfer = 0, + kWaitingComputation = 1, + kRunASAP = 2, + kRunALAP = 3 +}; + +class TopoStruct { + public: + TaskNode* node = nullptr; + int32_t min_layer = -1; + int32_t tributary_layer = -1; + bool on_mainstem = false; + int32_t counter = 0; + int32_t min_distance2transfer = -1; + TopoStruct* next_same_node = nullptr; + // We can have some other nodes in it for example + // SbpNode* node; + // SbpEdge* node; + // Or we can omit all the pointers and leave all the useful parameters. 
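  // Editor's note (added annotation, not part of the original patch): a rough reading of the
  // fields above, inferred from how they are used later in this file:
  //   min_layer             - longest-path depth from the source nodes, filled in by StraightenNodes;
  //   tributary_layer       - latest layer this node can be postponed to, computed by SpreadTributaryLayer;
  //   on_mainstem           - whether the node lies on the critical chain found by FindMainstem;
  //   counter               - pending in-/out-edge count, reused by several traversals;
  //   min_distance2transfer - computation steps from this node to the next transfer node;
  //   next_same_node        - circular link joining the "same" nodes placed on different machines.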
+ + // Drop down the tributary layer + void DropTributaryLayer(int32_t upper_bound); + + void SpreadTributaryLayer(HashMap* task_node2topo_struct); + + void SpreadMainstem(HashMap* task_node2topo_struct); + + // The minimum computation distance from the beginning of this op to the next transfer + int32_t GetMinDistance2Transfer(HashMap* task_node2topo_struct); + + // deciding parameter + // i = 0: those with small tributary layers go first + // i = 1: those with small minimum distance to transfer go first + // i = 2: first in first out + // i = 3: those with large tributary layers go first + // i = 4: those with long distance to transfer go first + // i = 5: last in first out + int32_t GetDecidingParameter(int32_t i) const; +}; + +// move the head from source to target +void MoveFrontBetweenMaps(std::map& source, + std::map& target) { + if (!source.empty()) { + const auto& front = source.begin(); + target[front->first] = front->second; + source.erase(front); + } +}; + +bool ShouldRunASAP(TaskType task_type) { + // They are sorted according to frequency of occurrences + switch (task_type) { + // We mark the number of occurrences in bert + case TaskType::kDeviceTick: // 38 + case TaskType::kTick: // 8 + case TaskType::kSrcSubsetTick: // 6 + case TaskType::kDstSubsetTick: // 6 + case TaskType::kCriticalSectionWaitTick: // 4 + case TaskType::kWaitAndSendIds: // 2 + case TaskType::kPack: // 0 + case TaskType::kUnpack: // 0 + case TaskType::kRepeat: // 0 + case TaskType::kAcc: // 0 + case TaskType::kSourceTick: // 0 + case TaskType::kAccTick: // 0 + case TaskType::kCase: // 0 + case TaskType::kEsac: // 0 + case TaskType::kReentrantLock: return true; // 0 + default: return false; + } +} + +bool IsTransferNode(TaskType task_type) { + // return task_type == 12 || task_type == 13 || (48 <= task_type && task_type <= 64); + // They are sorted according to frequency of occurrences + switch (task_type) { + // We mark the number of occurrences in bert + case TaskType::kCollectiveBoxingGeneric: // 76 + case TaskType::kCopyHd: // 27 + case TaskType::kSliceBoxing: // 16 + case TaskType::kCopyCommNet: // 12 + case TaskType::kCollectiveBoxingPack: // 8 + case TaskType::kCollectiveBoxingUnpack: // 8 + case TaskType::kBoxingZeros: // 3 + case TaskType::kForeignInput: // 0 + case TaskType::kForeignOutput: // 0 + case TaskType::kDistributeConcat: // 0 + case TaskType::kDistributeSplit: // 0 + case TaskType::kBoxingIdentity: // 0 + case TaskType::kDecodeH2D: // 0 + case TaskType::kSspVariableProxy: return true; // 0 + default: return false; + } +} + +// Classifier for the set according to the task type +TaskClassifier GetTaskClassifier(const TaskNode* node) { + // Check task.pb.h for detail + // They are sorted according to frequency of judgement + // frequency of judgement = the number of occurrences / the times of judgement + TaskType task_type = node->GetTaskType(); + if (task_type == TaskType::kNormalForward) { return TaskClassifier::kWaitingComputation; } + if (IsTransferNode(task_type)) { return TaskClassifier::kWaitingTransfer; } + if (task_type == TaskType::kCallbackNotify) { return TaskClassifier::kRunALAP; } + if (ShouldRunASAP(task_type)) { return TaskClassifier::kRunASAP; } + CHECK(false) << "Unclassified or invalid task type (" << task_type << ") showing up"; + // Throw a kRunASAP which means ignoring this node in the algorithm + return TaskClassifier::kRunASAP; +} + +// Drop down the maximum layer with the minimum layer form consumer +void TopoStruct::DropTributaryLayer(int32_t upper_bound) { + if 
(upper_bound < tributary_layer || tributary_layer < 0) { tributary_layer = upper_bound; } +} + +// Should initialize the counter to be the number of out edges +// Compute maximum layer for tributaries +void TopoStruct::SpreadTributaryLayer(HashMap* task_node2topo_struct) { + if (counter || min_layer <= 0) { return; } + int32_t producer_max_lay = 0; + if (on_mainstem) { + producer_max_lay = min_layer - 1; + } else { + // On a tributary, the operator could be run later. + producer_max_lay = tributary_layer; + } + node->ForEachNodeOnInEdge([&](TaskNode* in) { + auto& topo_struct_in = task_node2topo_struct->at(in); + topo_struct_in.DropTributaryLayer(producer_max_lay); + --topo_struct_in.counter; + if (topo_struct_in.counter == 0) { topo_struct_in.SpreadTributaryLayer(task_node2topo_struct); } + }); + // Reduce counter to -1 to avoid visiting again + counter--; +} + +// Judge if this node is on the mainstem +// If so, judge it for its producer/upstream nodes +void TopoStruct::SpreadMainstem(HashMap* task_node2topo_struct) { + // Skip it if this node is already judged. + if (on_mainstem) { return; } + CHECK_GE(min_layer, 0) << "TopoStruct not initialized!"; + on_mainstem = true; + // If I am in the mainstem, then all the children with (min_layer >= my layer id - 1) would be + // considered as in the mainstem + node->ForEachNodeOnInEdge([&](TaskNode* in) { + auto& topo_struct_in = task_node2topo_struct->at(in); + if (topo_struct_in.min_layer == min_layer - 1) { + topo_struct_in.SpreadTributaryLayer(task_node2topo_struct); + } + }); +} + +// The minimum computation distance from the beginning of this op to the next transfer +int32_t TopoStruct::GetMinDistance2Transfer(HashMap* task_node2topo_struct) { + if (min_distance2transfer >= 0) { return min_distance2transfer; } + // if this node is a transfer node + if (IsTransferNode(node->GetTaskType())) { + min_distance2transfer = 0; + return min_distance2transfer; + } + // Otherwise, initialize it with a large number + // Well, the total number in the task graph is large enough + min_distance2transfer = task_node2topo_struct->size(); + node->ForEachNodeOnOutEdge([&](TaskNode* out) { + min_distance2transfer = + std::min(min_distance2transfer, + task_node2topo_struct->at(out).GetMinDistance2Transfer(task_node2topo_struct)); + }); + ++min_distance2transfer; + return min_distance2transfer; +} + +// deciding parameter +// i = 0: those with small tributary layers go first +// i = 1: those with small minimum distance to transfer go first +// i = 2: first in first out +// i = 3: those with large tributary layers go first +// i = 4: those with long distance to transfer go first +// i = 5: last in first out +int32_t TopoStruct::GetDecidingParameter(int32_t i) const { + int32_t sign = 1; + if (i >= 3) { + i -= 3; + sign = -1; + } + switch (i) { + case 0: return sign * tributary_layer; + case 1: return sign * min_distance2transfer; + case 2: return sign * min_layer; + } + return 0; +} + +// Find the mainstem of the task graph, then reduce the wait time for tributaries +void FindMainstem(HashMap* task_node2topo_struct) { + // Find the maximum layer number + int32_t max_min_layer = -1; + for (const auto& pair : *task_node2topo_struct) { + if (max_min_layer < pair.second.min_layer) { max_min_layer = pair.second.min_layer; } + } + // All the nodes with min_layer>=mainstem_end_id would be considered as mainstem nodes + // The last 5 layers would be considered as in mainstem anyway. 
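  // Editor's note (illustrative arithmetic, not part of the original patch): with the "- 4"
  // below, if the deepest node has min_layer 20 then mainstem_end_id is 16, so every node whose
  // min_layer falls in [16, 20] is handed to SpreadMainstem in the first loop; the second loop
  // then fills in tributary_layer and min_distance2transfer for every node.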
+ int32_t mainstem_end_id = max_min_layer - 4; + for (auto& pair : *task_node2topo_struct) { + auto& topo_struct = pair.second; + // Initialize the counter and Tributary Layer + topo_struct.counter = pair.first->out_edges().size(); + topo_struct.tributary_layer = max_min_layer; + // Find out all the nodes on the mainstem. + if (topo_struct.min_layer >= mainstem_end_id) { + topo_struct.SpreadMainstem(task_node2topo_struct); + } + } + + for (auto& pair : *task_node2topo_struct) { + // Compute maximum layer for tributaries + pair.second.SpreadTributaryLayer(task_node2topo_struct); + // Set the min_distance2transfer for each topological structure + pair.second.GetMinDistance2Transfer(task_node2topo_struct); + } +} + +} // anonymous namespace + +void StraightenNodes(TaskGraph* task_graph, std::vector* ordered_task_nodes) { + // The function for settle the order in the graph + int64_t order_in_graph = 0; + + // Generate topological data structure for each task node + HashMap task_node2topo_struct; + // Determine the same nodes which should run simultaneously + HashMap>> + task_type2machine_id2node_id2topo_structs; + std::map min_node_id2topo_struct; + int32_t previous_min_layer = 0; + task_graph->TopoForEachNode([&](TaskNode* node) { + auto& topo_struct = task_node2topo_struct[node]; + topo_struct.node = node; + if (node->in_edges().empty()) { + topo_struct.min_layer = 0; + } else { + int32_t max_min_layer = 0; + node->ForEachNodeOnInEdge([&](TaskNode* in) { + max_min_layer = std::max(max_min_layer, task_node2topo_struct[in].min_layer); + }); + topo_struct.min_layer = max_min_layer + 1; + // Deal with all the nodes with min_layer=previous_min_layer + if (max_min_layer >= previous_min_layer) { + // Using "7" to represent "and" + // a7b means a pair (a, b) + for (auto& task_type7machine_id2node_id2topo_structs : + task_type2machine_id2node_id2topo_structs) { + auto& machine_id2node_id2topo_structs = task_type7machine_id2node_id2topo_structs.second; + // Initializing the smallest node id for each machine + for (auto& machine_id7node_id2topo_structs : machine_id2node_id2topo_structs) { + MoveFrontBetweenMaps(machine_id7node_id2topo_structs.second, min_node_id2topo_struct); + } + + while (!min_node_id2topo_struct.empty()) { + // auto* topo_struct_min_node_id = min_node_id2topo_struct.begin()->second; + // Store the same nodes in different machines + std::vector same_nodes; + for (auto& min_node_id7topo_struct : min_node_id2topo_struct) { + auto* curr_topo_struct = min_node_id7topo_struct.second; + // Find out all the same nodes + // Stop using Visual string before we find a better key + // Currently we can use the topological structure and node id to decide the same nodes + same_nodes.push_back(curr_topo_struct); + } + // Cyclize them + for (int32_t i = 1; i < same_nodes.size(); i++) { + same_nodes[i - 1]->next_same_node = same_nodes[i]; + } + (*same_nodes.rbegin())->next_same_node = same_nodes[0]; + // Delete them and add new candidates + for (auto* same_node_topo_struct : same_nodes) { + // Erase them from min_node_id2topo_struct + min_node_id2topo_struct.erase(same_node_topo_struct->node->node_id()); + // Add new candidate + MoveFrontBetweenMaps( + machine_id2node_id2topo_structs[same_node_topo_struct->node->machine_id()], + min_node_id2topo_struct); + } + } + } + // Renew the previous min_layer at the end + previous_min_layer = topo_struct.min_layer; + } + } + // Put the topo structure into the map, waiting for determine the same nodes + 
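  // Editor's note (added annotation, not part of the original patch): the map below is keyed
  // three levels deep, task type -> machine id -> node id, so that when a deeper min_layer is
  // reached the block above can pop the smallest remaining node id per machine for each task
  // type and link the matching TopoStructs into a ring via next_same_node ("Cyclize them").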
task_type2machine_id2node_id2topo_structs[node->GetTaskType()][node->machine_id()] + [node->node_id()] = &topo_struct; + }); + + // Generate other parameters in the topological data structure + FindMainstem(&task_node2topo_struct); + + VLOG(3) << "Straightening order: " << 5 << ", " << 3; + + // Order in the waiting sets + // Decide which node should run first + struct comp { + bool operator()(const TopoStruct* a, const TopoStruct* b) const { + // NOTE: Leave these code for debugging in the future + // static std::vector decide_parameters({ParseIntegerFromEnv("Parameter0", 0), + // ParseIntegerFromEnv("Parameter1", 1), + // ParseIntegerFromEnv("Parameter2", 2)}); + // The best parameter set is {5, 3} + static std::vector decide_parameters({5, 3}); + for (int32_t decide_parameter : decide_parameters) { + int32_t decide_parameter_a = a->GetDecidingParameter(decide_parameter); + int32_t decide_parameter_b = b->GetDecidingParameter(decide_parameter); + if (decide_parameter_a != decide_parameter_b) { + return decide_parameter_a < decide_parameter_b; + } + } + return a->node->node_id() < b->node->node_id(); + } + }; + + // Classify sets for the task nodes + // std::set waiting_transfer; // 0, TaskClassifier::kWaitingTransfer + // std::set waiting_computation; // 1, TaskClassifier::kWaitingComputation + // std::set run_asap; // 2, TaskClassifier::kRunASAP , run as soon as possible + // std::set run_alap; // 3, TaskClassifier::kRunALAP , run as late as possible + const int32_t num_classifier = 4; + std::vector> waiting_lists(num_classifier); + + std::vector remain_task_nums(num_classifier, 0); + + auto SetOrderInGraph = [&](TaskNode* task_node) { + task_node->set_order_in_graph(order_in_graph); + ordered_task_nodes->emplace_back(task_node); + ++order_in_graph; + }; + + // wait in the list + auto wait = [&](TaskNode* node) { + TopoStruct* first_topo_struct = &task_node2topo_struct[node]; + // Check if all the same nodes are ready simultaneously + TopoStruct* curr_topo_struct = first_topo_struct->next_same_node; + while (curr_topo_struct && curr_topo_struct != first_topo_struct) { + if (curr_topo_struct->counter) { return; } + curr_topo_struct = curr_topo_struct->next_same_node; + } + // Add all the same nodes at the same time + curr_topo_struct = first_topo_struct; + auto& waiting_list = waiting_lists[GetTaskClassifier(node)]; + while (true) { + waiting_list.insert(curr_topo_struct); + // Reduce counter then this node will never be added again + // Though inserting into a map twice does not matter because of the same keys + curr_topo_struct->counter--; + curr_topo_struct = curr_topo_struct->next_same_node; + if ((!curr_topo_struct) || (curr_topo_struct == first_topo_struct)) { break; } + } + }; + + // initialization + task_graph->ForEachNode([&](TaskNode* node) { + int32_t count = node->in_edges().size(); + task_node2topo_struct[node].counter = count; + if (count == 0) { wait(node); } + remain_task_nums[GetTaskClassifier(node)]++; + }); + + // Finish execution + auto finish_execution = [&](TaskNode* node) { + node->ForEachNodeOnOutEdge([&](TaskNode* out) { + --(task_node2topo_struct[out].counter); + if (task_node2topo_struct[out].counter == 0) { wait(out); } + }); + }; + + // Move the first node of the waiting list to the execution list + auto move2execution_list = [&](std::set& waiting_list, + std::vector& execution_list) { + TaskNode* first_node = (*waiting_list.begin())->node; + int32_t execution_num = 0; + TopoStruct* first_topo_struct = &task_node2topo_struct[first_node]; + // Find all the 
same nodes in different machine + // They should be run simultaneously + TopoStruct* curr_topo_struct = first_topo_struct; + while (true) { + execution_num++; + execution_list.push_back(curr_topo_struct->node); + waiting_list.erase(curr_topo_struct); + // move and maybe leave + curr_topo_struct = curr_topo_struct->next_same_node; + if ((!curr_topo_struct) || (curr_topo_struct == first_topo_struct)) { break; } + } + CHECK_GT(execution_num, 0) << "Error, no task nodes are moved to the execution list"; + }; + + // Execute the first n nodes in the waiting list + auto execute = [&](int32_t list_classifier, int32_t n, bool if_reverse = false) { + // n > 0 + if (n <= 0) { return; } + auto& waiting_list = waiting_lists[list_classifier]; + std::vector execution_list; + int32_t count = 0; + // Move to the execution list + while (!waiting_list.empty()) { + move2execution_list(waiting_list, execution_list); + count++; + if (count >= n) { break; } + } + remain_task_nums[list_classifier] -= execution_list.size(); + // Set the order and then remove from the execution list + for (auto* node : execution_list) { + SetOrderInGraph(node); + finish_execution(node); + } + }; + + // straightening + while (true) { + if (waiting_lists[TaskClassifier::kRunASAP].empty()) { + if (waiting_lists[TaskClassifier::kWaitingTransfer].empty()) { + if (waiting_lists[TaskClassifier::kWaitingComputation].empty()) { + if (waiting_lists[TaskClassifier::kRunALAP].empty()) { + // All the waiting lists are empty + break; + } else { + // Execute all the nodes left + execute(TaskClassifier::kRunALAP, waiting_lists[TaskClassifier::kRunALAP].size()); + } + } else { + // Execute one computation node + execute(TaskClassifier::kWaitingComputation, 1); + } + } else { + int32_t computation_num = + std::min(int32_t(waiting_lists[TaskClassifier::kWaitingComputation].size() + / (waiting_lists[TaskClassifier::kWaitingTransfer].size())), + remain_task_nums[TaskClassifier::kWaitingComputation] + / remain_task_nums[TaskClassifier::kWaitingTransfer]); + // Holding the transfer + std::vector transfer_execution_list; + move2execution_list(waiting_lists[TaskClassifier::kWaitingTransfer], + transfer_execution_list); + remain_task_nums[TaskClassifier::kWaitingTransfer] -= transfer_execution_list.size(); + for (auto* transfer_node : transfer_execution_list) { SetOrderInGraph(transfer_node); } + // Overlap transfer with computation + execute(TaskClassifier::kWaitingComputation, computation_num); + + // Release the transfer + for (auto* transfer_node : transfer_execution_list) { finish_execution(transfer_node); } + } + } else { + execute(TaskClassifier::kRunASAP, waiting_lists[TaskClassifier::kRunASAP].size()); + } + } +} + +} // namespace oneflow diff --git a/oneflow/ir/include/OneFlow/Conversion/SCFToGPU.h b/oneflow/core/graph/straighten_nodes.h similarity index 68% rename from oneflow/ir/include/OneFlow/Conversion/SCFToGPU.h rename to oneflow/core/graph/straighten_nodes.h index e6c70591035..e68a03c698c 100644 --- a/oneflow/ir/include/OneFlow/Conversion/SCFToGPU.h +++ b/oneflow/core/graph/straighten_nodes.h @@ -13,19 +13,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_SCFTOGPU_H_ -#define ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_SCFTOGPU_H_ +#ifndef ONEFLOW_CORE_GRAPH_STRAIGHTEN_NODES_H_ +#define ONEFLOW_CORE_GRAPH_STRAIGHTEN_NODES_H_ -#include "mlir/Pass/Pass.h" - -namespace mlir { +#include "oneflow/core/graph/task_graph.h" namespace oneflow { -std::unique_ptr createMapSCFToGPUPass(); +void StraightenNodes(TaskGraph* task_graph, std::vector* ordered_task_nodes); } // namespace oneflow -} // namespace mlir - -#endif // ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_SCFTOGPU_H_ +#endif // ONEFLOW_CORE_GRAPH_STRAIGHTEN_NODES_H_ diff --git a/oneflow/core/graph/task_graph.cpp b/oneflow/core/graph/task_graph.cpp index 040e113ad14..79f6ffdc74c 100644 --- a/oneflow/core/graph/task_graph.cpp +++ b/oneflow/core/graph/task_graph.cpp @@ -29,6 +29,7 @@ limitations under the License. #include "oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h" #include "oneflow/core/graph/task_stream_index_manager.h" #include "oneflow/core/ep/include/primitive/memcpy.h" +#include "oneflow/core/graph/straighten_nodes.h" namespace oneflow { @@ -419,7 +420,7 @@ void ForEachOpGraphNecessaryCtrlEdge( } // namespace -TaskGraph::TaskGraph() { +TaskGraph::TaskGraph(bool disable_straighten_algorithm) { OpGraph* op_graph = Global::Get(); sub_tsk_gph_builder_ctx_.reset(new SubTskGphBuilderCtx(this)); boxing_logger_ = CreateBoxingLogger(); @@ -450,7 +451,11 @@ TaskGraph::TaskGraph() { } }); - SetOrderInGraphForEachNode(); + if (disable_straighten_algorithm) { + SetOrderInGraphForEachNode(); + } else { + StraightenNodes(this, &ordered_task_nodes_); + } if (Global::Get()->enable_debug_mode()) { ToDotWithAutoFilePath(); } } diff --git a/oneflow/core/graph/task_graph.h b/oneflow/core/graph/task_graph.h index 71593a834f1..2ec3e15f18e 100644 --- a/oneflow/core/graph/task_graph.h +++ b/oneflow/core/graph/task_graph.h @@ -43,7 +43,7 @@ class TaskGraph final : public Graph { OF_DISALLOW_COPY_AND_MOVE(TaskGraph); ~TaskGraph() override; - explicit TaskGraph(); + explicit TaskGraph(bool disable_straighten_algorithm); const char* TypeName() const override { return "TaskGraph"; } void RemoveEmptyRegsts(); diff --git a/oneflow/core/job/compiler.cpp b/oneflow/core/job/compiler.cpp index 7cdcbb9a5e1..a2d47a1d38a 100644 --- a/oneflow/core/job/compiler.cpp +++ b/oneflow/core/job/compiler.cpp @@ -61,7 +61,8 @@ void Compiler::Compile(Job* job, Plan* plan, bool need_job_complete) const { // Step3: build task_gph. // TODO(levi): we can rewrite this part of code in visitor pattern. - auto task_gph = std::make_unique(); + auto task_gph = + std::make_unique(job->job_conf().disable_straighten_algorithm_in_task_graph()); using std::placeholders::_1; task_gph->ForEachNode(std::bind(&TaskNode::ProduceAllRegstsAndBindEdges, _1)); task_gph->ForEachNode(std::bind(&TaskNode::ConsumeAllRegsts, _1)); diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index 959a7837010..85408c2c45e 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -19,6 +19,8 @@ limitations under the License. 
#include "oneflow/core/job/eager_nccl_comm_manager.h" #include "oneflow/core/device/nccl_util.h" #include "oneflow/core/job/id_manager.h" +#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/vm/vm_util.h" #ifdef WITH_CUDA @@ -76,8 +78,14 @@ void CreateNcclComm(ncclComm_t* comm, const int dev, const std::string& key, << ", key = {" << key << "}\n"; } +bool NeedUnifiedNcclCommInit(const std::string& op_type_name) { + return UserKernelUnifiedNcclCommInitRegistry::Instance().IsRegistered(op_type_name); +} + } // namespace +const std::string EagerNcclCommMgr::kDefaultStreamName = "DEFAULT"; + EagerNcclCommMgr::~EagerNcclCommMgr() { for (auto& device_set7device_id2comm : device_set2device_id2comm_) { for (auto& device_id7comm : device_set7device_id2comm.second) { @@ -139,6 +147,69 @@ ncclComm_t EagerNcclCommMgr::GetCommForDeviceAndStreamName( return comm; } +void EagerNcclCommMgr::CreateCommFromPlan(const Plan& plan) { + const int64_t rank = GlobalProcessCtx::Rank(); + const int64_t dev = GlobalProcessCtx::LocalRank(); + std::map>> nccl_comm_key2devices; + + for (const auto& task_proto : plan.task()) { + if (task_proto.machine_id() != rank) { continue; } + if (task_proto.exec_sequence().exec_node_size() != 1) { continue; } + const auto& kernel_conf = task_proto.exec_sequence().exec_node(0).kernel_conf(); + const OpAttribute* op_attr = nullptr; + if (kernel_conf.has_op_attribute()) { + op_attr = &kernel_conf.op_attribute(); + } else if (kernel_conf.has_op_attribute_ref()) { + const auto& ref_name = kernel_conf.op_attribute_ref(); + op_attr = &plan.job_id2op_attribute_ref_table() + .at(task_proto.job_id()) + .op_name2op_attribute() + .at(ref_name); + } else { + continue; + } + const auto& op_conf = op_attr->op_conf(); + if (!op_conf.has_user_conf()) { continue; } + if (!NeedUnifiedNcclCommInit(op_conf.user_conf().op_type_name())) { continue; } + + if (!op_attr->has_parallel_conf_signature()) { continue; } + if (!op_attr->parallel_conf_signature().has_op_parallel_conf()) { continue; } + + std::vector> device_vec; + ParallelDesc parallel_desc(op_attr->parallel_conf_signature().op_parallel_conf()); + for (int64_t parallel_id = 0; parallel_id < parallel_desc.parallel_num(); ++parallel_id) { + int64_t machine_id = CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_id)); + int64_t device_id = CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_id)); + device_vec.emplace_back(machine_id, device_id); + } + + std::string stream_name = kDefaultStreamName; + if (op_conf.has_stream_name_hint()) { stream_name = op_conf.stream_name_hint(); } + std::string key = GetNcclUniqueIdRpcKey(device_vec) + "-stream_name_hint:" + stream_name; + + VLOG(3) << " EagerNcclCommMgr create nccl comm for " << op_conf.name() << ", rank = " << rank + << ", dev = " << dev << ", key = {" << key << "}\n"; + nccl_comm_key2devices.emplace(std::move(key), std::move(device_vec)); + } + + if (nccl_comm_key2devices.size() == 0) { return; } + + CHECK_JUST(vm::CurrentRankSync()); + CudaCurrentDeviceGuard guard(dev); + + for (const auto& pair : nccl_comm_key2devices) { + const auto& key = pair.first; + auto device_id2comm_it = device7stream2device_id2comm_.find(key); + if (device_id2comm_it != device7stream2device_id2comm_.end()) { + auto comm_it = device_id2comm_it->second.find(dev); + if (comm_it != device_id2comm_it->second.end()) { continue; } + } + ncclComm_t comm; + CreateNcclComm(&comm, dev, key, pair.second); + device7stream2device_id2comm_[key][dev] = comm; + } +} + } // namespace oneflow #endif // WITH_CUDA diff 
--git a/oneflow/core/job/eager_nccl_comm_manager.h b/oneflow/core/job/eager_nccl_comm_manager.h index d818a916731..77526fdff40 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.h +++ b/oneflow/core/job/eager_nccl_comm_manager.h @@ -27,6 +27,8 @@ namespace oneflow { class EagerNcclCommMgr final { public: + static const std::string kDefaultStreamName; + OF_DISALLOW_COPY_AND_MOVE(EagerNcclCommMgr); ~EagerNcclCommMgr(); @@ -34,6 +36,8 @@ class EagerNcclCommMgr final { ncclComm_t GetCommForDeviceAndStreamName(const std::set>& device_set, const std::string& stream_name); + void CreateCommFromPlan(const Plan& plan); + private: friend class Global; EagerNcclCommMgr() = default; @@ -44,8 +48,43 @@ class EagerNcclCommMgr final { std::mutex mutex_; }; +class UserKernelUnifiedNcclCommInitRegistry final { + public: + struct Trigger { + explicit Trigger(const std::string& key) { + UserKernelUnifiedNcclCommInitRegistry::Instance().Register(key); + } + }; + + static UserKernelUnifiedNcclCommInitRegistry& Instance() { + static UserKernelUnifiedNcclCommInitRegistry reg; + return reg; + } + + OF_DISALLOW_COPY_AND_MOVE(UserKernelUnifiedNcclCommInitRegistry); + ~UserKernelUnifiedNcclCommInitRegistry() = default; + + void Register(const std::string& key) { + bool insert_success = reg_set_.insert(key).second; + if (!insert_success) { + std::cerr << key << " was already registered in NcclCommRegistry" << std::endl; + abort(); + } + } + + bool IsRegistered(const std::string& key) const { return reg_set_.find(key) != reg_set_.end(); } + + private: + UserKernelUnifiedNcclCommInitRegistry() = default; + std::set reg_set_; +}; + } // namespace oneflow +#define REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT(op_type_name) \ + static auto OF_PP_CAT(g_nccl_comm_reg_, __COUNTER__) = \ + ::oneflow::UserKernelUnifiedNcclCommInitRegistry::Trigger(op_type_name) + #endif // WITH_CUDA #endif // ONEFLOW_CORE_JOB_EAGER_NCCL_COMM_MANAGER_H_ diff --git a/oneflow/core/job/env_global_objects_scope.cpp b/oneflow/core/job/env_global_objects_scope.cpp index 95b529b0f97..400770cf0f2 100644 --- a/oneflow/core/job/env_global_objects_scope.cpp +++ b/oneflow/core/job/env_global_objects_scope.cpp @@ -115,18 +115,11 @@ void ClearAllSymbol() { Global>::Get()->ClearAll(); } -#if defined(__linux__) && defined(WITH_RDMA) +#if defined(WITH_RDMA) && defined(OF_PLATFORM_POSIX) -bool CommNetIBEnabled() { - bool user_enabled = ParseBooleanFromEnv("ONEFLOW_COMM_NET_IB_ENABLE", false); - if (user_enabled) { - return ibv::IsAvailable(); - } else { - return false; - } -} +bool CommNetIBEnabled() { return ibv::IsAvailable(); } -#endif +#endif // WITH_RDMA && OF_PLATFORM_POSIX } // namespace @@ -202,16 +195,7 @@ Maybe EnvGlobalObjectsScope::Init(const EnvProto& env_proto) { Global::New(); Global::New(); if (Global::Get()->process_ranks().size() > 1) { -#ifdef WITH_RDMA - if (CommNetIBEnabled()) { - Global::New(); - Global::SetAllocated(Global::Get()); - } else { - Global::SetAllocated(Global::Get()); - } -#else Global::SetAllocated(Global::Get()); -#endif // WITH_RDMA } #endif // __linux__ } @@ -277,4 +261,40 @@ EnvGlobalObjectsScope::~EnvGlobalObjectsScope() { google::ShutdownGoogleLogging(); } +Maybe InitRDMA() { + if (!Global::Get()->enable_dry_run()) { +#ifdef __linux__ + if (Global::Get()->process_ranks().size() > 1) { +#if defined(WITH_RDMA) && defined(OF_PLATFORM_POSIX) + if (CommNetIBEnabled()) { + if (Global::Get() == nullptr) { + Global::New(); + Global::SetAllocated(Global::Get()); + } else { + LOG(WARNING) << "Skip init RDMA because RDMA is 
already initialized!"; + } + } else { + LOG(WARNING) << "Skip init RDMA because RDMA is unavailable!"; + } +#else + LOG(WARNING) << "Skip init RDMA because RDMA is not compiled!"; +#endif // WITH_RDMA && OF_PLATFORM_POSIX + } else { + LOG(WARNING) << "Skip init RDMA because only one process in this group!"; + } +#endif // __linux__ + } else { + LOG(WARNING) << "Skip init RDMA in dry run mode!"; + } + return Maybe::Ok(); +} + +Maybe RDMAIsInitialized() { +#if defined(WITH_RDMA) && defined(OF_PLATFORM_POSIX) + return Global::Get() != nullptr; +#else + return false; +#endif // WITH_RDMA && OF_PLATFORM_POSIX +} + } // namespace oneflow diff --git a/oneflow/core/job/env_global_objects_scope.h b/oneflow/core/job/env_global_objects_scope.h index 845aff0cb04..ff17a05573f 100644 --- a/oneflow/core/job/env_global_objects_scope.h +++ b/oneflow/core/job/env_global_objects_scope.h @@ -45,6 +45,10 @@ class EnvGlobalObjectsScope final { Optional is_normal_exit_; }; +Maybe InitRDMA(); + +Maybe RDMAIsInitialized(); + } // namespace oneflow #endif // ONEFLOW_CORE_JOB_CLUSTER_OBJECTS_SCOPE_H_ diff --git a/oneflow/core/job/intra_job_mem_sharing_util.cpp b/oneflow/core/job/intra_job_mem_sharing_util.cpp index 1af896e1b57..6ee9e8ecea0 100644 --- a/oneflow/core/job/intra_job_mem_sharing_util.cpp +++ b/oneflow/core/job/intra_job_mem_sharing_util.cpp @@ -528,7 +528,7 @@ void MemReusedAlgorithm_AllocateByOrderAndMutualExclusion( void MemReusedAlgorithm_MemSizeFirstAlgo( const HashMap>& regst2mutual_exclusion_regsts, - MemBlockResultInfo* result) { + const HashMap& regst2alloc_order, MemBlockResultInfo* result) { std::vector order; order.reserve(regst2mutual_exclusion_regsts.size()); HashMap regst_desc2size; @@ -538,7 +538,10 @@ void MemReusedAlgorithm_MemSizeFirstAlgo( .second); } std::sort(order.begin(), order.end(), [&](RegstDescProto* lhs, RegstDescProto* rhs) { - return regst_desc2size.at(lhs) > regst_desc2size.at(rhs); + int64_t l_size = regst_desc2size.at(lhs); + int64_t r_size = regst_desc2size.at(rhs); + if (l_size == r_size) { return regst2alloc_order.at(lhs) < regst2alloc_order.at(rhs); } + return l_size > r_size; }); MemReusedAlgorithm_AllocateByOrderAndMutualExclusion(order, regst_desc2size, regst2mutual_exclusion_regsts, result); @@ -546,7 +549,7 @@ void MemReusedAlgorithm_MemSizeFirstAlgo( void MemReusedAlgorithm_MutualExclusionFirstAlgo( const HashMap>& regst2mutual_exclusion_regsts, - MemBlockResultInfo* result) { + const HashMap& regst2alloc_order, MemBlockResultInfo* result) { std::vector order; order.reserve(regst2mutual_exclusion_regsts.size()); HashMap regst_desc2size; @@ -556,8 +559,10 @@ void MemReusedAlgorithm_MutualExclusionFirstAlgo( .second); } std::sort(order.begin(), order.end(), [&](RegstDescProto* lhs, RegstDescProto* rhs) { - return regst2mutual_exclusion_regsts.at(lhs).size() - < regst2mutual_exclusion_regsts.at(rhs).size(); + int64_t l_size = regst2mutual_exclusion_regsts.at(lhs).size(); + int64_t r_size = regst2mutual_exclusion_regsts.at(rhs).size(); + if (l_size == r_size) { return regst2alloc_order.at(lhs) < regst2alloc_order.at(rhs); } + return l_size > r_size; }); MemReusedAlgorithm_AllocateByOrderAndMutualExclusion(order, regst_desc2size, regst2mutual_exclusion_regsts, result); @@ -704,12 +709,20 @@ void SelectAlgorithmGenMemBlockOffset4Regsts( MemBlockResultInfo* result) { CHECK_EQ(result->mem_block_size, 0); CHECK(result->regst_desc2offset.empty()); + + // NOTE(chengcheng): When mem size or exclusion num equal, there need second order by allocate. 
+ HashMap regst2alloc_order; + for (int64_t i = 0; i < alloc_regsts_timeline.size(); ++i) { + const auto& regsts = alloc_regsts_timeline.at(i); + for (RegstDescProto* regst : regsts) { CHECK(regst2alloc_order.emplace(regst, i).second); } + } switch (algo_id) { case kMemSizeFirstAlgo: - MemReusedAlgorithm_MemSizeFirstAlgo(regst2mutual_exclusion_regsts, result); + MemReusedAlgorithm_MemSizeFirstAlgo(regst2mutual_exclusion_regsts, regst2alloc_order, result); break; case kMutualExclusionFirstAlgo: - MemReusedAlgorithm_MutualExclusionFirstAlgo(regst2mutual_exclusion_regsts, result); + MemReusedAlgorithm_MutualExclusionFirstAlgo(regst2mutual_exclusion_regsts, regst2alloc_order, + result); break; case kTimeLineAlgo: MemReusedAlgorithm_TimeLineAlgo(alloc_regsts_timeline, free_regsts_timeline, result); diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 8ae659fd541..23711a89b94 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -196,7 +196,7 @@ void JobBuildAndInferCtx::AddOpAndUpdateJobParallelViewConf(const OperatorConf& (*module_name2module_conf)[module_name].set_name(scope.scope_proto().module_name()); } - (*module_name2module_conf)[module_name].add_ops()->CopyFrom(operator_conf); + *((*module_name2module_conf)[module_name].add_ops()) = operator_conf.name(); } } @@ -999,7 +999,7 @@ Maybe LazyJobBuildAndInferCtx::Complete() { int32_t pass_cnt = 0; const int64_t prev_v = FLAGS_v; auto DoPass = [&](const std::string& pass_name, int32_t cnt = 0) -> Maybe { - VLOG(1) << job_name << " is compiling with pass" + VLOG(1) << job_name << " start compiling with pass" << " pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name << (cnt > 0 ? std::to_string(cnt) : ""); if (unlikely(NeedLogJob(pass_name))) { @@ -1013,6 +1013,9 @@ Maybe LazyJobBuildAndInferCtx::Complete() { std::string cnt_str = cnt > 0 ? std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-after"); } + VLOG(1) << job_name << " finish compiling with pass" + << " pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + << (cnt > 0 ? std::to_string(cnt) : ""); ++pass_cnt; return Maybe::Ok(); }; diff --git a/oneflow/core/job/job_builder.cpp b/oneflow/core/job/job_builder.cpp index b13bd8a67fd..fcfacd60087 100644 --- a/oneflow/core/job/job_builder.cpp +++ b/oneflow/core/job/job_builder.cpp @@ -19,7 +19,10 @@ limitations under the License. 
#include "oneflow/core/common/container_util.h" #include "oneflow/core/job/job.pb.h" #include "oneflow/core/job/sbp_parallel.pb.h" +#include "oneflow/core/operator/op_conf.pb.h" #include "oneflow/core/operator/operator.h" +#include "oneflow/core/vm/symbol_storage.h" +#include "oneflow/core/framework/scope_util.h" namespace oneflow { @@ -170,6 +173,7 @@ Maybe JobBuilder::AddOp(const ParallelConf& parallel_conf, const OperatorC OperatorConf* mut_op_conf = job_->mutable_net()->add_op(); *mut_op_conf = op_conf; CHECK_OR_RETURN(op_name2op_conf_.emplace(op_conf.name(), mut_op_conf).second); + AddOpToModuleConf(op_conf); AddOpNamesToPlacementGroup({op_conf.name()}, parallel_conf); return Maybe::Ok(); } @@ -185,10 +189,35 @@ void JobBuilder::AddOps(const ParallelConf& parallel_conf, *mut_op_conf = op_conf; CHECK(op_name2op_conf_.emplace(op_conf.name(), mut_op_conf).second); op_names.emplace_back(op_conf.name()); + AddOpToModuleConf(op_conf); } AddOpNamesToPlacementGroup(op_names, parallel_conf); } +void JobBuilder::AddOpToModuleConf(const OperatorConf& op_conf) { + // set up the module config + if (Global>::Get()->Has(op_conf.scope_symbol_id())) { + const auto& scope = Global>::Get()->Get(op_conf.scope_symbol_id()); + if (scope.scope_proto().has_module_name()) { + const auto& module_name = scope.scope_proto().module_name(); + auto* module_name2module_conf = job_->mutable_module_name2module_conf(); + if (!(*module_name2module_conf)[module_name].has_name()) { + (*module_name2module_conf)[module_name].set_name(scope.scope_proto().module_name()); + } + + *((*module_name2module_conf)[module_name].add_ops()) = op_conf.name(); + return; + } + } + const auto& module_name = job_->job_conf().job_name(); + auto* module_name2module_conf = job_->mutable_module_name2module_conf(); + if (!(*module_name2module_conf)[module_name].has_name()) { + (*module_name2module_conf)[module_name].set_name(module_name); + } + + *((*module_name2module_conf)[module_name].add_ops()) = op_conf.name(); +} + void JobBuilder::AddOpNamesToPlacementGroup(const std::vector& op_names, const ParallelConf& parallel_conf) { PlacementGroup* placement_group = nullptr; @@ -230,6 +259,21 @@ void JobBuilder::RemoveOpByName(const std::unordered_set& removing_ for (const OperatorConf& op_conf : net.op()) { if (removing_names.count(op_conf.name()) == 0) { *(job_->mutable_net()->add_op()) = op_conf; } } + // Update module conf + auto module_confs_map = job_->module_name2module_conf(); + job_->clear_module_name2module_conf(); + for (const auto& module_conf_pair : module_confs_map) { + const auto& module_name = module_conf_pair.first; + auto* module_name2module_conf = job_->mutable_module_name2module_conf(); + if (!(*module_name2module_conf)[module_name].has_name()) { + (*module_name2module_conf)[module_name].set_name(module_name); + } + for (const auto& op_name : module_conf_pair.second.ops()) { + if (removing_names.count(op_name) == 0) { + *((*module_name2module_conf)[module_name].add_ops()) = op_name; + } + } + } // Update placement auto placement_group = job_->placement().placement_group(); job_->mutable_placement()->clear_placement_group(); diff --git a/oneflow/core/job/job_builder.h b/oneflow/core/job/job_builder.h index e9faf8645ec..a954d12ed7e 100644 --- a/oneflow/core/job/job_builder.h +++ b/oneflow/core/job/job_builder.h @@ -81,6 +81,7 @@ class JobBuilder final { private: void AddOpNamesToPlacementGroup(const std::vector& op_names, const ParallelConf& parallel_conf); + void AddOpToModuleConf(const OperatorConf& op_conf); Job* job_; HashMap 
op_name2op_conf_; diff --git a/oneflow/core/job/job_conf.proto b/oneflow/core/job/job_conf.proto index 03638feec30..18dcb92e41b 100644 --- a/oneflow/core/job/job_conf.proto +++ b/oneflow/core/job/job_conf.proto @@ -240,6 +240,8 @@ message JobConfigProto { optional bool cudnn_conv_enable_pseudo_half = 600 [default = true]; optional bool enable_auto_mixed_precision = 602 [default = false]; optional bool enable_quantization_aware_training = 603 [default = false]; + + optional bool disable_straighten_algorithm_in_task_graph = 700 [default = false]; optional int64 concurrency_width = 1000 [default = 128]; diff --git a/oneflow/core/job/module_conf.proto b/oneflow/core/job/module_conf.proto index b44913ac7f8..dbbdb389c88 100644 --- a/oneflow/core/job/module_conf.proto +++ b/oneflow/core/job/module_conf.proto @@ -1,9 +1,7 @@ syntax = "proto2"; package oneflow; -import "oneflow/core/operator/op_conf.proto"; - message ModuleConf { required string name = 1; - repeated OperatorConf ops = 2; + repeated string ops = 2; } diff --git a/oneflow/core/job/nd_sbp_util.cpp b/oneflow/core/job/nd_sbp_util.cpp index c8502367838..9726e5e902b 100644 --- a/oneflow/core/job/nd_sbp_util.cpp +++ b/oneflow/core/job/nd_sbp_util.cpp @@ -71,7 +71,7 @@ std::vector GetTensorSliceView(const int64_t parallel_num, ranges[i].mut_begin() = 0; ranges[i].mut_end() = shape.At(i); } - if (shape.NumAxes() == 0 && shape.elem_cnt() == 1) { + if (shape.NumAxes() == 0) { // NOTE(chengcheng): For Scalar Tensor. ranges.emplace_back(0, 1); } @@ -105,7 +105,7 @@ TensorSliceView GetTensorSliceView4ParallelRank(const Shape& parallel_hierarchy, ranges[i].mut_begin() = 0; ranges[i].mut_end() = logical_shape.At(i); } - if (logical_shape.NumAxes() == 0 && logical_shape.elem_cnt() == 1) { + if (logical_shape.NumAxes() == 0) { // NOTE(chengcheng): For Scalar Tensor. 
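  // Editor's note (added annotation, not part of the original patch): both hunks in this file
  // now treat any rank-0 shape as a scalar without also requiring elem_cnt() == 1, presumably
  // because a rank-0 shape already implies a single element; the behaviour for non-scalar
  // shapes is unchanged.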
ranges.emplace_back(0, 1); } diff --git a/oneflow/core/job/plan_util.cpp b/oneflow/core/job/plan_util.cpp index dff5faa8065..fc7aec57dbe 100644 --- a/oneflow/core/job/plan_util.cpp +++ b/oneflow/core/job/plan_util.cpp @@ -861,8 +861,9 @@ namespace { struct MemBlockMemoryInfo { int64_t mem_block_id; int64_t mem_block_mem_size; + bool is_reused; std::vector ordered_op_names; - MemBlockMemoryInfo() : mem_block_id(-1), mem_block_mem_size(-1) {} + MemBlockMemoryInfo() : mem_block_id(-1), mem_block_mem_size(-1), is_reused(false) {} }; struct ChunkMemoryInfo { @@ -924,7 +925,10 @@ void PlanUtil::PlanMemoryLog(Plan* plan, const std::string& plan_name) { if (mem_block.mem_case().has_device_cuda_mem()) { if (mem_block.has_chunk_id()) { rank_memory_info.chunk_info.mem_block_ids.push_back(mem_block_id); + info.is_reused = true; } else { + rank_memory_info.chunk_info.mem_block_ids.push_back(mem_block_id); + info.is_reused = false; rank_memory_info.not_reused_mem_size += mem_block.mem_size(); rank_memory_info.total_mem_size += mem_block.mem_size(); if (mem_block.has_variable_op_name()) { @@ -968,25 +972,26 @@ void PlanUtil::PlanMemoryLog(Plan* plan, const std::string& plan_name) { << B2MiB(rank_memory_info.eager_variable_total_mem_size) << " MiB ]."; } - if (IsInDebugMode()) { - for (const auto& rank_memory_info : rank_device_memory_infos) { - int64_t chunk_id = rank_memory_info.chunk_info.chunk_id; - VLOG(2) << " For detail: Chunk id: " << chunk_id << " has " - << rank_memory_info.chunk_info.mem_block_ids.size() << " MemBlocks."; - for (int64_t mem_block_id : rank_memory_info.chunk_info.mem_block_ids) { - CHECK(mem_block_id2info.find(mem_block_id) != mem_block_id2info.end()); - const auto& mem_block_info = mem_block_id2info.at(mem_block_id); - VLOG(2) << " In Chunk id: " << chunk_id << " MemBlock id: " << mem_block_id - << " has num = " << mem_block_info.ordered_op_names.size() - << " ops with mem size = " << B2MiB(mem_block_info.mem_block_mem_size); - } - for (int64_t mem_block_id : rank_memory_info.chunk_info.mem_block_ids) { - CHECK(mem_block_id2info.find(mem_block_id) != mem_block_id2info.end()); - const auto& mem_block_info = mem_block_id2info.at(mem_block_id); - for (int64_t i = 0; i < mem_block_info.ordered_op_names.size(); ++i) { - VLOG(3) << " In Chunk id: " << chunk_id << " MemBlock id: " << mem_block_id - << " order: " << i << " op_name: " << mem_block_info.ordered_op_names.at(i); - } + for (const auto& rank_memory_info : rank_device_memory_infos) { + int64_t chunk_id = rank_memory_info.chunk_info.chunk_id; + int64_t device_id = rank_memory_info.device_id; + int64_t not_reuse_size = rank_memory_info.not_reused_mem_size; + VLOG(2) << " For detail: Chunk id: " << chunk_id << " has " + << rank_memory_info.chunk_info.mem_block_ids.size() << " MemBlocks" + << " not reused size = " << B2MiB(not_reuse_size); + for (int64_t mem_block_id : rank_memory_info.chunk_info.mem_block_ids) { + CHECK(mem_block_id2info.find(mem_block_id) != mem_block_id2info.end()); + const auto& mem_block_info = mem_block_id2info.at(mem_block_id); + VLOG(2) << " In Device: " << device_id << " Chunk id: " << chunk_id + << " MemBlock id: " << mem_block_id + << " has num = " << mem_block_info.ordered_op_names.size() + << " ops with mem size = " << B2MiB(mem_block_info.mem_block_mem_size) + << " is reused " << mem_block_info.is_reused; + for (int64_t i = 0; i < mem_block_info.ordered_op_names.size(); ++i) { + VLOG(3) << " In Device: " << device_id << " Chunk id: " << chunk_id + << " In MemBlock id: " << mem_block_id << " order: " 
<< i << " is reused " + << mem_block_info.is_reused + << " op_name: " << mem_block_info.ordered_op_names.at(i); } } } diff --git a/oneflow/core/job/runtime.cpp b/oneflow/core/job/runtime.cpp index 6c920f9ec0e..f5167fca246 100644 --- a/oneflow/core/job/runtime.cpp +++ b/oneflow/core/job/runtime.cpp @@ -23,6 +23,7 @@ limitations under the License. #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/runtime_context.h" #include "oneflow/core/job/runtime_job_descs.h" +#include "oneflow/core/job/eager_nccl_comm_manager.h" #include "oneflow/core/thread/thread_manager.h" #include "oneflow/core/graph/task_node.h" #include "oneflow/core/device/cuda_util.h" @@ -69,6 +70,9 @@ Runtime::Runtime( Global::Get()->AddPlan(plan); collective_boxing_scheduler_plan_token_ = Global::Get()->AddPlan(plan); +#ifdef WITH_CUDA + Global::Get()->CreateCommFromPlan(plan); +#endif // WITH_CUDA } std::vector source_tasks; source_tasks.reserve(plan.task().size()); diff --git a/oneflow/core/job_rewriter/group_boxing_by_dst_parallel.cpp b/oneflow/core/job_rewriter/group_boxing_by_dst_parallel.cpp index 29915c8667f..0fb0dba7d6a 100644 --- a/oneflow/core/job_rewriter/group_boxing_by_dst_parallel.cpp +++ b/oneflow/core/job_rewriter/group_boxing_by_dst_parallel.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/job_rewriter/group_boxing_by_dst_parallel.h" +#include "oneflow/core/framework/sbp_infer_util.h" #include "oneflow/core/job/job_desc.h" #include "oneflow/core/common/protobuf.h" @@ -28,18 +29,35 @@ Maybe GroupBoxingByDstParallel(const OpGraph& op_graph, JobBuilder* job_bu OperatorConf::OpTypeCase op_type_case = node->op().op_conf().op_type_case(); if (IsClassRegistered(op_type_case)) { return; } for (const std::string& ibn : node->op().input_bns()) { + const auto& blob_modifier_ = node->op().InputBlobModifier4Ibn(ibn); + if (blob_modifier_.has_is_mutable() && blob_modifier_.is_mutable()) { continue; } const LogicalBlobId& lbi = node->op().BnInOp2Lbi(ibn); const OpNode& producer = node->ProducerOpNode4Lbi(lbi); const NdSbp& producer_nd_sbp = producer.NdSbp4Lbi(lbi); + const std::string& producer_lbn = *CHECK_JUST(producer.op().obn4lbi(lbi)); + const ParallelDesc& producer_parallel_desc = + *CHECK_JUST(producer.op().GetParallelDesc4BnInOp(producer_lbn)).get(); + ParallelDesc reduced_in_parallel_desc = producer_parallel_desc; + NdSbp reduced_in_nd_sbp; + NdSbpDimReduce(producer_parallel_desc, producer_nd_sbp, &reduced_in_parallel_desc, + &reduced_in_nd_sbp); + const NdSbp& consumer_nd_sbp = node->NdSbp4BnInOp(ibn); + const ParallelDesc& consumer_parallel_desc = + *CHECK_JUST(node->op().GetParallelDesc4BnInOp(ibn)); + ParallelDesc reduced_out_parallel_desc = consumer_parallel_desc; + NdSbp reduced_out_nd_sbp; + NdSbpDimReduce(consumer_parallel_desc, consumer_nd_sbp, &reduced_out_parallel_desc, + &reduced_out_nd_sbp); - if (producer.parallel_desc() != node->parallel_desc() - || (node->parallel_desc().parallel_num() != 1 && producer_nd_sbp != consumer_nd_sbp)) { - lbi2consumer_grouped_by_parallel[lbi][{node->parallel_desc(), consumer_nd_sbp}].push_back( - {node, ibn}); - if (op_node2op_conf.find(node) == op_node2op_conf.end()) { - op_node2op_conf[node] = node->op().op_conf(); - } + if (reduced_in_parallel_desc == reduced_out_parallel_desc + && reduced_in_nd_sbp == reduced_out_nd_sbp) { + continue; + } + lbi2consumer_grouped_by_parallel[lbi][{reduced_out_parallel_desc, reduced_out_nd_sbp}] + .push_back({node, ibn}); + 
if (op_node2op_conf.find(node) == op_node2op_conf.end()) { + op_node2op_conf[node] = node->op().op_conf(); } } }); diff --git a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp index 9d211f74d21..d15b5313c9f 100644 --- a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp +++ b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp @@ -26,7 +26,7 @@ limitations under the License. #include "oneflow/core/vm/vm_util.h" #include "oneflow/core/vm/symbol_storage.h" #include "oneflow/core/operator/operator.h" -#include "oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.h" +#include "oneflow/core/framework/sbp_infer_util.h" namespace oneflow { @@ -232,6 +232,18 @@ bool TryBuildNcclBy1DHierarchy(OperatorConf* ret, const SbpParallel& src_sbp, .Build() .op_conf(); return true; + } else if (!dst_sbp.has_partial_sum_parallel()) { + *ret = user_op::UserOpConfWrapperBuilder(kNcclLogicalOpNamePrefix + "-(Send)2(Recv)-" + + NewUniqueId()) + .Op("_nccl_logical_send_recv") + .Input("in", lbn) + .Output("out") + .Attr<std::vector<std::string>>("src_nd_sbp", {SbpToString(src_sbp)}) + .Attr<std::vector<std::string>>("dst_nd_sbp", {SbpToString(dst_sbp)}) + .ScopeSymbolId(scope_symbol_id) + .Build() + .op_conf(); + return true; } return false; } @@ -517,7 +529,7 @@ void InsertNcclLogicalOpsAsCloseAsPossibleToSrcNode( } if (Global::Get()->enable_debug_mode()) { - VLOG(3) << " insert nccl op: " << nccl_op.name() << " from [" << src_op_name + VLOG(2) << " insert nccl op: " << nccl_op.name() << " from [" << src_op_name << ", order=" << src_order << ", sbp=" << NdSbpToString(src_node->NdSbp4Lbi(lbi)) << "] to [" << dst_op_name << ", order=" << node2subgraph_order.at(dst_node) << ", sbp=" << NdSbpToString(dst_node->NdSbp4Lbi(lbi)) << "] and before [" @@ -583,7 +595,7 @@ void InsertNcclLogicalOpsAsCloseAsPossibleToDstNode( } if (Global::Get()->enable_debug_mode()) { - VLOG(3) << " insert nccl op: " << nccl_op.name() << " from [" << src_op_name + VLOG(2) << " insert nccl op: " << nccl_op.name() << " from [" << src_op_name << ", order=" << node2subgraph_order.at(src_node) << "] to [" << dst_op_name << ", order=" << dst_order << "] and after [" << pre_op_name << ", order=" << dst_order - 1 << "]\n"; diff --git a/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp b/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp index 19851e21852..e9a0211ea62 100644 --- a/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp +++ b/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp @@ -213,8 +213,8 @@ Maybe<void> SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, .Op("nll") .Input("input", broadcast_sub_op.output("z", 0)) .Input("target", op_label_blob_name) - .Output("out") - .Output("total_weight") + .Output("output") + .Output("out_weight") .Attr<int64_t>("ignore_index", -100) .ScopeSymbolId(scope_symbol_id) .Build(); @@ -223,7 +223,7 @@ Maybe<void> SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, const std::string& prob_lbn = cur_op.output("prob", 0); const std::string& out_lbn = cur_op.output("out", 0); const std::string& new_prob_lbn = broadcast_div_op.output("z", 0); - const std::string& new_out_lbn = nll_op.output("out", 0); + const std::string& new_out_lbn = nll_op.output("output", 0); for (const OpEdge* out_edge : node->out_edges()) { const OpNode* consumer = out_edge->dst_node(); diff --git a/oneflow/core/kernel/blob_tensor_view.cpp b/oneflow/core/kernel/blob_tensor_view.cpp index
bd9c1df9949..f84e14160c5 100644 --- a/oneflow/core/kernel/blob_tensor_view.cpp +++ b/oneflow/core/kernel/blob_tensor_view.cpp @@ -22,9 +22,9 @@ namespace user_op { BlobTensorView::BlobTensorView(Blob* blob) : blob_(blob) {} -ShapeView BlobTensorView::shape() const { return blob_->shape(); } +ShapeView BlobTensorView::shape_view() const { return blob_->shape(); } -MutShapeView BlobTensorView::mut_shape() { return *blob_->mut_shape_view(); } +MutShapeView BlobTensorView::mut_shape_view() { return *blob_->mut_shape_view(); } const Stride& BlobTensorView::stride() const { return blob_->stride(); } diff --git a/oneflow/core/kernel/blob_tensor_view.h b/oneflow/core/kernel/blob_tensor_view.h index 7277c2d35cf..129a6330880 100644 --- a/oneflow/core/kernel/blob_tensor_view.h +++ b/oneflow/core/kernel/blob_tensor_view.h @@ -29,8 +29,8 @@ class BlobTensorView final : public Tensor { explicit BlobTensorView(Blob* blob); ~BlobTensorView() = default; - ShapeView shape() const override; - MutShapeView mut_shape() override; + ShapeView shape_view() const override; + MutShapeView mut_shape_view() override; const Stride& stride() const override; DataType data_type() const override; const MemoryCase& mem_case() const override; diff --git a/oneflow/core/kernel/user_kernel.cpp b/oneflow/core/kernel/user_kernel.cpp index e5f29fe99bc..1f29ad41012 100644 --- a/oneflow/core/kernel/user_kernel.cpp +++ b/oneflow/core/kernel/user_kernel.cpp @@ -427,14 +427,14 @@ class UserKernelInferContext final : public user_op::KernelInferContext { user_op::Tensor* arg_tensor = Tensor4ArgNameAndIndex(arg_name, arg_index); CHECK(arg_tensor != nullptr) << "Tensor of arg (" << arg_name << "," << arg_index << ") is not found"; - return arg_tensor->shape(); + return arg_tensor->shape_view(); } MutShapeView MutShapeView4ArgNameAndIndex(const std::string& arg_name, int32_t arg_index) override { user_op::Tensor* arg_tensor = Tensor4ArgNameAndIndex(arg_name, arg_index); CHECK(arg_tensor != nullptr) << "Tensor of arg (" << arg_name << "," << arg_index << ") is not found"; - return arg_tensor->mut_shape(); + return arg_tensor->mut_shape_view(); } user_op::InferContext* MutOpInferContext() override { return &op_infer_ctx_; } diff --git a/oneflow/core/ndarray/cpu_concat_var_ndarray_test.cpp b/oneflow/core/ndarray/cpu_concat_var_ndarray_test.cpp index c632aefb331..d2e5a8b8ec4 100644 --- a/oneflow/core/ndarray/cpu_concat_var_ndarray_test.cpp +++ b/oneflow/core/ndarray/cpu_concat_var_ndarray_test.cpp @@ -26,9 +26,9 @@ TEST(CpuConcatVarNdarray, two_elem_concat) { std::vector buffer{-1, -1}; std::vector expected{0, 1}; CpuNdarrayBuilder ndarray; - auto x0 = ndarray.Var({1LL}, x0_data.data()); - auto x1 = ndarray.Var({1LL}, x1_data.data()); - ndarray.Var({2LL}, buffer.data()).CopyFrom(ndarray.Concatenate({x0, x1})); + auto x0 = ndarray.Var(Shape{1LL}, x0_data.data()); + auto x1 = ndarray.Var(Shape{1LL}, x1_data.data()); + ndarray.Var(Shape{2LL}, buffer.data()).CopyFrom(ndarray.Concatenate({x0, x1})); ASSERT_EQ(memcmp(buffer.data(), expected.data(), sizeof(int32_t) * 2), 0); } @@ -37,9 +37,9 @@ TEST(CpuConcatVarNdarray, two_elem_concat_assign) { std::vector x1_data{-1}; std::vector buffer{0, 1}; CpuNdarrayBuilder ndarray; - auto x0 = ndarray.Var({1LL}, x0_data.data()); - auto x1 = ndarray.Var({1LL}, x1_data.data()); - ndarray.Concatenate({x0, x1}).CopyFrom(ndarray.Var({2LL}, buffer.data())); + auto x0 = ndarray.Var(Shape{1LL}, x0_data.data()); + auto x1 = ndarray.Var(Shape{1LL}, x1_data.data()); + ndarray.Concatenate({x0, 
x1}).CopyFrom(ndarray.Var(Shape{2LL}, buffer.data())); ASSERT_EQ(x0_data[0], 0); ASSERT_EQ(x1_data[0], 1); } @@ -61,9 +61,9 @@ TEST(CpuConcatVarNdarray, 2d_concat) { std::vector buffer(10, -1); // clang-format on CpuNdarrayBuilder ndarray; - auto x0 = ndarray.Var({2LL, 3LL}, x0_data.data()); - auto x1 = ndarray.Var({2LL, 2LL}, x1_data.data()); - ndarray.Var({2LL, 5LL}, buffer.data()).CopyFrom(ndarray.template Concatenate<1>({x0, x1})); + auto x0 = ndarray.Var(Shape{2LL, 3LL}, x0_data.data()); + auto x1 = ndarray.Var(Shape{2LL, 2LL}, x1_data.data()); + ndarray.Var(Shape{2LL, 5LL}, buffer.data()).CopyFrom(ndarray.template Concatenate<1>({x0, x1})); ASSERT_EQ(memcmp(buffer.data(), expected.data(), sizeof(int32_t) * 10), 0); } @@ -85,9 +85,9 @@ TEST(CpuConcatVarNdarray, 2d_concat_assign) { }; // clang-format on CpuNdarrayBuilder ndarray; - auto x = ndarray.Var({2LL, 5LL}, x_data.data()); - auto y0 = ndarray.Var({2LL, 3LL}, y0_buffer.data()); - auto y1 = ndarray.Var({2LL, 2LL}, y1_buffer.data()); + auto x = ndarray.Var(Shape{2LL, 5LL}, x_data.data()); + auto y0 = ndarray.Var(Shape{2LL, 3LL}, y0_buffer.data()); + auto y1 = ndarray.Var(Shape{2LL, 2LL}, y1_buffer.data()); ndarray.template Concatenate<1>({y0, y1}).CopyFrom(x); ASSERT_EQ(memcmp(y0_buffer.data(), y0_expected.data(), sizeof(int32_t) * 6), 0); ASSERT_EQ(memcmp(y1_buffer.data(), y1_expected.data(), sizeof(int32_t) * 4), 0); @@ -119,9 +119,10 @@ TEST(CpuConcatVarNdarray, 3d_concat) { std::vector buffer(20, -1); // clang-format on CpuNdarrayBuilder ndarray; - auto x0 = ndarray.Var({2LL, 2LL, 3LL}, x0_data.data()); - auto x1 = ndarray.Var({2LL, 2LL, 2LL}, x1_data.data()); - ndarray.Var({2LL, 2LL, 5LL}, buffer.data()).CopyFrom(ndarray.template Concatenate<2>({x0, x1})); + auto x0 = ndarray.Var(Shape{2LL, 2LL, 3LL}, x0_data.data()); + auto x1 = ndarray.Var(Shape{2LL, 2LL, 2LL}, x1_data.data()); + ndarray.Var(Shape{2LL, 2LL, 5LL}, buffer.data()) + .CopyFrom(ndarray.template Concatenate<2>({x0, x1})); ASSERT_EQ(memcmp(buffer.data(), expected.data(), sizeof(int32_t) * 20), 0); } @@ -152,9 +153,9 @@ TEST(CpuConcatVarNdarray, 3d_concat_assign) { std::vector y1_buffer(2*2*2, -1); // clang-format on CpuNdarrayBuilder ndarray; - auto x = ndarray.Var({2LL, 2LL, 5LL}, x_data.data()); - auto y0 = ndarray.Var({2LL, 2LL, 3LL}, y0_buffer.data()); - auto y1 = ndarray.Var({2LL, 2LL, 2LL}, y1_buffer.data()); + auto x = ndarray.Var(Shape{2LL, 2LL, 5LL}, x_data.data()); + auto y0 = ndarray.Var(Shape{2LL, 2LL, 3LL}, y0_buffer.data()); + auto y1 = ndarray.Var(Shape{2LL, 2LL, 2LL}, y1_buffer.data()); ndarray.template Concatenate<2>({y0, y1}).CopyFrom(x); ASSERT_EQ(memcmp(y0_buffer.data(), y0_expected.data(), sizeof(int32_t) * y0_expected.size()), 0); ASSERT_EQ(memcmp(y1_buffer.data(), y1_expected.data(), sizeof(int32_t) * y1_expected.size()), 0); diff --git a/oneflow/core/ndarray/cpu_slice_var_ndarray_test.cpp b/oneflow/core/ndarray/cpu_slice_var_ndarray_test.cpp index 9abead525b8..db1f62ab40f 100644 --- a/oneflow/core/ndarray/cpu_slice_var_ndarray_test.cpp +++ b/oneflow/core/ndarray/cpu_slice_var_ndarray_test.cpp @@ -24,8 +24,8 @@ TEST(CpuSliceVarNdarray, one_elem_assign) { std::vector data({1}); std::vector buffer({0}); CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({1LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({1LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{1LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{1LL}, buffer.data()); buffer_ndarray(0).CopyFrom(data_ndarray(0)); ASSERT_EQ(data[0], 
buffer[0]); } @@ -34,8 +34,8 @@ TEST(CpuSliceVarNdarray, one_elem_assign_slice_on_slice) { std::vector data({1}); std::vector buffer({0}); CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({1LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({1LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{1LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{1LL}, buffer.data()); buffer_ndarray(0)(0).CopyFrom(data_ndarray(0)(0)); ASSERT_EQ(data[0], buffer[0]); } @@ -44,8 +44,8 @@ TEST(CpuSliceVarNdarray, 1d_assign) { std::vector data({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); std::vector buffer(10, 0); CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({10LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({10LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{10LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{10LL}, buffer.data()); buffer_ndarray({}).CopyFrom(data_ndarray({})); ASSERT_EQ(memcmp(data.data(), buffer.data(), sizeof(int32_t) * 10), 0); } @@ -55,8 +55,8 @@ TEST(CpuSliceVarNdarray, 1d_slice_assign) { std::vector buffer(10, 100); std::vector expected({100, 1, 2, 3, 4, 5, 6, 7, 8, 100}); CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({static_cast(data.size())}, data.data()); - auto&& buffer_ndarray = ndarray.Var({10LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{static_cast(data.size())}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{10LL}, buffer.data()); ASSERT_EQ(buffer_ndarray({1, -1}).xpu_shape(), XpuShape(Shape({8}))); buffer_ndarray({1, -1}).CopyFrom(data_ndarray({})); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * 10), 0); @@ -67,8 +67,8 @@ TEST(CpuSliceVarNdarray, 1d_slice) { std::vector buffer(8, 100); std::vector expected({1, 2, 3, 4, 5, 6, 7, 8}); CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({static_cast(data.size())}, data.data()); - auto&& buffer_ndarray = ndarray.Var({static_cast(buffer.size())}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{static_cast(data.size())}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{static_cast(buffer.size())}, buffer.data()); buffer_ndarray({}).CopyFrom(data_ndarray({1, -1})); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * buffer.size()), 0); } @@ -85,8 +85,8 @@ TEST(CpuSliceVarNdarray, 2d_slice) { std::vector buffer(4, 100); std::vector expected({0, 1, 2, 3}); CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({4LL, 4LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({2LL, 2LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{4LL, 4LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{2LL, 2LL}, buffer.data()); buffer_ndarray({}, {}).CopyFrom(data_ndarray({1, -1}, {1, -1})); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * buffer.size()), 0); } @@ -103,8 +103,8 @@ TEST(CpuSliceVarNdarray, 2d_slice_assign) { }); // clang-format on CpuNdarrayBuilder ndarray; - auto&& data_ndarray = ndarray.Var({2LL, 2LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({4LL, 4LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{2LL, 2LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{4LL, 4LL}, buffer.data()); buffer_ndarray({1, -1}, {1, -1}).CopyFrom(data_ndarray({}, {})); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * buffer.size()), 0); } @@ -126,8 +126,8 @@ TEST(CpuSliceVarNdarray, 2d_slice_reverse) { }); // clang-format on CpuNdarrayBuilder ndarray; - auto&& 
data_ndarray = ndarray.Var({4LL, 4LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({4LL, 4LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{4LL, 4LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{4LL, 4LL}, buffer.data()); buffer_ndarray({1, -1}, {1, -1}).CopyFrom(data_ndarray({-2, 0, -1}, {1, -1})); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * buffer.size()), 0); } @@ -155,8 +155,8 @@ TEST(CpuSliceVarNdarray, 3d_slice) { }); // clang-format on CpuNdarrayBuilder<int32_t, 3> ndarray; - auto&& data_ndarray = ndarray.Var({2LL, 4LL, 4LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({2LL, 2LL, 2LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{2LL, 4LL, 4LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{2LL, 2LL, 2LL}, buffer.data()); buffer_ndarray.CopyFrom(data_ndarray({}, {1, -1}, {1, -1})); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * buffer.size()), 0); } @@ -184,8 +184,8 @@ TEST(CpuSliceVarNdarray, 3d_slice_assign) { }); // clang-format on CpuNdarrayBuilder<int32_t, 3> ndarray; - auto&& data_ndarray = ndarray.Var({2LL, 2LL, 2LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({2LL, 4LL, 4LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{2LL, 2LL, 2LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{2LL, 4LL, 4LL}, buffer.data()); buffer_ndarray({}, {1, -1}, {1, -1}).CopyFrom(data_ndarray); ASSERT_EQ(memcmp(expected.data(), buffer.data(), sizeof(int32_t) * buffer.size()), 0); } diff --git a/oneflow/core/ndarray/cpu_var_ndarray_test.cpp b/oneflow/core/ndarray/cpu_var_ndarray_test.cpp index bdcbbf11697..5d24a6e4863 100644 --- a/oneflow/core/ndarray/cpu_var_ndarray_test.cpp +++ b/oneflow/core/ndarray/cpu_var_ndarray_test.cpp @@ -24,8 +24,8 @@ TEST(CpuVarNdarray, one_elem_assign) { std::vector<int32_t> data({1}); std::vector<int32_t> buffer({0}); CpuNdarrayBuilder<int32_t, 1> ndarray; - auto&& data_ndarray = ndarray.Var({1LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({1LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{1LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{1LL}, buffer.data()); buffer_ndarray.CopyFrom(data_ndarray); ASSERT_EQ(data[0], buffer[0]); } @@ -34,8 +34,8 @@ TEST(CpuVarNdarray, 1d_assign) { std::vector<int32_t> data({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); std::vector<int32_t> buffer(10, 0); CpuNdarrayBuilder<int32_t, 1> ndarray; - auto&& data_ndarray = ndarray.Var({10LL}, data.data()); - auto&& buffer_ndarray = ndarray.Var({10LL}, buffer.data()); + auto&& data_ndarray = ndarray.Var(Shape{10LL}, data.data()); + auto&& buffer_ndarray = ndarray.Var(Shape{10LL}, buffer.data()); buffer_ndarray.CopyFrom(data_ndarray); ASSERT_EQ(memcmp(data.data(), buffer.data(), sizeof(int32_t) * 10), 0); } diff --git a/oneflow/core/operator/operator.cpp b/oneflow/core/operator/operator.cpp index ed24e39eb79..35e9f236938 100644 --- a/oneflow/core/operator/operator.cpp +++ b/oneflow/core/operator/operator.cpp @@ -727,7 +727,6 @@ Maybe<void> Operator::GreedilyFindMinCopyCostNdSbp( double priority_ratio = ComputeSbpInferPriority( producer_infer_hint4ibn->nd_sbp(), JUST(VectorAt(nd_sbp_sig_list, i)).bn_in_op2nd_sbp().at(ibn), - producer_infer_hint4ibn->logical_blob_desc(), producer_infer_hint4ibn->parallel_desc(), *JUST(GetParallelDesc4BnInOp(ibn)), requires_same_sbp[ibn_id]); sum_priority_ratio += priority_ratio; @@ -847,11 +846,6 @@ Maybe<void> Operator::InferNdSbpSignature( HashMap<std::string, SbpInferHint> ibn2sbp_infer_hint; for (const auto& ibn : input_bns()) { const NdSbpInferHint* hint = JUST(NdSbpInferHint4Ibn(ibn)); - if
(hint->nd_sbp().sbp_parallel_size() != 1) { - CHECK_OR_RETURN(Is1dSbp(hint->nd_sbp()) || hint->parallel_desc().parallel_num() == 1) - << op_name() << ", " << *JUST(PlacementToString(hint->parallel_desc())) << ", " - << NdSbpToString(hint->nd_sbp()); - } ibn2sbp_infer_hint.emplace(ibn, SbpInferHint(&hint->parallel_desc(), &hint->logical_blob_desc(), &hint->nd_sbp().sbp_parallel(0))); diff --git a/oneflow/core/vm/barrier_instruction_type.h b/oneflow/core/vm/barrier_instruction_type.h new file mode 100644 index 00000000000..f6f3e20edc2 --- /dev/null +++ b/oneflow/core/vm/barrier_instruction_type.h @@ -0,0 +1,66 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_VM_BARRIER_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_VM_BARRIER_INSTRUCTION_TYPE_H_ + +#include "oneflow/core/common/util.h" +#include "oneflow/core/intrusive/flat_msg_view.h" +#include "oneflow/core/rpc/include/base.h" +#include "oneflow/core/vm/control_stream_type.h" +#include "oneflow/core/vm/instruction_type.h" +#include "oneflow/core/vm/instruction.h" +#include "oneflow/core/vm/virtual_machine_engine.h" +#include "oneflow/core/vm/barrier_phy_instr_operand.h" +#include "oneflow/core/control/global_process_ctx.h" + +namespace oneflow { +namespace vm { + +class BarrierInstructionType : public InstructionType { + public: + BarrierInstructionType() = default; + virtual ~BarrierInstructionType() override = default; + + bool IsBarrier() const override { return true; } + + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { return "Barrier"; } + void Compute(Instruction* instruction) const override { Run(instruction->instr_msg()); } + void ComputeInFuseMode(InstructionMsg* instr_msg) const override { Run(*instr_msg); } + + protected: + void Run(const InstructionMsg& instr_msg) const { + const auto* operand = + dynamic_cast(instr_msg.phy_instr_operand().get()); + CHECK_NOTNULL(operand)->callback(); + } +}; + +class GlobalSyncInstructionType : public InstructionType { + public: + GlobalSyncInstructionType() = default; + virtual ~GlobalSyncInstructionType() override = default; + + bool IsBarrier() const override { return true; } + + std::string DebugName(const vm::InstructionMsg& instr_msg) const override { return "GlobalSync"; } + void Compute(Instruction* instruction) const override { OF_ENV_BARRIER(); } + void ComputeInFuseMode(InstructionMsg* instr_msg) const override { OF_ENV_BARRIER(); } +}; + +} // namespace vm +} // namespace oneflow + +#endif // ONEFLOW_CORE_VM_BARRIER_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/vm/control_stream_type.cpp b/oneflow/core/vm/control_stream_type.cpp index 931f9b2ae2b..f007ea33812 100644 --- a/oneflow/core/vm/control_stream_type.cpp +++ b/oneflow/core/vm/control_stream_type.cpp @@ -13,7 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "oneflow/core/vm/stream_desc.h" #include "oneflow/core/vm/control_stream_type.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/instruction.h" @@ -27,8 +26,7 @@ namespace oneflow { namespace vm { void ControlStreamType::Compute(Instruction* instruction) const { - const auto& instr_type_id = instruction->instr_msg().instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); + instruction->instr_msg().instruction_type().Compute(instruction); auto* status_buffer = instruction->mut_status_buffer(); NaiveInstrStatusQuerier::MutCast(status_buffer->mut_buffer()->mut_data())->set_done(); } @@ -50,14 +48,5 @@ bool ControlStreamType::QueryInstructionStatusDone( return NaiveInstrStatusQuerier::Cast(status_buffer.buffer().data())->done(); } -intrusive::shared_ptr ControlStreamType::MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const { - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(1); - ret->set_num_streams_per_thread(1); - return ret; -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/control_stream_type.h b/oneflow/core/vm/control_stream_type.h index a5e66dcd6a5..622bf318d93 100644 --- a/oneflow/core/vm/control_stream_type.h +++ b/oneflow/core/vm/control_stream_type.h @@ -29,8 +29,6 @@ class ControlStreamType final : public StreamType { ControlStreamType() = default; ~ControlStreamType() = default; - const char* stream_tag() const override { return "control"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override {} void InitInstructionStatus(const Stream& stream, @@ -39,8 +37,6 @@ class ControlStreamType final : public StreamType { InstructionStatusBuffer* status_buffer) const override; bool QueryInstructionStatusDone(const Stream& stream, const InstructionStatusBuffer& status_buffer) const override; - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; void Compute(Instruction* instruction) const override; bool OnSchedulerThread() const override { return true; } diff --git a/oneflow/core/vm/cpu_stream_type.cpp b/oneflow/core/vm/cpu_stream_type.cpp index ca61f0aba73..8e04d05f8ba 100644 --- a/oneflow/core/vm/cpu_stream_type.cpp +++ b/oneflow/core/vm/cpu_stream_type.cpp @@ -49,24 +49,10 @@ bool CpuStreamType::QueryInstructionStatusDone(const Stream& stream, void CpuStreamType::Compute(Instruction* instruction) const { OF_PROFILER_RANGE_GUARD("S:" + instruction->instr_msg().DebugName()); - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - } + instruction->instr_msg().instruction_type().Compute(instruction); auto* status_buffer = instruction->mut_status_buffer(); NaiveInstrStatusQuerier::MutCast(status_buffer->mut_buffer()->mut_data())->set_done(); } -intrusive::shared_ptr CpuStreamType::MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const { - if (!resource.has_cpu_device_num()) { return intrusive::shared_ptr(); } - std::size_t device_num = resource.cpu_device_num(); - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(device_num); - ret->set_num_streams_per_thread(device_num); - return ret; -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/cpu_stream_type.h b/oneflow/core/vm/cpu_stream_type.h index 304f1ff29e7..f94226ac7c1 100644 --- 
a/oneflow/core/vm/cpu_stream_type.h +++ b/oneflow/core/vm/cpu_stream_type.h @@ -30,8 +30,6 @@ class CpuStreamType final : public StreamType { CpuStreamType() = default; ~CpuStreamType() override = default; - const char* stream_tag() const override { return "cpu"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; void InitInstructionStatus(const Stream& stream, @@ -41,8 +39,6 @@ class CpuStreamType final : public StreamType { bool QueryInstructionStatusDone(const Stream& stream, const InstructionStatusBuffer& status_buffer) const override; void Compute(Instruction* instruction) const override; - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; bool OnSchedulerThread() const override { return false; } bool SupportingTransportInstructions() const override { return true; } }; diff --git a/oneflow/core/eager/critical_section_status_querier.h b/oneflow/core/vm/critical_section_status_querier.h similarity index 91% rename from oneflow/core/eager/critical_section_status_querier.h rename to oneflow/core/vm/critical_section_status_querier.h index 6b5293a7789..8e26fccf4d1 100644 --- a/oneflow/core/eager/critical_section_status_querier.h +++ b/oneflow/core/vm/critical_section_status_querier.h @@ -13,8 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_CORE_EAGER_CRITICAL_SECTION_QUERIER_H_ -#define ONEFLOW_CORE_EAGER_CRITICAL_SECTION_QUERIER_H_ +#ifndef ONEFLOW_CORE_VM_CRITICAL_SECTION_QUERIER_H_ +#define ONEFLOW_CORE_VM_CRITICAL_SECTION_QUERIER_H_ #include #include @@ -58,4 +58,4 @@ class CriticalSectionStatusQuerier final { } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_EAGER_CRITICAL_SECTION_QUERIER_H_ +#endif // ONEFLOW_CORE_VM_CRITICAL_SECTION_QUERIER_H_ diff --git a/oneflow/core/eager/critical_section_stream_type.cpp b/oneflow/core/vm/critical_section_stream_type.cpp similarity index 75% rename from oneflow/core/eager/critical_section_stream_type.cpp rename to oneflow/core/vm/critical_section_stream_type.cpp index 86f9a7a8b72..b718fafc220 100644 --- a/oneflow/core/eager/critical_section_stream_type.cpp +++ b/oneflow/core/vm/critical_section_stream_type.cpp @@ -14,11 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "oneflow/core/eager/critical_section_stream_type.h" +#include "oneflow/core/vm/critical_section_stream_type.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/thread_ctx.h" -#include "oneflow/core/eager/critical_section_status_querier.h" +#include "oneflow/core/vm/critical_section_status_querier.h" #include "oneflow/core/common/util.h" namespace oneflow { @@ -47,19 +47,7 @@ bool CriticalSectionStreamType::QueryInstructionStatusDone( } void CriticalSectionStreamType::Compute(Instruction* instruction) const { - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - } -} - -intrusive::shared_ptr CriticalSectionStreamType::MakeStreamDesc( - const Resource& resource, int64_t this_machine_id) const { - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(1); - ret->set_num_streams_per_thread(1); - return ret; + instruction->instr_msg().instruction_type().Compute(instruction); } } // namespace vm diff --git a/oneflow/core/eager/critical_section_stream_type.h b/oneflow/core/vm/critical_section_stream_type.h similarity index 80% rename from oneflow/core/eager/critical_section_stream_type.h rename to oneflow/core/vm/critical_section_stream_type.h index b71ace70090..f4ad4e9a5e7 100644 --- a/oneflow/core/eager/critical_section_stream_type.h +++ b/oneflow/core/vm/critical_section_stream_type.h @@ -14,8 +14,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_CORE_EAGER_CRITICAL_SECTION_STREAM_TYPE_H_ -#define ONEFLOW_CORE_EAGER_CRITICAL_SECTION_STREAM_TYPE_H_ +#ifndef ONEFLOW_CORE_VM_CRITICAL_SECTION_STREAM_TYPE_H_ +#define ONEFLOW_CORE_VM_CRITICAL_SECTION_STREAM_TYPE_H_ #include "oneflow/core/intrusive/flat_msg_view.h" #include "oneflow/core/vm/stream_type.h" @@ -31,8 +31,6 @@ class CriticalSectionStreamType final : public StreamType { CriticalSectionStreamType() = default; virtual ~CriticalSectionStreamType() = default; - const char* stream_tag() const override { return "critical_section"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; void InitInstructionStatus(const Stream& stream, @@ -44,11 +42,9 @@ class CriticalSectionStreamType final : public StreamType { void Compute(Instruction* instruction) const override; bool OnSchedulerThread() const override { return false; } bool SupportingTransportInstructions() const override { return false; } - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; }; } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_EAGER_CRITICAL_SECTION_STREAM_TYPE_H_ +#endif // ONEFLOW_CORE_VM_CRITICAL_SECTION_STREAM_TYPE_H_ diff --git a/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp b/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp index ee1acaaeb49..2437b5d3521 100644 --- a/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp +++ b/oneflow/core/vm/cuda_copy_d2h_stream_type.cpp @@ -55,27 +55,12 @@ bool CudaCopyD2HStreamType::QueryInstructionStatusDone( void CudaCopyD2HStreamType::Compute(Instruction* instruction) const { auto* stream = instruction->mut_stream(); cudaSetDevice(stream->device_id()); - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - OF_CUDA_CHECK(cudaGetLastError()); - } + 
instruction->instr_msg().instruction_type().Compute(instruction); + OF_CUDA_CHECK(cudaGetLastError()); char* data_ptr = instruction->mut_status_buffer()->mut_buffer()->mut_data(); CudaOptionalEventRecordStatusQuerier::MutCast(data_ptr)->SetLaunched(stream->device_ctx().get()); } -// Specifies copy_d2h stream description of the virtual machine to be used. -intrusive::shared_ptr CudaCopyD2HStreamType::MakeStreamDesc( - const Resource& resource, int64_t this_machine_id) const { - if (!resource.has_gpu_device_num()) { return intrusive::shared_ptr(); } - std::size_t device_num = resource.gpu_device_num(); - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(device_num); - ret->set_num_streams_per_thread(device_num); - return ret; -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/cuda_copy_d2h_stream_type.h b/oneflow/core/vm/cuda_copy_d2h_stream_type.h index 4ba2bc3cfa0..c8039af3537 100644 --- a/oneflow/core/vm/cuda_copy_d2h_stream_type.h +++ b/oneflow/core/vm/cuda_copy_d2h_stream_type.h @@ -37,8 +37,6 @@ class CudaCopyD2HStreamType final : public StreamType { CudaCopyD2HStreamType() = default; ~CudaCopyD2HStreamType() = default; - const char* stream_tag() const override { return "cuda_d2h"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; void InitInstructionStatus(const Stream& stream, @@ -48,8 +46,6 @@ class CudaCopyD2HStreamType final : public StreamType { bool QueryInstructionStatusDone(const Stream& stream, const InstructionStatusBuffer& status_buffer) const override; void Compute(Instruction* instruction) const override; - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; bool OnSchedulerThread() const override { return true; } bool SupportingTransportInstructions() const override { return false; } }; diff --git a/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp b/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp index 84dcc316457..8bfba60c214 100644 --- a/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp +++ b/oneflow/core/vm/cuda_copy_h2d_stream_type.cpp @@ -49,26 +49,12 @@ bool CudaCopyH2DStreamType::QueryInstructionStatusDone( void CudaCopyH2DStreamType::Compute(Instruction* instruction) const { auto* stream = instruction->mut_stream(); cudaSetDevice(stream->device_id()); - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - OF_CUDA_CHECK(cudaGetLastError()); - } + instruction->instr_msg().instruction_type().Compute(instruction); + OF_CUDA_CHECK(cudaGetLastError()); char* data_ptr = instruction->mut_status_buffer()->mut_buffer()->mut_data(); CudaOptionalEventRecordStatusQuerier::MutCast(data_ptr)->SetLaunched(stream->device_ctx().get()); } -intrusive::shared_ptr CudaCopyH2DStreamType::MakeStreamDesc( - const Resource& resource, int64_t this_machine_id) const { - if (!resource.has_gpu_device_num()) { return intrusive::shared_ptr(); } - std::size_t device_num = resource.gpu_device_num(); - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(device_num); - ret->set_num_streams_per_thread(device_num); - return ret; -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/cuda_copy_h2d_stream_type.h b/oneflow/core/vm/cuda_copy_h2d_stream_type.h index 24237260544..22e6180b0eb 100644 --- a/oneflow/core/vm/cuda_copy_h2d_stream_type.h +++ 
b/oneflow/core/vm/cuda_copy_h2d_stream_type.h @@ -36,8 +36,6 @@ class CudaCopyH2DStreamType final : public StreamType { CudaCopyH2DStreamType() = default; ~CudaCopyH2DStreamType() = default; - const char* stream_tag() const override { return "cuda_h2d"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; void InitInstructionStatus(const Stream& stream, @@ -47,8 +45,6 @@ class CudaCopyH2DStreamType final : public StreamType { bool QueryInstructionStatusDone(const Stream& stream, const InstructionStatusBuffer& status_buffer) const override; void Compute(Instruction* instruction) const override; - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; bool OnSchedulerThread() const override { return true; } bool SupportingTransportInstructions() const override { return false; } }; diff --git a/oneflow/core/vm/cuda_stream_type.cpp b/oneflow/core/vm/cuda_stream_type.cpp index 671986aa5ae..0498e1680c3 100644 --- a/oneflow/core/vm/cuda_stream_type.cpp +++ b/oneflow/core/vm/cuda_stream_type.cpp @@ -55,27 +55,13 @@ void CudaStreamType::Compute(Instruction* instruction) const { OF_PROFILER_RANGE_PUSH("S:" + instruction->instr_msg().DebugName()); auto* stream = instruction->mut_stream(); cudaSetDevice(stream->device_id()); - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - OF_CUDA_CHECK(cudaGetLastError()); - } + instruction->instr_msg().instruction_type().Compute(instruction); + OF_CUDA_CHECK(cudaGetLastError()); char* data_ptr = instruction->mut_status_buffer()->mut_buffer()->mut_data(); CudaOptionalEventRecordStatusQuerier::MutCast(data_ptr)->SetLaunched(stream->device_ctx().get()); OF_PROFILER_RANGE_POP(); } -intrusive::shared_ptr CudaStreamType::MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const { - if (!resource.has_gpu_device_num()) { return intrusive::shared_ptr(); } - std::size_t device_num = resource.gpu_device_num(); - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(device_num); - ret->set_num_streams_per_thread(device_num); - return ret; -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/cuda_stream_type.h b/oneflow/core/vm/cuda_stream_type.h index 9dce5146827..cfaf855f486 100644 --- a/oneflow/core/vm/cuda_stream_type.h +++ b/oneflow/core/vm/cuda_stream_type.h @@ -32,8 +32,6 @@ class CudaStreamType final : public StreamType { CudaStreamType() = default; ~CudaStreamType() override = default; - const char* stream_tag() const override { return "cuda"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; void InitInstructionStatus(const Stream& stream, @@ -43,8 +41,6 @@ class CudaStreamType final : public StreamType { bool QueryInstructionStatusDone(const Stream& stream, const InstructionStatusBuffer& status_buffer) const override; void Compute(Instruction* instruction) const override; - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; bool OnSchedulerThread() const override { return true; } bool SupportingTransportInstructions() const override { return true; } }; diff --git a/oneflow/core/vm/async_cuda_stream_type.cpp b/oneflow/core/vm/event_recorded_cuda_stream_type.cpp similarity index 60% rename from oneflow/core/vm/async_cuda_stream_type.cpp rename to oneflow/core/vm/event_recorded_cuda_stream_type.cpp index 
e18bd824224..161cec36ef1 100644 --- a/oneflow/core/vm/async_cuda_stream_type.cpp +++ b/oneflow/core/vm/event_recorded_cuda_stream_type.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef WITH_CUDA -#include "oneflow/core/vm/async_cuda_stream_type.h" +#include "oneflow/core/vm/event_recorded_cuda_stream_type.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/stream.h" #include "oneflow/core/vm/cuda_stream_handle_device_context.h" @@ -25,13 +25,13 @@ limitations under the License. namespace oneflow { namespace vm { -void AsyncCudaStreamType::InitDeviceCtx(std::unique_ptr* device_ctx, - Stream* stream) const { +void EventRecordedCudaStreamType::InitDeviceCtx(std::unique_ptr* device_ctx, + Stream* stream) const { device_ctx->reset(new CudaStreamHandleDeviceCtx(stream->device_id())); } -void AsyncCudaStreamType::InitInstructionStatus(const Stream& stream, - InstructionStatusBuffer* status_buffer) const { +void EventRecordedCudaStreamType::InitInstructionStatus( + const Stream& stream, InstructionStatusBuffer* status_buffer) const { static_assert(sizeof(CudaOptionalEventRecordStatusQuerier) < kInstructionStatusBufferBytes, ""); auto* event_provider = dynamic_cast(stream.device_ctx().get()); auto* data_ptr = status_buffer->mut_buffer()->mut_data(); @@ -39,42 +39,28 @@ void AsyncCudaStreamType::InitInstructionStatus(const Stream& stream, CudaOptionalEventRecordStatusQuerier::PlacementNew(data_ptr, cuda_event); } -void AsyncCudaStreamType::DeleteInstructionStatus(const Stream& stream, - InstructionStatusBuffer* status_buffer) const { +void EventRecordedCudaStreamType::DeleteInstructionStatus( + const Stream& stream, InstructionStatusBuffer* status_buffer) const { auto* ptr = CudaOptionalEventRecordStatusQuerier::MutCast(status_buffer->mut_buffer()->mut_data()); ptr->~CudaOptionalEventRecordStatusQuerier(); } -bool AsyncCudaStreamType::QueryInstructionStatusDone( +bool EventRecordedCudaStreamType::QueryInstructionStatusDone( const Stream& stream, const InstructionStatusBuffer& status_buffer) const { return CudaOptionalEventRecordStatusQuerier::Cast(status_buffer.buffer().data())->done(); } -void AsyncCudaStreamType::Compute(Instruction* instruction) const { +void EventRecordedCudaStreamType::Compute(Instruction* instruction) const { OF_PROFILER_RANGE_GUARD("S:" + instruction->instr_msg().DebugName()); auto* stream = instruction->mut_stream(); cudaSetDevice(stream->device_id()); - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - OF_CUDA_CHECK(cudaGetLastError()); - } + instruction->instr_msg().instruction_type().Compute(instruction); + OF_CUDA_CHECK(cudaGetLastError()); char* data_ptr = instruction->mut_status_buffer()->mut_buffer()->mut_data(); CudaOptionalEventRecordStatusQuerier::MutCast(data_ptr)->SetLaunched(stream->device_ctx().get()); } -intrusive::shared_ptr AsyncCudaStreamType::MakeStreamDesc( - const Resource& resource, int64_t this_machine_id) const { - if (!resource.has_gpu_device_num()) { return intrusive::shared_ptr(); } - std::size_t device_num = resource.gpu_device_num(); - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(device_num); - ret->set_num_streams_per_thread(device_num); - return ret; -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/async_cuda_stream_type.h b/oneflow/core/vm/event_recorded_cuda_stream_type.h similarity index 75% rename from 
oneflow/core/vm/async_cuda_stream_type.h rename to oneflow/core/vm/event_recorded_cuda_stream_type.h index 52094e4b578..238f2c505ab 100644 --- a/oneflow/core/vm/async_cuda_stream_type.h +++ b/oneflow/core/vm/event_recorded_cuda_stream_type.h @@ -15,8 +15,8 @@ limitations under the License. */ #ifdef WITH_CUDA -#ifndef ONEFLOW_CORE_VM_ASYNC_CUDA_STREAM_TYPE_H_ -#define ONEFLOW_CORE_VM_ASYNC_CUDA_STREAM_TYPE_H_ +#ifndef ONEFLOW_CORE_VM_EVENT_RECORDED_CUDA_STREAM_TYPE_H_ +#define ONEFLOW_CORE_VM_EVENT_RECORDED_CUDA_STREAM_TYPE_H_ #include "oneflow/core/intrusive/flat_msg_view.h" #include "oneflow/core/vm/stream_type.h" @@ -27,12 +27,10 @@ limitations under the License. namespace oneflow { namespace vm { -class AsyncCudaStreamType final : public StreamType { +class EventRecordedCudaStreamType final : public StreamType { public: - AsyncCudaStreamType() = default; - ~AsyncCudaStreamType() override = default; - - const char* stream_tag() const override { return "async_launched_nccl"; } + EventRecordedCudaStreamType() = default; + ~EventRecordedCudaStreamType() override = default; void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; @@ -43,8 +41,6 @@ class AsyncCudaStreamType final : public StreamType { bool QueryInstructionStatusDone(const Stream& stream, const InstructionStatusBuffer& status_buffer) const override; void Compute(Instruction* instruction) const override; - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; bool OnSchedulerThread() const override { return true; } bool SupportingTransportInstructions() const override { return true; } }; @@ -52,5 +48,5 @@ class AsyncCudaStreamType final : public StreamType { } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_VM_ASYNC_CUDA_STREAM_TYPE_H_ +#endif // ONEFLOW_CORE_VM_EVENT_RECORDED_CUDA_STREAM_TYPE_H_ #endif // WITH_CUDA diff --git a/oneflow/core/vm/fuse_instruction_type.cpp b/oneflow/core/vm/fuse_instruction_type.h similarity index 58% rename from oneflow/core/vm/fuse_instruction_type.cpp rename to oneflow/core/vm/fuse_instruction_type.h index fe2d060b69b..25fd45bb127 100644 --- a/oneflow/core/vm/fuse_instruction_type.cpp +++ b/oneflow/core/vm/fuse_instruction_type.h @@ -13,28 +13,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifndef ONEFLOW_CORE_VM_FUSE_INSTRUCTION_TYPE_H_ +#define ONEFLOW_CORE_VM_FUSE_INSTRUCTION_TYPE_H_ + #include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/fuse_phy_instr_operand.h" -#include "oneflow/core/vm/cuda_stream_type.h" -#include "oneflow/core/vm/async_cuda_stream_type.h" -#include "oneflow/core/vm/cuda_copy_h2d_stream_type.h" -#include "oneflow/core/vm/cuda_copy_d2h_stream_type.h" -#include "oneflow/core/vm/cpu_stream_type.h" #include "oneflow/core/profiler/profiler.h" namespace oneflow { namespace vm { -template class FuseInstructionType : public vm::InstructionType { public: FuseInstructionType() = default; ~FuseInstructionType() override = default; - using stream_type = StreamT; - - std::string DebugOpTypeName(const InstructionMsg&) const override { return "Fuse"; } + std::string DebugName(const InstructionMsg&) const override { return "Fuse"; } void InitInstructionStatus(Instruction* instruction) const override { const auto& phy_instr_operand = instruction->instr_msg().phy_instr_operand(); @@ -42,7 +37,7 @@ class FuseInstructionType : public vm::InstructionType { auto* instr_msg_list = CHECK_NOTNULL(ptr)->mut_instr_msg_list(); auto* last_instr_msg = CHECK_NOTNULL(instr_msg_list->Last()); // init instruction status by last instruction_msg. - last_instr_msg->instr_type_id().instruction_type().InitInstructionStatusIf(instruction); + last_instr_msg->instruction_type().InitInstructionStatusIf(instruction); } void Compute(vm::Instruction* instruction) const override { @@ -51,23 +46,12 @@ class FuseInstructionType : public vm::InstructionType { auto* instr_msg_list = CHECK_NOTNULL(ptr)->mut_instr_msg_list(); INTRUSIVE_UNSAFE_FOR_EACH_PTR(instr_msg, instr_msg_list) { OF_PROFILER_RANGE_GUARD("F:" + instr_msg->DebugName()); - instr_msg->instr_type_id().instruction_type().ComputeInFuseMode(instr_msg); + instr_msg->instruction_type().ComputeInFuseMode(instr_msg); } } }; -COMMAND(vm::RegisterInstructionType>("cpu.Fuse")); -COMMAND(vm::RegisterInstructionType>("comm_net.Fuse")); - -#ifdef WITH_CUDA -COMMAND(vm::RegisterInstructionType>("cuda.Fuse")); -COMMAND(vm::RegisterInstructionType>("cuda_h2d.Fuse")); -COMMAND(vm::RegisterInstructionType>("cuda_d2h.Fuse")); -COMMAND( - vm::RegisterInstructionType>("sync_launched_nccl.Fuse")); -COMMAND(vm::RegisterInstructionType>( - "async_launched_nccl.Fuse")); -#endif - } // namespace vm } // namespace oneflow + +#endif // ONEFLOW_CORE_VM_FUSE_INSTRUCTION_TYPE_H_ diff --git a/oneflow/core/vm/fuse_phy_instr_operand.h b/oneflow/core/vm/fuse_phy_instr_operand.h index b9af5ae0004..258ab206f03 100644 --- a/oneflow/core/vm/fuse_phy_instr_operand.h +++ b/oneflow/core/vm/fuse_phy_instr_operand.h @@ -35,13 +35,10 @@ class FusePhyInstrOperand : public PhyInstrOperand { auto* last_instr_msg = instr_msg_list_.Last(); INTRUSIVE_UNSAFE_FOR_EACH_PTR(instr_msg, &instr_msg_list_) { if (instr_msg == last_instr_msg) { - CHECK(instr_msg->instr_type_id().instruction_type().fuse_type() - == kEnableInstructionFuseAsTailOnly - || instr_msg->instr_type_id().instruction_type().fuse_type() - == kEnableInstructionFuseAtAnyPosition); + CHECK(instr_msg->instruction_type().fuse_type() == kEnableInstructionFuseAsTailOnly + || instr_msg->instruction_type().fuse_type() == kEnableInstructionFuseAtAnyPosition); } else { - CHECK(instr_msg->instr_type_id().instruction_type().fuse_type() - == kEnableInstructionFuseAtAnyPosition); + CHECK(instr_msg->instruction_type().fuse_type() == kEnableInstructionFuseAtAnyPosition); } if (unlikely(stream_sequential_dependence_ == 
nullptr)) { stream_sequential_dependence_ = diff --git a/oneflow/core/vm/id_generator.cpp b/oneflow/core/vm/id_generator.cpp deleted file mode 100644 index 61232a5b082..00000000000 --- a/oneflow/core/vm/id_generator.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/control/global_process_ctx.h" -#include "oneflow/core/vm/id_generator.h" -#include "oneflow/core/vm/id_util.h" - -namespace oneflow { -namespace vm { - -Maybe LogicalIdGenerator::NewSymbolId() { - // NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to - // PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master. - return IdUtil::NewPhysicalSymbolId(GlobalProcessCtx::Rank()); -} - -Maybe LogicalIdGenerator::NewObjectId() { - // NOTE(chengcheng): in Multi-Client LogicalIdGenerator will degenerate directly to - // PhysicalIdGenerator, because each rank will generate id ONLY from itself, NOT the master. - return IdUtil::NewPhysicalObjectId(GlobalProcessCtx::Rank()); -} - -Maybe PhysicalIdGenerator::NewSymbolId() { - return IdUtil::NewPhysicalSymbolId(GlobalProcessCtx::Rank()); -} - -Maybe PhysicalIdGenerator::NewObjectId() { - return IdUtil::NewPhysicalObjectId(GlobalProcessCtx::Rank()); -} - -} // namespace vm -} // namespace oneflow diff --git a/oneflow/core/vm/id_generator.h b/oneflow/core/vm/id_generator.h deleted file mode 100644 index 58a03a3d898..00000000000 --- a/oneflow/core/vm/id_generator.h +++ /dev/null @@ -1,60 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_VM_ID_GENERATOR_H_ -#define ONEFLOW_CORE_VM_ID_GENERATOR_H_ - -#include "oneflow/core/common/maybe.h" - -namespace oneflow { -namespace vm { - -class IdGenerator { - public: - virtual ~IdGenerator() = default; - - virtual Maybe NewSymbolId() = 0; - virtual Maybe NewObjectId() = 0; - - protected: - IdGenerator() = default; -}; - -class LogicalIdGenerator : public IdGenerator { - public: - LogicalIdGenerator(const LogicalIdGenerator&) = delete; - LogicalIdGenerator(LogicalIdGenerator&&) = delete; - LogicalIdGenerator() = default; - ~LogicalIdGenerator() override = default; - - Maybe NewSymbolId() override; - Maybe NewObjectId() override; -}; - -class PhysicalIdGenerator : public IdGenerator { - public: - PhysicalIdGenerator(const PhysicalIdGenerator&) = delete; - PhysicalIdGenerator(PhysicalIdGenerator&&) = delete; - PhysicalIdGenerator() = default; - ~PhysicalIdGenerator() override = default; - - Maybe NewSymbolId() override; - Maybe NewObjectId() override; -}; - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_ID_GENERATOR_H_ diff --git a/oneflow/core/vm/id_util.cpp b/oneflow/core/vm/id_util.cpp deleted file mode 100644 index 5191f04514c..00000000000 --- a/oneflow/core/vm/id_util.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include -#include -#include "oneflow/core/vm/id_util.h" - -namespace oneflow { -namespace vm { - -namespace { - -static const int64_t kObjectIdMaximumValue = LLONG_MAX / 2; -static const int64_t kMachineNumberLimit = (1 << 12); -static const int64_t kErrorCodeLimit = 4096; - -static_assert(kMachineNumberLimit >= kErrorCodeLimit, ""); - -int64_t ObjectIdCounter() { - static int64_t counter = 0; - return (counter += kMachineNumberLimit); -} - -int64_t NewLogicalObjectIdFromCounter() { return ObjectIdCounter() + kMachineNumberLimit - 1; } - -int64_t NewPhysicalObjectIdFromCounter(int32_t machine_id) { - CHECK_LT(machine_id, kMachineNumberLimit - 1); - return ObjectIdCounter() + machine_id; -} - -} // namespace - -int64_t IdUtil::IsErrorId(int64_t id) { return id >= -kErrorCodeLimit && id <= kErrorCodeLimit; } - -int64_t IdUtil::NewLogicalValueObjectId() { - int64_t val = NewLogicalObjectIdFromCounter(); - CHECK_LT(val, kObjectIdMaximumValue); - return val; -} - -int64_t IdUtil::NewLogicalValueSymbolId() { - return NewLogicalObjectIdFromCounter() + kObjectIdMaximumValue; -} - -int64_t IdUtil::IsLogicalValueId(int64_t id) { - CHECK(IsValueId(id)); - return ((id + 1) % kObjectIdMaximumValue) == 0; -} - -int64_t IdUtil::NewPhysicalValueObjectId(int32_t machine_id) { - int64_t val = NewPhysicalObjectIdFromCounter(machine_id); - CHECK_LT(val, kObjectIdMaximumValue); - return val; -} - -int64_t IdUtil::NewPhysicalValueSymbolId(int32_t machine_id) { - return NewPhysicalObjectIdFromCounter(machine_id) + kObjectIdMaximumValue; -} - -bool IdUtil::IsObjectId(int64_t object_id) { return object_id < kObjectIdMaximumValue; } - -bool IdUtil::IsSymbolId(int64_t symbol_id) { return symbol_id > kObjectIdMaximumValue; } - -int64_t IdUtil::GetTypeId(int64_t id) { - if (IsTypeId(id)) { return id; } - return -id; -} - -bool IdUtil::IsTypeId(int64_t id) { return id < 0; } - -int64_t IdUtil::GetValueId(int64_t id) { - if (IsValueId(id)) { return id; } - return -id; -} - -bool IdUtil::IsValueId(int64_t id) { return id > 0; } - -} // namespace vm -} // namespace oneflow diff --git a/oneflow/core/vm/id_util.h b/oneflow/core/vm/id_util.h deleted file mode 100644 index ccd515ecde9..00000000000 --- a/oneflow/core/vm/id_util.h +++ /dev/null @@ -1,64 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_VM_LOGICAL_OBJECT_ID_H_ -#define ONEFLOW_CORE_VM_LOGICAL_OBJECT_ID_H_ - -#include -#include "oneflow/core/intrusive/flat_msg.h" - -namespace oneflow { -namespace vm { - -using ObjectId = int64_t; - -struct IdUtil final { - // usually [-4096, 4096] - static int64_t IsErrorId(int64_t id); - - static int64_t IsLogicalId(int64_t id) { return IsLogicalValueId(id); } - static int64_t NewLogicalObjectId() { return NewLogicalValueObjectId(); } - static int64_t NewLogicalSymbolId() { return NewLogicalValueSymbolId(); } - static int64_t NewPhysicalObjectId(int32_t machine_id) { - return NewPhysicalValueObjectId(machine_id); - } - static int64_t NewPhysicalSymbolId(int32_t machine_id) { - return NewPhysicalValueSymbolId(machine_id); - } - - static int64_t IsLogicalValueId(int64_t id); - static int64_t NewLogicalValueObjectId(); - static int64_t NewLogicalValueSymbolId(); - static int64_t NewPhysicalValueObjectId(int32_t machine_id); - static int64_t NewPhysicalValueSymbolId(int32_t machine_id); - - // type object id or value object id - static bool IsObjectId(int64_t object_id); - // type symbol id or value symbol id - static bool IsSymbolId(int64_t symbol_id); - - // type object id or type symbol id - static int64_t GetTypeId(int64_t id); - static bool IsTypeId(int64_t id); - - // value object id or value symbol id - static int64_t GetValueId(int64_t id); - static bool IsValueId(int64_t id); -}; - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_LOGICAL_OBJECT_ID_H_ diff --git a/oneflow/core/vm/instr_type_id.h b/oneflow/core/vm/instr_type_id.h deleted file mode 100644 index 4e41b4f8462..00000000000 --- a/oneflow/core/vm/instr_type_id.h +++ /dev/null @@ -1,81 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_VM_INSTRUCTION_ID_H_ -#define ONEFLOW_CORE_VM_INSTRUCTION_ID_H_ - -#include -#include "oneflow/core/intrusive/flat_msg.h" -#include "oneflow/core/common/layout_standardize.h" -#include "oneflow/core/vm/stream_desc.h" - -namespace oneflow { -namespace vm { - -class InstructionType; -class StreamType; - -class InstrTypeId final { - public: - InstrTypeId() { __Init__(); } - InstrTypeId(const InstrTypeId& rhs) { - __Init__(); - CopyFrom(rhs); - } - - ~InstrTypeId() = default; - - void __Init__() { clear(); } - void __Init__(const StreamType* stream_type, const InstructionType* instruction_type) { - __Init__(); - set_stream_type(stream_type); - instruction_type_ = instruction_type; - } - void clear() { - stream_type_ = nullptr; - instruction_type_ = nullptr; - } - void CopyFrom(const InstrTypeId& rhs) { - stream_type_ = &rhs.stream_type(); - instruction_type_ = &rhs.instruction_type(); - } - // Getters - const StreamType& stream_type() const { return *stream_type_; } - const InstructionType& instruction_type() const { return *instruction_type_; } - - // Setters - void set_stream_type(const StreamType* stream_type) { stream_type_ = stream_type; } - - bool operator==(const InstrTypeId& rhs) const { - return stream_type_ == rhs.stream_type_ && instruction_type_ == rhs.instruction_type_; - } - bool operator<(const InstrTypeId& rhs) const { - if (!(stream_type_ == rhs.stream_type_)) { return stream_type_ < rhs.stream_type_; } - if (!(instruction_type_ == rhs.instruction_type_)) { - return instruction_type_ < rhs.instruction_type_; - } - return false; - } - bool operator<=(const InstrTypeId& rhs) const { return *this < rhs || *this == rhs; } - - private: - const InstructionType* instruction_type_; - const StreamType* stream_type_; -}; - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_INSTRUCTION_ID_H_ diff --git a/oneflow/core/vm/instruction.cpp b/oneflow/core/vm/instruction.cpp index c4c7a93f6a0..300580f78a4 100644 --- a/oneflow/core/vm/instruction.cpp +++ b/oneflow/core/vm/instruction.cpp @@ -19,6 +19,7 @@ limitations under the License. #include "oneflow/core/vm/stream.h" #include "oneflow/core/vm/thread_ctx.h" #include "oneflow/core/vm/virtual_machine_engine.h" +#include "oneflow/core/framework/stream_get_stream_role_name.h" #include "oneflow/core/common/util.h" #include "oneflow/core/common/cpp_attribute.h" #include "oneflow/core/profiler/profiler.h" @@ -27,66 +28,26 @@ namespace oneflow { namespace vm { std::string InstructionMsg::DebugName() const { - std::string op_type_name = instr_type_id().instruction_type().DebugOpTypeName(*this); - return op_type_name + ":" + instr_type_name(); + std::string instr_name = instruction_type().DebugName(*this); + return instr_name + ":" + GetStreamRoleName::Visit(stream().stream_role()); } -void InstructionMsg::__Init__() { *mut_instr_type_name() = ""; } - -void InstructionMsg::__Init__(const std::string& instr_type_name) { - __Init__(); - mut_instr_type_id()->CopyFrom(LookupInstrTypeId(instr_type_name)); - *mut_instr_type_name() = instr_type_name; -} - -void InstructionMsg::__Init__(VirtualMachineEngine* vm, const std::string& instr_type_name, - const std::shared_ptr& phy_instr_parallel_desc, +void InstructionMsg::__Init__(Stream* stream, const InstructionType* instruction_type, const std::shared_ptr& phy_instr_operand) { - __Init__(); - // There are instructions without concept of ParallelDesc, like LaunchLazyJob, - // ComputeGlobalFrontSeqBarrier. 
If phy_instr_parallel_desc is empty, Instructions are run on the - // sole stream within the StreamRtDesc. - if (likely(phy_instr_parallel_desc)) { - int device_id = phy_instr_parallel_desc->parallel_id2device_id().at(0); - vm->GetCachedInstrTypeIdAndPhyInstrStream(instr_type_name, device_id, mut_instr_type_id(), - &phy_instr_stream_); - } else { - vm->GetInstrTypeIdAndSoleStream(instr_type_name, mut_instr_type_id(), &phy_instr_stream_); - } - *mut_instr_type_name() = instr_type_name; - phy_instr_parallel_desc_ = phy_instr_parallel_desc; + stream_ = stream; + instruction_type_ = instruction_type; phy_instr_operand_ = phy_instr_operand; } -void InstructionMsg::__Init__(const InstructionMsg& instr_msg) { - __Init__(); - mut_instr_type_id()->CopyFrom(instr_msg.instr_type_id()); - *mut_instr_type_name() = instr_msg.instr_type_name(); - const auto& parallel_desc = instr_msg.phy_instr_parallel_desc(); - if (parallel_desc) { phy_instr_parallel_desc_ = parallel_desc; } - phy_instr_operand_ = instr_msg.phy_instr_operand(); - if (instr_msg.phy_instr_stream() != nullptr) { phy_instr_stream_ = instr_msg.phy_instr_stream(); } -} - -intrusive::shared_ptr InstructionMsg::Clone() const { - return intrusive::make_shared(*this); -} - -void Instruction::Init(InstructionMsg* instr_msg, Stream* stream, - const std::shared_ptr& parallel_desc) { - __Init__(); - reset_instr_msg(instr_msg); - set_stream(stream); - instr_msg->instr_type_id().instruction_type().InitInstructionStatusIf(this); - *mut_parallel_desc() = parallel_desc; +void Instruction::Init(InstructionMsg* instr_msg) { + instr_msg_ = instr_msg; + instr_msg->instruction_type().InitInstructionStatusIf(this); } void Instruction::Delete() { OF_PROFILER_RANGE_GUARD("Instruction::Delete"); - instr_msg().instr_type_id().instruction_type().DeleteInstructionStatusIf(this); - OF_PROFILER_RANGE_PUSH("ClearInstrMsg"); + instr_msg().instruction_type().DeleteInstructionStatusIf(this); clear_instr_msg(); - OF_PROFILER_RANGE_POP(); mut_in_edges()->Clear(); mut_out_edges()->Clear(); } diff --git a/oneflow/core/vm/instruction.h b/oneflow/core/vm/instruction.h index 3b0034d97d7..0323fb36d97 100644 --- a/oneflow/core/vm/instruction.h +++ b/oneflow/core/vm/instruction.h @@ -18,48 +18,33 @@ limitations under the License. 
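
Illustrative sketch, not part of the patch: assembling an instruction message under the new __Init__(Stream*, InstructionType*, operand) signature introduced above. MakeNoOpBarrierMsg is a hypothetical helper name; the barrier operand and instruction type are borrowed from later hunks in this series, and the exact template arguments given to intrusive::make_shared and SingletonPtr are assumptions, since only the argument lists appear in the diff.

#include <memory>

#include "oneflow/core/vm/instruction.h"
#include "oneflow/core/vm/barrier_instruction_type.h"
#include "oneflow/core/vm/barrier_phy_instr_operand.h"
#include "oneflow/core/common/singleton_ptr.h"

namespace oneflow {

// Build an InstructionMsg bound to an already-created vm::Stream.
// The operand carries the callback payload; the (stream, instruction type) pair
// replaces the old instr_type_name / parallel_desc pair.
intrusive::shared_ptr<vm::InstructionMsg> MakeNoOpBarrierMsg(vm::Stream* vm_stream) {
  const auto operand = std::make_shared<vm::BarrierPhyInstrOperand>([] {});
  return intrusive::make_shared<vm::InstructionMsg>(
      vm_stream, SingletonPtr<vm::BarrierInstructionType>(), operand);
}

}  // namespace oneflow
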
#include #include -#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/common/symbol.h" #include "oneflow/core/intrusive/flat_msg.h" #include "oneflow/core/intrusive/intrusive.h" #include "oneflow/core/intrusive/object_pool.h" -#include "oneflow/core/vm/stream_desc.h" #include "oneflow/core/vm/vm_object.h" #include "oneflow/core/vm/stream_type.h" -#include "oneflow/core/vm/instr_type_id.h" -#include "oneflow/core/vm/id_util.h" -#include "oneflow/core/vm/instruction.pb.h" #include "oneflow/core/vm/phy_instr_operand.h" namespace oneflow { -namespace vm { -class VirtualMachineEngine; +class Stream; + +namespace vm { class InstructionMsg final : public intrusive::Base { public: - // Getters - const std::string& instr_type_name() const { return instr_type_name_; } - const InstrTypeId& instr_type_id() const { return instr_type_id_; } - const std::shared_ptr& phy_instr_parallel_desc() const { - return phy_instr_parallel_desc_; - } - const std::shared_ptr& phy_instr_operand() const { return phy_instr_operand_; } - Stream* phy_instr_stream() const { return phy_instr_stream_; } - // Setters - std::string* mut_instr_type_name() { return &instr_type_name_; } - InstrTypeId* mut_instr_type_id() { return &instr_type_id_; } - // methods - void __Init__(); - void __Init__(const std::string& instr_type_name); - void __Init__(VirtualMachineEngine* vm, const std::string& instr_type_name, - const std::shared_ptr& phy_instr_parallel_desc, + void __Init__(Stream* stream, const InstructionType* instruction_type, const std::shared_ptr& phy_instr_operand); - void __Init__(const InstructionMsg& instr_msg); - std::string DebugName() const; + // Getters + const Stream& stream() const { return *stream_; } + Stream* mut_stream() { return stream_; } + const InstructionType& instruction_type() const { return *instruction_type_; } + const std::shared_ptr& phy_instr_operand() const { return phy_instr_operand_; } - intrusive::shared_ptr Clone() const; + std::string DebugName() const; intrusive::Ref::RefCntType ref_cnt() const { return intrusive_ref_.ref_cnt(); } @@ -68,21 +53,12 @@ class InstructionMsg final : public intrusive::Base { intrusive::Ref* mut_intrusive_ref() { return &intrusive_ref_; } InstructionMsg() - : intrusive_ref_(), - instr_type_id_(), - instr_type_name_(), - phy_instr_parallel_desc_(), - phy_instr_operand_(), - phy_instr_stream_(), - instr_msg_hook_() {} + : intrusive_ref_(), stream_(), instruction_type_(), phy_instr_operand_(), instr_msg_hook_() {} intrusive::Ref intrusive_ref_; // fields - InstrTypeId instr_type_id_; - // instr_type_name is a necessary reduandant field for method ToProto - std::string instr_type_name_; - std::shared_ptr phy_instr_parallel_desc_; + Stream* stream_; + const InstructionType* instruction_type_; std::shared_ptr phy_instr_operand_; - Stream* phy_instr_stream_; public: // list hooks @@ -158,15 +134,8 @@ class Instruction final : public intrusive::Base { intrusive::List; // Getters - void __Init__() { clear_stream(); } - bool has_stream() const { return stream_ != nullptr; } - const Stream& stream() const { return *stream_; } - const InstructionMsg& instr_msg() const { - if (instr_msg_) { return instr_msg_.Get(); } - static const auto default_val = intrusive::make_shared(); - return default_val.Get(); - } - const std::shared_ptr& parallel_desc() const { return parallel_desc_; } + const Stream& stream() const { return instr_msg_->stream(); } + const InstructionMsg& instr_msg() const { return instr_msg_.Get(); } const InstructionStatusBuffer& status_buffer() 
const { return status_buffer_.Get(); } const intrusive::ListHook& instruction_hook() const { return instruction_hook_; } const intrusive::ListHook& dispatched_instruction_hook() const { @@ -180,21 +149,17 @@ class Instruction final : public intrusive::Base { const DependenceAccessList& access_list() const { return access_list_; } // Setters - void set_stream(Stream* val) { stream_ = val; } - void clear_stream() { stream_ = nullptr; } - Stream* mut_stream() { return stream_; } + Stream* mut_stream() { return instr_msg_->mut_stream(); } InstructionMsg* mut_instr_msg() { return CHECK_NOTNULL(instr_msg_.Mutable()); } void reset_instr_msg(InstructionMsg* instr_msg) { instr_msg_.Reset(instr_msg); } void clear_instr_msg() { instr_msg_.Reset(); } - std::shared_ptr* mut_parallel_desc() { return ¶llel_desc_; } InstructionStatusBuffer* mut_status_buffer() { return status_buffer_.Mutable(); } InEdgeList* mut_in_edges() { return &in_edges_; } OutEdgeList* mut_out_edges() { return &out_edges_; } DependenceAccessList* mut_access_list() { return &access_list_; } // methods - void Init(InstructionMsg* instr_msg, Stream* stream, - const std::shared_ptr& parallel_desc); + void Init(InstructionMsg* instr_msg); void Delete(); bool Done() const; const StreamType& stream_type() const; @@ -209,8 +174,6 @@ class Instruction final : public intrusive::Base { : intrusive_ref_(), status_buffer_(), instr_msg_(), - parallel_desc_(), - stream_(), access_list_(), in_edges_(), out_edges_(), @@ -223,8 +186,6 @@ class Instruction final : public intrusive::Base { // fields FlatMsg status_buffer_; intrusive::shared_ptr instr_msg_; - std::shared_ptr parallel_desc_; - Stream* stream_; // lists DependenceAccessList access_list_; InEdgeList in_edges_; diff --git a/oneflow/core/vm/instruction.proto b/oneflow/core/vm/instruction.proto deleted file mode 100644 index 8c3d9a26495..00000000000 --- a/oneflow/core/vm/instruction.proto +++ /dev/null @@ -1,49 +0,0 @@ -syntax = "proto2"; -package oneflow.vm; - -message CurrentGlobalDeviceIdProto {} -message SoleMirroredObjectProto {} -message AllMirroredObjectProto {} - -message OperandProto { - required int64 logical_object_id = 1; - oneof operand_type { - CurrentGlobalDeviceIdProto current_global_device_id = 2; - SoleMirroredObjectProto sole_mirrored_object = 3; - AllMirroredObjectProto all_mirrored_object = 4; - } -} - -message OperandSeparatorProto { } - -message InstructionOperandProto { - oneof type { - // read only object - OperandProto const_operand = 1; - // writeable object - OperandProto mut_operand = 2; - // mut2 writeable object - OperandProto mut2_operand = 3; - OperandProto del_operand = 4; - // read only symbol - OperandProto symbol_operand = 5; - // initializable symbol - OperandProto init_symbol_operand = 6; - - OperandSeparatorProto separator = 7; - double double_operand = 8; - int64 int64_operand = 9; - uint64 uint64_operand = 10; - bool bool_operand = 11; - } -} - -message InstructionProto { - required string instr_type_name = 1; - optional int64 parallel_desc_symbol_id = 2 [default = 0]; - repeated InstructionOperandProto operand = 3; -}; - -message InstructionListProto { - repeated InstructionProto instruction = 1; -} diff --git a/oneflow/core/vm/instruction_type.cpp b/oneflow/core/vm/instruction_type.cpp index d2bb48f4ad8..174459b1f34 100644 --- a/oneflow/core/vm/instruction_type.cpp +++ b/oneflow/core/vm/instruction_type.cpp @@ -13,7 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/vm/instr_type_id.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/common/util.h" @@ -21,15 +20,6 @@ limitations under the License. namespace oneflow { namespace vm { -namespace { - -HashMap* InstrTypeId4InstructionName() { - static HashMap map; - return ↦ -} - -} // namespace - void InstructionType::InitInstructionStatus(Instruction* instruction) const { instruction->stream_type().InitInstructionStatus(instruction->stream(), instruction->mut_status_buffer()); @@ -40,23 +30,5 @@ void InstructionType::DeleteInstructionStatus(Instruction* instruction) const { instruction->mut_status_buffer()); } -const InstrTypeId& LookupInstrTypeId(const std::string& name) { - const auto& map = *InstrTypeId4InstructionName(); - const auto& iter = map.find(name); - CHECK(iter != map.end()) << "instruction type name: " << name; - return iter->second; -} - -void ForEachInstrTypeId(std::function DoEach) { - for (const auto& pair : *InstrTypeId4InstructionName()) { DoEach(pair.second); } -} - -void RegisterInstrTypeId(const std::string& instruction_name, const StreamType* stream_type, - const InstructionType* instruction_type) { - InstrTypeId instr_type_id; - instr_type_id.__Init__(stream_type, instruction_type); - CHECK(InstrTypeId4InstructionName()->emplace(instruction_name, instr_type_id).second); -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/vm/instruction_type.h b/oneflow/core/vm/instruction_type.h index 005c57751e8..ac1f3244dee 100644 --- a/oneflow/core/vm/instruction_type.h +++ b/oneflow/core/vm/instruction_type.h @@ -36,8 +36,7 @@ class InstructionType { public: virtual ~InstructionType() = default; - bool IsSequential() const { return IsFrontSequential(); } - virtual bool IsFrontSequential() const { return false; } + virtual bool IsBarrier() const { return false; } virtual InstructionFuseType fuse_type() const { return kDisableInstructionFuse; } virtual void Compute(Instruction* instruction) const = 0; @@ -49,7 +48,7 @@ class InstructionType { DeleteInstructionStatus(instruction); } - virtual std::string DebugOpTypeName(const InstructionMsg&) const { return ""; } + virtual std::string DebugName(const InstructionMsg&) const = 0; protected: InstructionType() = default; @@ -59,28 +58,6 @@ class InstructionType { virtual void DeleteInstructionStatus(Instruction* instruction) const; }; -class InstrTypeId; -const InstrTypeId& LookupInstrTypeId(const std::string& instr_type_name); -void ForEachInstrTypeId(std::function DoEach); -void RegisterInstrTypeId(const std::string& instr_type_name, const StreamType* stream_type, - const InstructionType* instruction_type); - -template -const InstructionType* StaticGlobalInstructionType() { - static const InstructionType* instruction_type = new T(); - return instruction_type; -} - -template -void RegisterInstrTypeId(const std::string& instr_type_name, const StreamType* stream_type) { - RegisterInstrTypeId(instr_type_name, stream_type, StaticGlobalInstructionType()); -} - -template -void RegisterInstructionType(const std::string& instr_type_name) { - RegisterInstrTypeId(instr_type_name, StaticGlobalStreamType()); -} - } // namespace vm } // namespace oneflow diff --git a/oneflow/core/eager/lazy_job_device_context.h b/oneflow/core/vm/lazy_job_device_context.h similarity index 93% rename from oneflow/core/eager/lazy_job_device_context.h rename to 
oneflow/core/vm/lazy_job_device_context.h index d0e56590c5f..593c4f8d335 100644 --- a/oneflow/core/eager/lazy_job_device_context.h +++ b/oneflow/core/vm/lazy_job_device_context.h @@ -13,8 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_CORE_EAGER_LAZY_JOB_DEVICE_CONTEXT_H_ -#define ONEFLOW_CORE_EAGER_LAZY_JOB_DEVICE_CONTEXT_H_ +#ifndef ONEFLOW_CORE_VM_LAZY_JOB_DEVICE_CONTEXT_H_ +#define ONEFLOW_CORE_VM_LAZY_JOB_DEVICE_CONTEXT_H_ #include "oneflow/core/framework/nn_graph_if.h" #include "oneflow/core/common/util.h" @@ -93,4 +93,4 @@ class LazyJobDeviceCtx final : public DeviceCtx { } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_EAGER_LAZY_JOB_DEVICE_CONTEXT_H_ +#endif // ONEFLOW_CORE_VM_LAZY_JOB_DEVICE_CONTEXT_H_ diff --git a/oneflow/core/eager/lazy_job_stream_type.cpp b/oneflow/core/vm/lazy_job_stream_type.cpp similarity index 75% rename from oneflow/core/eager/lazy_job_stream_type.cpp rename to oneflow/core/vm/lazy_job_stream_type.cpp index b34a2f03924..2d5720dd83c 100644 --- a/oneflow/core/eager/lazy_job_stream_type.cpp +++ b/oneflow/core/vm/lazy_job_stream_type.cpp @@ -14,11 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/eager/lazy_job_stream_type.h" +#include "oneflow/core/vm/lazy_job_stream_type.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/thread_ctx.h" -#include "oneflow/core/eager/lazy_job_device_context.h" +#include "oneflow/core/vm/lazy_job_device_context.h" #include "oneflow/core/vm/naive_instruction_status_querier.h" #include "oneflow/core/common/util.h" @@ -48,19 +48,7 @@ bool LazyJobStreamType::QueryInstructionStatusDone( } void LazyJobStreamType::Compute(Instruction* instruction) const { - { - const auto& instr_type_id = instruction->mut_instr_msg()->instr_type_id(); - instr_type_id.instruction_type().Compute(instruction); - } -} - -intrusive::shared_ptr LazyJobStreamType::MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const { - auto ret = intrusive::make_shared(); - ret->set_stream_type(StaticGlobalStreamType()); - ret->set_num_streams_per_machine(1); - ret->set_num_streams_per_thread(1); - return ret; + instruction->instr_msg().instruction_type().Compute(instruction); } } // namespace vm diff --git a/oneflow/core/eager/lazy_job_stream_type.h b/oneflow/core/vm/lazy_job_stream_type.h similarity index 81% rename from oneflow/core/eager/lazy_job_stream_type.h rename to oneflow/core/vm/lazy_job_stream_type.h index 10cad9c2eaf..dd2196c7347 100644 --- a/oneflow/core/eager/lazy_job_stream_type.h +++ b/oneflow/core/vm/lazy_job_stream_type.h @@ -14,8 +14,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef ONEFLOW_CORE_EAGER_LAZY_JOB_STREAM_TYPE_H_ -#define ONEFLOW_CORE_EAGER_LAZY_JOB_STREAM_TYPE_H_ +#ifndef ONEFLOW_CORE_VM_LAZY_JOB_STREAM_TYPE_H_ +#define ONEFLOW_CORE_VM_LAZY_JOB_STREAM_TYPE_H_ #include "oneflow/core/intrusive/flat_msg_view.h" #include "oneflow/core/vm/stream_type.h" @@ -31,8 +31,6 @@ class LazyJobStreamType final : public StreamType { LazyJobStreamType() = default; virtual ~LazyJobStreamType() = default; - const char* stream_tag() const override { return "lazy_job"; } - void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const override; void InitInstructionStatus(const Stream& stream, @@ -44,11 +42,9 @@ class LazyJobStreamType final : public StreamType { void Compute(Instruction* instruction) const override; bool OnSchedulerThread() const override { return false; } bool SupportingTransportInstructions() const override { return false; } - intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const override; }; } // namespace vm } // namespace oneflow -#endif // ONEFLOW_CORE_EAGER_LAZY_JOB_STREAM_TYPE_H_ +#endif // ONEFLOW_CORE_VM_LAZY_JOB_STREAM_TYPE_H_ diff --git a/oneflow/core/vm/runtime_instr_type_id.h b/oneflow/core/vm/runtime_instr_type_id.h deleted file mode 100644 index d146b853893..00000000000 --- a/oneflow/core/vm/runtime_instr_type_id.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_VM_RUNTIME_INSTR_TYPE_ID_H_ -#define ONEFLOW_CORE_VM_RUNTIME_INSTR_TYPE_ID_H_ - -#include "oneflow/core/vm/instr_type_id.h" -#include "oneflow/core/vm/stream_runtime_desc.h" - -namespace oneflow { -namespace vm { - -class RtInstrTypeId final { - public: - RtInstrTypeId(const RtInstrTypeId&) = default; - RtInstrTypeId(RtInstrTypeId&&) = default; - ~RtInstrTypeId() = default; - - RtInstrTypeId(const InstrTypeId& instr_type_id, StreamRtDesc* stream_rt_desc) - : instr_type_id_(instr_type_id), stream_rt_desc_(stream_rt_desc) { - if (stream_rt_desc->stream_type().IsControlStreamType()) { - get_stream_ = &StreamRtDesc::GetSoleStream; - } else { - get_stream_ = &StreamRtDesc::GetDeviceStream; - } - } - - const InstrTypeId& instr_type_id() const { return instr_type_id_; } - Stream* GetStream(int device_id) const { return (stream_rt_desc_->*get_stream_)(device_id); } - - private: - const InstrTypeId instr_type_id_; - StreamRtDesc* stream_rt_desc_; - Stream* (StreamRtDesc::*get_stream_)(int device_id) const; -}; - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_RUNTIME_INSTR_TYPE_ID_H_ diff --git a/oneflow/core/vm/sequential_instruction_type.cpp b/oneflow/core/vm/sequential_instruction_type.cpp deleted file mode 100644 index dca5a7473e0..00000000000 --- a/oneflow/core/vm/sequential_instruction_type.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/util.h" -#include "oneflow/core/intrusive/flat_msg_view.h" -#include "oneflow/core/rpc/include/base.h" -#include "oneflow/core/vm/control_stream_type.h" -#include "oneflow/core/vm/instruction_type.h" -#include "oneflow/core/vm/instruction.h" -#include "oneflow/core/vm/virtual_machine_engine.h" -#include "oneflow/core/vm/barrier_phy_instr_operand.h" -#include "oneflow/core/control/global_process_ctx.h" - -namespace oneflow { -namespace vm { - -class RankFrontSeqCallbackInstructionType : public InstructionType { - public: - RankFrontSeqCallbackInstructionType() = default; - virtual ~RankFrontSeqCallbackInstructionType() override = default; - - bool IsFrontSequential() const override { return true; } - - protected: -}; - -class ComputeRankFrontSeqCallbackInstructionType final - : public RankFrontSeqCallbackInstructionType { - public: - ComputeRankFrontSeqCallbackInstructionType() = default; - ~ComputeRankFrontSeqCallbackInstructionType() override = default; - - using stream_type = ControlStreamType; - - void Compute(Instruction* instruction) const override { - const auto* operand = instruction->instr_msg().phy_instr_operand().get(); - const auto* barrier_operand = dynamic_cast(operand); - CHECK_NOTNULL(barrier_operand)->callback(); - } - void ComputeInFuseMode(InstructionMsg* instr_msg) const override { - const auto* operand = instr_msg->phy_instr_operand().get(); - const auto* barrier_operand = dynamic_cast(operand); - CHECK_NOTNULL(barrier_operand)->callback(); - } -}; -COMMAND(RegisterInstructionType( - "ComputeRankFrontSeqCallback")); - -class CtrlComputeRankFrontSeqCallbackInstructionType final - : public RankFrontSeqCallbackInstructionType { - public: - CtrlComputeRankFrontSeqCallbackInstructionType() = default; - ~CtrlComputeRankFrontSeqCallbackInstructionType() override = default; - - using stream_type = ControlStreamType; - - void Compute(Instruction* instruction) const override { - const auto* operand = instruction->instr_msg().phy_instr_operand().get(); - const auto* barrier_operand = dynamic_cast(operand); - CHECK_NOTNULL(barrier_operand)->callback(); - } -}; -COMMAND(RegisterInstructionType( - "CtrlComputeRankFrontSeqCallback")); - -class GlobalFrontSeqBarrierInstructionType : public InstructionType { - public: - GlobalFrontSeqBarrierInstructionType() = default; - virtual ~GlobalFrontSeqBarrierInstructionType() override = default; - - using stream_type = ControlStreamType; - - virtual bool IsFrontSequential() const override { return true; } -}; - -class ComputeGlobalFrontSeqBarrierInstructionType final - : public GlobalFrontSeqBarrierInstructionType { - public: - ComputeGlobalFrontSeqBarrierInstructionType() = default; - ~ComputeGlobalFrontSeqBarrierInstructionType() override = default; - - void Compute(Instruction* instruction) const override { - OF_ENV_BARRIER(); - const auto* operand = instruction->instr_msg().phy_instr_operand().get(); - const auto* barrier_operand = dynamic_cast(operand); - 
CHECK_NOTNULL(barrier_operand)->callback(); - } -}; -COMMAND(RegisterInstructionType( - "ComputeGlobalFrontSeqBarrier")); - -} // namespace vm -} // namespace oneflow diff --git a/oneflow/core/vm/stream.cpp b/oneflow/core/vm/stream.cpp index 50f3ea09262..d2c7d2f055c 100644 --- a/oneflow/core/vm/stream.cpp +++ b/oneflow/core/vm/stream.cpp @@ -17,40 +17,37 @@ limitations under the License. #include "oneflow/core/vm/thread_ctx.h" #include "oneflow/core/common/util.h" #include "oneflow/core/common/cpp_attribute.h" +#include "oneflow/core/framework/device.h" +#include "oneflow/core/vm/stream_get_stream_type.h" namespace oneflow { namespace vm { -void Stream::__Init__() { clear_thread_ctx(); } - -void Stream::__Init__(ThreadCtx* thread_ctx, const StreamId& stream_id, - const int64_t max_device_num_per_machine) { - __Init__(); +void Stream::__Init__( + ThreadCtx* thread_ctx, Symbol device, StreamRole stream_role, + const intrusive::shared_ptr& schedule_local_dep_object, + const Optional>& transport_local_dep_object) { set_thread_ctx(thread_ctx); - mut_stream_id()->CopyFrom(stream_id); - // InitDeviceCtx may use max_device_num_per_machine, - // so max_device_num_per_machine must be set before InitDeviceCtx - set_max_device_num_per_machine(max_device_num_per_machine); - stream_type().InitDeviceCtx(mut_device_ctx(), this); + device_ = device; + stream_role_ = stream_role; + stream_type_ = CHECK_JUST(GetStreamType::Visit(stream_role, device->enum_type())); + stream_type_->InitDeviceCtx(mut_device_ctx(), this); + schedule_local_dep_object_ = schedule_local_dep_object; + transport_local_dep_object_ = transport_local_dep_object; } -int64_t Stream::machine_id() const { return global_device_id() / max_device_num_per_machine(); } - -int64_t Stream::device_id() const { return global_device_id() % max_device_num_per_machine(); } +int64_t Stream::device_id() const { return device_->device_id(); } -const StreamType& Stream::stream_type() const { - return thread_ctx().stream_rt_desc().stream_type(); -} +const StreamType& Stream::stream_type() const { return *stream_type_; } -intrusive::shared_ptr Stream::NewInstruction( - InstructionMsg* instr_msg, const std::shared_ptr& parallel_desc) { +intrusive::shared_ptr Stream::NewInstruction(InstructionMsg* instr_msg) { intrusive::shared_ptr instruction; if (unlikely(free_instruction_list().empty())) { instruction = intrusive::make_shared(); } else { instruction = mut_free_instruction_list()->PopFront(); } - instruction->Init(instr_msg, this, parallel_desc); + instruction->Init(instr_msg); return instruction; } diff --git a/oneflow/core/vm/stream.h b/oneflow/core/vm/stream.h index 3e1936f5b2d..d668a7d9463 100644 --- a/oneflow/core/vm/stream.h +++ b/oneflow/core/vm/stream.h @@ -16,14 +16,21 @@ limitations under the License. 
#ifndef ONEFLOW_CORE_VM_STREAM_H_ #define ONEFLOW_CORE_VM_STREAM_H_ -#include "oneflow/core/vm/stream_desc.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/device/device_context.h" +#include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/optional.h" +#include "oneflow/core/common/stream_role.h" namespace oneflow { + +class Device; + namespace vm { class ThreadCtx; +class StreamType; +class MirroredObject; class Stream final : public intrusive::Base { public: @@ -32,7 +39,6 @@ class Stream final : public intrusive::Base { intrusive::List; // Getters - int64_t max_device_num_per_machine() const { return max_device_num_per_machine_; } const ThreadCtx& thread_ctx() const { return *thread_ctx_; } bool has_thread_ctx() const { return thread_ctx_ != nullptr; } const std::unique_ptr& device_ctx() const { return device_ctx_; } @@ -44,10 +50,8 @@ class Stream final : public intrusive::Base { const DispatchedInstructionList& running_instruction_list() const { return running_instruction_list_; } - const StreamId& stream_id() const { return stream_id_.key(); } // Setters - void set_max_device_num_per_machine(int64_t val) { max_device_num_per_machine_ = val; } ThreadCtx* mut_thread_ctx() { return thread_ctx_; } void set_thread_ctx(ThreadCtx* val) { thread_ctx_ = val; } void clear_thread_ctx() { thread_ctx_ = nullptr; } @@ -55,20 +59,26 @@ class Stream final : public intrusive::Base { DispatchedInstructionList* mut_free_instruction_list() { return &free_instruction_list_; } DispatchedInstructionList* mut_zombie_instruction_list() { return &zombie_instruction_list_; } DispatchedInstructionList* mut_running_instruction_list() { return &running_instruction_list_; } - StreamId* mut_stream_id() { return stream_id_.mut_key(); } // methods - void __Init__(); - void __Init__(ThreadCtx* thread_ctx, const StreamId& stream_id, - const int64_t max_device_num_per_machine); - intrusive::shared_ptr NewInstruction( - InstructionMsg* instr_msg, const std::shared_ptr& parallel_desc); + void __Init__(ThreadCtx* thread_ctx, Symbol device, StreamRole stream_role, + const intrusive::shared_ptr& schedule_local_dep_object, + const Optional>& transport_local_dep_object); + intrusive::shared_ptr NewInstruction(InstructionMsg* instr_msg); void DeleteInstruction(intrusive::shared_ptr&&); - int64_t global_device_id() const { return stream_id().global_device_id(); } - int64_t machine_id() const; int64_t device_id() const; + Symbol device() const { return device_; } + StreamRole stream_role() const { return stream_role_; } const StreamType& stream_type() const; + const intrusive::shared_ptr& schedule_local_dep_object() const { + return schedule_local_dep_object_; + } + + const Optional>& transport_local_dep_object() const { + return transport_local_dep_object_; + } + private: void MoveToFreeList(intrusive::shared_ptr&& instruction); void MoveFromZombieListToFreeList(); @@ -79,27 +89,31 @@ class Stream final : public intrusive::Base { Stream() : intrusive_ref_(), thread_ctx_(), + device_(), + stream_role_(StreamRole::kInvalid), + stream_type_(), device_ctx_(), - max_device_num_per_machine_(), free_instruction_list_(), zombie_instruction_list_(), running_instruction_list_(), - stream_id_(), active_stream_hook_(), thread_ctx_stream_hook_() {} intrusive::Ref intrusive_ref_; // fields ThreadCtx* thread_ctx_; + Symbol device_; + StreamRole stream_role_; + const StreamType* stream_type_; std::unique_ptr device_ctx_; - int64_t max_device_num_per_machine_; // lists DispatchedInstructionList 
free_instruction_list_; DispatchedInstructionList zombie_instruction_list_; DispatchedInstructionList running_instruction_list_; + intrusive::shared_ptr schedule_local_dep_object_; + Optional> transport_local_dep_object_; + public: - // skiplist hooks - intrusive::SkipListHook stream_id_; // list hooks intrusive::ListHook active_stream_hook_; intrusive::ListHook thread_ctx_stream_hook_; diff --git a/oneflow/core/vm/stream_desc.h b/oneflow/core/vm/stream_desc.h deleted file mode 100644 index a996bc0dd03..00000000000 --- a/oneflow/core/vm/stream_desc.h +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_VM_VPU_DESC__H_ -#define ONEFLOW_CORE_VM_VPU_DESC__H_ - -#include -#include -#include "oneflow/core/intrusive/flat_msg.h" -#include "oneflow/core/intrusive/intrusive.h" -#include "oneflow/core/vm/id_util.h" - -namespace oneflow { -namespace vm { - -class StreamType; - -class StreamId final { - public: - using self_type = StreamId; - void __Init__() {} - void __Init__(const StreamType* stream_type, int64_t global_device_id) { - stream_type_ = stream_type; - global_device_id_ = global_device_id; - } - - void CopyFrom(const StreamId& rhs) { __Init__(rhs.stream_type_, rhs.global_device_id_); } - - const StreamType& stream_type() const { return *stream_type_; } - int64_t global_device_id() const { return global_device_id_; } - - bool operator==(const StreamId& rhs) const { - return stream_type_ == rhs.stream_type_ && global_device_id_ == rhs.global_device_id_; - } - - bool operator<(const StreamId& rhs) const { - if (!(stream_type_ == rhs.stream_type_)) { return stream_type_ < rhs.stream_type_; } - return global_device_id_ < rhs.global_device_id_; - } - bool operator<=(const StreamId& rhs) const { return *this < rhs || *this == rhs; } - - private: - const StreamType* stream_type_; - int64_t global_device_id_; -}; - -class StreamDesc final : public intrusive::Base { - public: - // Getters - int32_t num_streams_per_machine() const { return num_streams_per_machine_; } - int32_t num_streams_per_thread() const { return num_streams_per_thread_; } - const StreamType& stream_type() const { return *stream_type_key_.key(); } - // Setters - void set_num_streams_per_machine(int32_t val) { num_streams_per_machine_ = val; } - void set_num_streams_per_thread(int32_t val) { num_streams_per_thread_ = val; } - void set_stream_type(const StreamType* stream_type) { *stream_type_key_.mut_key() = stream_type; } - - // methods - void __Init__() {} - void __Init__(const StreamType* stream_type, int32_t num_streams_per_machine, - int32_t num_streams_per_thread); - int32_t num_threads() const; - int32_t parallel_num() const { return num_streams_per_machine(); } - - private: - friend class intrusive::Ref; - intrusive::Ref* mut_intrusive_ref() { return &intrusive_ref_; } - - StreamDesc() - : intrusive_ref_(), - num_streams_per_machine_(), - num_streams_per_thread_(), - stream_type_key_() {} - intrusive::Ref intrusive_ref_; - // fields 
- int32_t num_streams_per_machine_; - int32_t num_streams_per_thread_; - - public: - // skiplist hooks - intrusive::SkipListHook stream_type_key_; -}; - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_VPU_DESC__H_ diff --git a/oneflow/core/vm/stream_get_stream_type.h b/oneflow/core/vm/stream_get_stream_type.h new file mode 100644 index 00000000000..2eb1d6ca879 --- /dev/null +++ b/oneflow/core/vm/stream_get_stream_type.h @@ -0,0 +1,108 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_VM_STREAM_GET_STREAM_TYPE_H_ +#define ONEFLOW_CORE_VM_STREAM_GET_STREAM_TYPE_H_ + +#include "oneflow/core/common/stream_role.h" +#include "oneflow/core/common/singleton_ptr.h" +#include "oneflow/core/vm/event_recorded_cuda_stream_type.h" +#include "oneflow/core/vm/control_stream_type.h" +#include "oneflow/core/vm/cpu_stream_type.h" +#include "oneflow/core/vm/critical_section_stream_type.h" +#include "oneflow/core/vm/cuda_copy_d2h_stream_type.h" +#include "oneflow/core/vm/cuda_copy_h2d_stream_type.h" +#include "oneflow/core/vm/cuda_stream_type.h" +#include "oneflow/core/vm/lazy_job_stream_type.h" +#include "oneflow/core/vm/stream_get_stream_type.h" + +namespace oneflow { + +struct GetStreamType final : public StreamRoleVisitor { + static Maybe VisitCompute(DeviceType device_type) { + if (device_type == DeviceType::kCPU) { + return SingletonPtr(); + } else if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } + static Maybe VisitHost2Device(DeviceType device_type) { + if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } + static Maybe VisitDevice2Host(DeviceType device_type) { + if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } + static Maybe VisitSyncedLaunchedCommNet(DeviceType device_type) { + if (device_type == DeviceType::kCPU) { + return SingletonPtr(); + } else if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } + static Maybe VisitAsyncedLaunchedCommNet(DeviceType device_type) { + if (device_type == DeviceType::kCPU) { + return SingletonPtr(); + } else if (device_type == DeviceType::kCUDA) { +#ifdef WITH_CUDA + return SingletonPtr(); +#else + UNIMPLEMENTED_THEN_RETURN(); +#endif + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + } + static Maybe VisitBarrier(DeviceType device_type) { + return SingletonPtr(); + } + static Maybe VisitCriticalSection(DeviceType device_type) { + return SingletonPtr(); + } + static Maybe VisitLazyJobLauncher(DeviceType device_type) { + return SingletonPtr(); + } +}; + +} // namespace oneflow + 
+#endif // ONEFLOW_CORE_VM_STREAM_GET_STREAM_TYPE_H_ diff --git a/oneflow/core/vm/stream_runtime_desc.h b/oneflow/core/vm/stream_runtime_desc.h deleted file mode 100644 index 6e7aa400c55..00000000000 --- a/oneflow/core/vm/stream_runtime_desc.h +++ /dev/null @@ -1,85 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_VM_STREAM_RUNTIME_DESC__H_ -#define ONEFLOW_CORE_VM_STREAM_RUNTIME_DESC__H_ - -#include "oneflow/core/vm/stream_desc.h" -#include "oneflow/core/vm/stream.h" - -namespace oneflow { -namespace vm { - -class StreamType; -class StreamDesc; - -// Rt is short for Runtime -class StreamRtDesc final : public intrusive::Base { - public: - // Getters - const StreamDesc& stream_desc() const { - if (stream_desc_) { return stream_desc_.Get(); } - static const auto default_val = intrusive::make_shared(); - return default_val.Get(); - } - const StreamType& stream_type() const { return *stream_type_key_.key(); } - const std::vector>& device_id2stream() const { - return device_id2stream_; - } - - // The value of `device_id` is ignored. - Stream* GetSoleStream(int device_id) const { return GetSoleStream(); } - Stream* GetSoleStream() const { - CHECK_EQ(device_id2stream().size(), 1); - return device_id2stream().at(0).get(); - } - - Stream* GetDeviceStream(int device_id) const { return device_id2stream().at(device_id).get(); } - - // Setters - StreamDesc* mut_stream_desc() { - if (!stream_desc_) { stream_desc_ = intrusive::make_shared(); } - return stream_desc_.Mutable(); - } - void reset_stream_desc(StreamDesc* stream_desc) { stream_desc_.Reset(stream_desc); } - void set_stream_type(const StreamType* stream_type) { *stream_type_key_.mut_key() = stream_type; } - void add_stream(intrusive::shared_ptr stream) { - CHECK_EQ(stream->device_id(), device_id2stream_.size()); - device_id2stream_.emplace_back(stream); - } - - // methods - void __Init__(StreamDesc* stream_desc); - - private: - friend class intrusive::Ref; - intrusive::Ref* mut_intrusive_ref() { return &intrusive_ref_; } - - StreamRtDesc() : intrusive_ref_(), stream_desc_(), device_id2stream_(), stream_type_key_() {} - intrusive::Ref intrusive_ref_; - // fields - intrusive::shared_ptr stream_desc_; - // containers - std::vector> device_id2stream_; - - public: - // skiplist hooks - intrusive::SkipListHook stream_type_key_; -}; - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_STREAM_RUNTIME_DESC__H_ diff --git a/oneflow/core/vm/stream_type.h b/oneflow/core/vm/stream_type.h index 8fee7b6054d..0a8868dddc4 100644 --- a/oneflow/core/vm/stream_type.h +++ b/oneflow/core/vm/stream_type.h @@ -19,8 +19,6 @@ limitations under the License. 
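
With stream_desc.h and stream_runtime_desc.h deleted, the stream type is now resolved purely from a (StreamRole, DeviceType) pair through the GetStreamType visitor added above. A small usage sketch follows; BarrierStreamTypeOnCpu is a hypothetical helper, and the Maybe<const vm::StreamType*> return type is an assumption inferred from the visitor branches returning SingletonPtr<...>().

#include "oneflow/core/vm/stream_get_stream_type.h"
#include "oneflow/core/common/stream_role.h"

namespace oneflow {

// Resolve the singleton StreamType that a barrier stream on CPU would use.
// Unsupported (role, device) combinations surface as errors via Maybe
// (UNIMPLEMENTED_THEN_RETURN in the visitor above) instead of a registry miss.
Maybe<const vm::StreamType*> BarrierStreamTypeOnCpu() {
  return GetStreamType::Visit(StreamRole::kBarrier, DeviceType::kCPU);
}

}  // namespace oneflow
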
#include #include #include -#include "oneflow/core/vm/stream_desc.h" -#include "oneflow/core/vm/instr_type_id.h" #include "oneflow/core/device/device_context.h" #include "oneflow/core/job/resource.pb.h" @@ -40,8 +38,6 @@ class StreamType { void Run(Instruction* instruction) const { Compute(instruction); } - virtual const char* stream_tag() const = 0; - virtual void InitDeviceCtx(std::unique_ptr* device_ctx, Stream* stream) const = 0; virtual void InitInstructionStatus(const Stream& stream, @@ -52,9 +48,6 @@ class StreamType { const InstructionStatusBuffer& status_buffer) const = 0; virtual void Compute(Instruction* instruction) const = 0; - virtual intrusive::shared_ptr MakeStreamDesc(const Resource& resource, - int64_t this_machine_id) const = 0; - virtual bool OnSchedulerThread() const = 0; virtual bool SupportingTransportInstructions() const = 0; virtual bool IsControlStreamType() const { return false; } diff --git a/oneflow/core/vm/thread_ctx.cpp b/oneflow/core/vm/thread_ctx.cpp index c347fa1d9ed..f91e52867b3 100644 --- a/oneflow/core/vm/thread_ctx.cpp +++ b/oneflow/core/vm/thread_ctx.cpp @@ -20,12 +20,12 @@ namespace oneflow { namespace vm { size_t ThreadCtx::TryReceiveAndRun() { - const StreamType& stream_type = stream_rt_desc().stream_type(); intrusive::List tmp_list; mut_pending_instruction_list()->MoveTo(&tmp_list); size_t size = tmp_list.size(); INTRUSIVE_FOR_EACH(instruction, &tmp_list) { tmp_list.Erase(instruction.Mutable()); + const StreamType& stream_type = instruction->stream().stream_type(); stream_type.Run(instruction.Mutable()); } return size; diff --git a/oneflow/core/vm/thread_ctx.h b/oneflow/core/vm/thread_ctx.h index 150b09f29fc..31d64d8aae8 100644 --- a/oneflow/core/vm/thread_ctx.h +++ b/oneflow/core/vm/thread_ctx.h @@ -21,41 +21,28 @@ limitations under the License. 
#include "oneflow/core/intrusive/mutexed_list.h" #include "oneflow/core/common/notifier.h" #include "oneflow/core/vm/stream.h" -#include "oneflow/core/vm/stream_runtime_desc.h" namespace oneflow { namespace vm { using PendingInstructionMutexedList = intrusive::MutexedList; -using PendingInstructionList = - intrusive::List; class ThreadCtx final : public intrusive::Base { public: - void __Init__() { clear_stream_rt_desc(); } - // types using StreamList = intrusive::List; // Getters - bool has_stream_rt_desc() const { return stream_rt_desc_ != nullptr; } - const StreamRtDesc& stream_rt_desc() const { return *stream_rt_desc_; } const StreamList& stream_list() const { return stream_list_; } // Setters - void set_stream_rt_desc(const StreamRtDesc* val) { stream_rt_desc_ = val; } - void clear_stream_rt_desc() { stream_rt_desc_ = nullptr; } StreamList* mut_stream_list() { return &stream_list_; } PendingInstructionMutexedList* mut_pending_instruction_list() { return &pending_instruction_list_; } // methods - void __Init__(const StreamRtDesc& stream_rt_desc) { - __Init__(); - set_stream_rt_desc(&stream_rt_desc); - } size_t TryReceiveAndRun(); Notifier* mut_notifier() { return ¬ifier_; } @@ -66,14 +53,12 @@ class ThreadCtx final : public intrusive::Base { ThreadCtx() : intrusive_ref_(), - stream_rt_desc_(), stream_list_(), pending_instruction_mutex_(), pending_instruction_list_(&pending_instruction_mutex_), + notifier_(), thread_ctx_hook_() {} intrusive::Ref intrusive_ref_; - // fields - const StreamRtDesc* stream_rt_desc_; // lists StreamList stream_list_; std::mutex pending_instruction_mutex_; diff --git a/oneflow/core/vm/virtual_machine.cpp b/oneflow/core/vm/virtual_machine.cpp index 6527f8c92b2..fb712e6f255 100644 --- a/oneflow/core/vm/virtual_machine.cpp +++ b/oneflow/core/vm/virtual_machine.cpp @@ -18,18 +18,27 @@ limitations under the License. 
#include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/vm/barrier_phy_instr_operand.h" +#include "oneflow/core/vm/barrier_instruction_type.h" +#include "oneflow/core/vm/barrier_phy_instr_operand.h" #include "oneflow/core/vm/vm_util.h" #include "oneflow/core/common/blocking_counter.h" #include "oneflow/core/common/cpp_attribute.h" +#include "oneflow/core/common/singleton_ptr.h" #include "oneflow/core/control/global_process_ctx.h" #include "oneflow/core/job/global_for.h" #include "oneflow/core/common/foreign_lock_helper.h" #include "oneflow/core/thread/thread_consistent_id.h" #include "oneflow/core/framework/transport_token.h" +#include "oneflow/core/framework/to_string.h" +#include "oneflow/core/framework/stream_on_independent_thread.h" +#include "oneflow/core/framework/stream_is_comm_net_stream.h" #include "oneflow/core/profiler/profiler.h" #include "oneflow/core/platform/include/pthread_fork.h" #include "oneflow/core/common/env_var/env_var.h" +#include "oneflow/core/common/container_util.h" #include "oneflow/core/framework/device.h" +#include "oneflow/core/framework/stream.h" +#include "oneflow/core/framework/stream_mgr.h" namespace oneflow { @@ -42,11 +51,9 @@ int MicrosecondsFrom(const T& start) { .count(); } -Maybe ForEachThreadCtx(vm::VirtualMachineEngine* vm, +Maybe ForEachThreadCtx(vm::VirtualMachineEngine* engine, const std::function(vm::ThreadCtx*)>& DoEach) { - INTRUSIVE_UNSAFE_FOR_EACH_PTR(thread_ctx, vm->mut_thread_ctx_list()) { - const auto& stream_type = thread_ctx->stream_rt_desc().stream_type(); - if (stream_type.OnSchedulerThread()) { continue; } + INTRUSIVE_UNSAFE_FOR_EACH_PTR(thread_ctx, engine->mut_thread_ctx_list()) { JUST(DoEach(thread_ctx)); } return Maybe::Ok(); @@ -59,45 +66,6 @@ void GetSchedulerThreadInitializer(std::function* Initializer) { }; } -std::type_index GetStreamTypeIndex(const vm::ThreadCtx* thread_ctx) { - const auto& stream_rt_desc = thread_ctx->stream_rt_desc(); - const auto& stream_type = stream_rt_desc.stream_type(); - return typeid(stream_type); -} - -// Threads with the same stream_type share a thread_consistent_id. -// e.g. -// Given there are 8 gpu thread in a single process. -// thread #0 is active in process #0, while others are not. -// thread #1 is active in process #1, while others are not. -// ... -// thread #7 is active in process #7, while others are not. -// to make them communicate with each other, we can allocate thread_consistent_id 1 to all those -// gpu threads in all processes. 
-void GetWorkerThreadInitializer(intrusive::shared_ptr vm, - std::function* Initializer) { - std::set stream_type_indexes; - INTRUSIVE_UNSAFE_FOR_EACH_PTR(thread_ctx, vm->mut_thread_ctx_list()) { - const auto& stream_type = thread_ctx->stream_rt_desc().stream_type(); - if (!stream_type.SupportingTransportInstructions()) { continue; } - stream_type_indexes.insert(GetStreamTypeIndex(thread_ctx)); - } - HashMap stream_type_index2consistent_id; - int64_t thread_consistent_id = kThreadConsistentIdScheduler + 1; - for (const auto& stream_type_index : stream_type_indexes) { - VLOG(3) << "transport stream type: " << stream_type_index.name(); - stream_type_index2consistent_id[stream_type_index] = thread_consistent_id++; - } - *Initializer = [stream_type_index2consistent_id](vm::ThreadCtx* thread_ctx) { - const auto& stream_type_index = GetStreamTypeIndex(thread_ctx); - const auto& iter = stream_type_index2consistent_id.find(stream_type_index); - if (iter != stream_type_index2consistent_id.end()) { - CHECK_JUST(InitThisThreadConsistentId(iter->second, stream_type_index.name())); - } - OF_PROFILER_NAME_THIS_HOST_THREAD("_VM::Worker"); - }; -} - void WorkerLoop(vm::ThreadCtx* thread_ctx, const std::function& Initializer) { Initializer(thread_ctx); while (thread_ctx->mut_notifier()->WaitAndClearNotifiedCnt() == kNotifierStatusSuccess) { @@ -107,36 +75,45 @@ void WorkerLoop(vm::ThreadCtx* thread_ctx, const std::function( - vm::MakeVmDesc(resource, this_machine_id).Get()); + engine_ = intrusive::make_shared(); OF_PROFILER_NAME_THIS_HOST_THREAD("_Main"); - std::function WorkerInitializer; - GetWorkerThreadInitializer(vm_, &WorkerInitializer); - CHECK_JUST(ForEachThreadCtx(vm_.Mutable(), [&](vm::ThreadCtx* thread_ctx) -> Maybe { - auto thread = std::make_unique(&WorkerLoop, thread_ctx, WorkerInitializer); - worker_threads_.push_back(std::move(thread)); - return Maybe::Ok(); - })); std::function SchedulerInitializer; GetSchedulerThreadInitializer(&SchedulerInitializer); schedule_thread_ = std::thread(&VirtualMachine::ScheduleLoop, this, SchedulerInitializer); + transport_local_dep_object_.Reset(); } namespace { -void MakeCtrlSeqInstructions(vm::VirtualMachineEngine* vm, vm::InstructionMsgList* list, - const std::function& ComputeCallback) { - const auto& phy_instr_operand = std::make_shared(ComputeCallback); - auto instruction = intrusive::make_shared( - vm, "CtrlComputeRankFrontSeqCallback", std::shared_ptr(), - phy_instr_operand); - list->EmplaceBack(std::move(instruction)); +Maybe> GetBarrierStream() { + auto device = JUST(Device::New("cpu")); + return Stream::New(device, StreamRole::kBarrier); +} + +void MakeBarrierInstructions(vm::InstructionMsgList* list, + const std::function& BarrierCallback) { + auto* vm = Global::Get(); + { + const auto& phy_instr_operand = std::make_shared([]() {}); + auto stream = CHECK_JUST(GetBarrierStream()); + auto instruction = intrusive::make_shared( + CHECK_JUST(vm->GetVmStream(stream)), SingletonPtr(), + phy_instr_operand); + list->EmplaceBack(std::move(instruction)); + } + { + const auto& phy_instr_operand = std::make_shared(BarrierCallback); + auto stream = CHECK_JUST(GetBarrierStream()); + auto instruction = intrusive::make_shared( + CHECK_JUST(vm->GetVmStream(stream)), SingletonPtr(), + phy_instr_operand); + list->EmplaceBack(std::move(instruction)); + } } } // namespace @@ -144,30 +121,30 @@ void MakeCtrlSeqInstructions(vm::VirtualMachineEngine* vm, vm::InstructionMsgLis void VirtualMachine::ControlSync() { auto bc = std::make_shared(1); vm::InstructionMsgList list; 
- MakeCtrlSeqInstructions(mut_vm(), &list, [bc] { bc->Decrease(); }); + MakeBarrierInstructions(&list, [bc] { bc->Decrease(); }); CHECK_JUST(Receive(&list)); CHECK_JUST(bc->WaitUntilCntEqualZero(VirtualMachine::GetPredicatorNoMoreInstructionsFinished())); } Maybe VirtualMachine::CloseVMThreads() { - CHECK_OR_RETURN(!vm_threads_closed_); + CHECK_OR_RETURN(!disable_vm_threads_) << "vm threads closed"; ControlSync(); pending_notifier_.Close(); schedule_thread_.join(); - vm_threads_closed_ = true; + disable_vm_threads_ = true; return Maybe::Ok(); } VirtualMachine::~VirtualMachine() { - if (!vm_threads_closed_) { CHECK_JUST(CloseVMThreads()); } - CHECK(vm_->SchedulerEmpty()); - vm_.Reset(); + if (!disable_vm_threads_) { CHECK_JUST(CloseVMThreads()); } + CHECK(engine_->SchedulerEmpty()); + engine_.Reset(); } std::function()> VirtualMachine::GetPredicatorNoMoreInstructionsFinished() { auto last_total_erased = std::make_shared(0); auto* vm = Global::Get(); - if (vm != nullptr) { *last_total_erased = vm->vm().total_erased_instruction_cnt(); } + if (vm != nullptr) { *last_total_erased = vm->engine_->total_erased_instruction_cnt(); } return [last_total_erased]() -> Maybe { auto* vm = Global::Get(); CHECK_NOTNULL_OR_RETURN(vm) << "virtual machine not initialized."; @@ -179,7 +156,7 @@ std::function()> VirtualMachine::GetPredicatorNoMoreInstructionsFini } bool VirtualMachine::NoMoreErasedInstructions(size_t* last_total_erased_instruction_cnt) const { - size_t cnt = vm_->total_erased_instruction_cnt(); + size_t cnt = engine_->total_erased_instruction_cnt(); bool no_more_erased = (*last_total_erased_instruction_cnt == cnt); *last_total_erased_instruction_cnt = cnt; return no_more_erased; @@ -187,29 +164,29 @@ bool VirtualMachine::NoMoreErasedInstructions(size_t* last_total_erased_instruct std::string VirtualMachine::GetBlockingDebugString() { size_t limit = EnvInteger(); - return vm_->GetLivelyInstructionListDebugString(limit); + return engine_->GetLivelyInstructionListDebugString(limit); } Maybe VirtualMachine::Receive(vm::InstructionMsgList* instr_list) { if (unlikely(pthread_fork::IsForkedSubProcess())) { INTRUSIVE_FOR_EACH_PTR(instr_msg, instr_list) { - const auto& parallel_desc = instr_msg->phy_instr_parallel_desc(); - CHECK_OR_RETURN(!parallel_desc || parallel_desc->device_type() == DeviceType::kCPU) + const auto& device = instr_msg->stream().device(); + CHECK_OR_RETURN(device->enum_type() == DeviceType::kCPU) << pthread_fork::kOfCudaNotSupportInForkedSubProcess; - // NOTE: operate `vm_` in forked subprocesses causes mysterious problems. + // NOTE: operate `engine_` in forked subprocesses causes mysterious problems. // `ComputeInFuseMode` will be replaced by `Compute` soon. 
- instr_msg->mut_instr_type_id()->instruction_type().ComputeInFuseMode(instr_msg); + instr_msg->instruction_type().ComputeInFuseMode(instr_msg); } - } else if (unlikely(vm_threads_closed_)) { + } else if (unlikely(disable_vm_threads_)) { JUST(RunInCurrentThread(instr_list)); } else { const int64_t kHighWaterMark = GetInstructionHighWaterMark(); - if (vm_->flying_instruction_cnt() > kHighWaterMark) { + if (engine_->flying_instruction_cnt() > kHighWaterMark) { JUST(Global::Get()->WithScopedRelease([&, this]() -> Maybe { auto bc = std::make_shared(1); - vm_->InsertProbe([bc](vm::VirtualMachineEngine* vm) { + engine_->InsertProbe([bc](vm::VirtualMachineEngine* engine) { const int64_t kLowWaterMark = GetInstructionLowWaterMark(); - if (vm->flying_instruction_cnt() > kLowWaterMark) { return false; } + if (engine->flying_instruction_cnt() > kLowWaterMark) { return false; } bc->Decrease(); return true; }); @@ -218,7 +195,7 @@ Maybe VirtualMachine::Receive(vm::InstructionMsgList* instr_list) { return Maybe::Ok(); })); } - if (JUST(vm_->Receive(instr_list))) { + if (JUST(engine_->Receive(instr_list))) { // old pending_instruction_list is empty. pending_notifier_.Notify(); } @@ -238,16 +215,26 @@ class SingleThreadScheduleCtx : public vm::ScheduleCtx { } }; -void ScheduleUntilVMEmpty(vm::VirtualMachineEngine* vm, const vm::ScheduleCtx& schedule_ctx) { - do { vm->Schedule(schedule_ctx); } while (!(vm->SchedulerEmpty())); +void ScheduleUntilVMEmpty(vm::VirtualMachineEngine* engine, const vm::ScheduleCtx& schedule_ctx) { + do { engine->Schedule(schedule_ctx); } while (!(engine->SchedulerEmpty())); } } // namespace +Maybe VirtualMachine::NotifyOrRunScheduler() { + if (unlikely(pthread_fork::IsForkedSubProcess() || disable_vm_threads_)) { + ScheduleUntilVMEmpty(engine_.Mutable(), SingleThreadScheduleCtx()); + } else { + pending_notifier_.Notify(); + } + return Maybe::Ok(); +} + Maybe VirtualMachine::RunInCurrentThread(vm::InstructionMsgList* instr_list) { - CHECK_OR_RETURN(vm_->SchedulerEmpty()) << "vm scheduler not empty. May be a fatal error occured"; - JUST(vm_->Receive(instr_list)); - ScheduleUntilVMEmpty(vm_.Mutable(), SingleThreadScheduleCtx()); + CHECK_OR_RETURN(engine_->SchedulerEmpty()) + << "vm scheduler not empty. May be a fatal error occured"; + JUST(engine_->Receive(instr_list)); + ScheduleUntilVMEmpty(engine_.Mutable(), SingleThreadScheduleCtx()); return Maybe::Ok(); } @@ -268,17 +255,16 @@ class MultiThreadScheduleCtx : public vm::ScheduleCtx { void VirtualMachine::ScheduleLoop(const std::function& Initializer) { Initializer(); MultiThreadScheduleCtx schedule_ctx{}; - auto* vm = mut_vm(); while (pending_notifier_.WaitAndClearNotifiedCnt() == kNotifierStatusSuccess) { OF_PROFILER_RANGE_GUARD("VirtualMachine::ScheduleLoop"); auto start = std::chrono::steady_clock::now(); static constexpr int kWorkingMicroseconds = 1000; - // Every time this thread wakes up, vm is scheduled for about `kWorkingMicroseconds`. + // Every time this thread wakes up, engine_ is scheduled for about `kWorkingMicroseconds`. // The cost of os thread switching is about 5-10 microseconds. Doing more scheduling in // a single waiting up can reach higher performance. do { static constexpr int kNumSchedulingPerTimoutTest = 10000; - // Every time kWorkingMicroseconds timeout tested, vm is scheduled for about + // Every time kWorkingMicroseconds timeout tested, engine_ is scheduled for about // kNumSchedulingPerTimoutTest. // The cost of `MicrosecondsFrom(start)` is about 400ns, while the empty scheduling costs // about 10ns. 
@@ -287,24 +273,146 @@ void VirtualMachine::ScheduleLoop(const std::function& Initializer) { // Use SchedulerThreadUnsafeEmpty to avoid acquiring mutex lock. // It's safe to use SchedulerThreadUnsafeEmpty here. pending_notifier_.notified_cnt_ will be // greater than zero when inconsistency between - // vm->pending_msg_list.list_head_.list_head_.container_ and - // vm->pending_msg_list.list_head_.list_head_.size_ occured. hence the pending + // engine_->pending_msg_list.list_head_.list_head_.container_ and + // engine_->pending_msg_list.list_head_.list_head_.size_ occured. hence the pending // instructions // will get handled in the next iteration. // VirtualMachine::Receive may be less effiencient if the thread safe version - // `vm->SchedulerEmpty()` + // `engine_->SchedulerEmpty()` // used // here, because VirtualMachine::ScheduleLoop is more likely to get the mutex lock. - do { vm->Schedule(schedule_ctx); } while (!vm->SchedulerThreadUnsafeEmpty()); + do { engine_->Schedule(schedule_ctx); } while (!engine_->SchedulerThreadUnsafeEmpty()); } while (++i < kNumSchedulingPerTimoutTest); } while (MicrosecondsFrom(start) < kWorkingMicroseconds); } - ScheduleUntilVMEmpty(vm, schedule_ctx); - CHECK_JUST(ForEachThreadCtx(vm_.Mutable(), [&](vm::ThreadCtx* thread_ctx) -> Maybe { + ScheduleUntilVMEmpty(engine_.Mutable(), schedule_ctx); + CHECK_JUST(ForEachThreadCtx(engine_.Mutable(), [&](vm::ThreadCtx* thread_ctx) -> Maybe { thread_ctx->mut_notifier()->Close(); return Maybe::Ok(); })); - for (const auto& worker_thread : worker_threads_) { worker_thread->join(); } + { + std::unique_lock lock(worker_threads_mutex_); + for (const auto& worker_thread : worker_threads_) { worker_thread->join(); } + } + scheduler_stopped_ = true; +} + +intrusive::shared_ptr VirtualMachine::FindOrCreateScheduleLocalDepObject( + Symbol device, StreamRole stream_role) { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + auto key = std::make_pair(device, stream_role); + intrusive::shared_ptr* ptr = &device_stream_role2local_dep_object_[key]; + if (!*ptr) { *ptr = intrusive::make_shared(); } + return *ptr; +} + +intrusive::shared_ptr VirtualMachine::FindOrCreateTransportLocalDepObject() { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + if (!transport_local_dep_object_) { + transport_local_dep_object_ = intrusive::make_shared(); + } + return transport_local_dep_object_; +} + +Maybe VirtualMachine::CreateStream(Symbol device, StreamRole stream_role) { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + vm::ThreadCtx* thread_ctx = JUST(FindOrCreateThreadCtx(device, stream_role)); + return JUST(CreateStream(thread_ctx, device, stream_role)); +} + +Maybe VirtualMachine::GetVmStream(Symbol stream) { + if (stream->unique_stream_id() >= unique_stream_id2vm_stream_.size()) { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + if (stream->unique_stream_id() >= unique_stream_id2vm_stream_.size()) { + auto* stream_mgr = JUST(GlobalMaybe()); + for (int i = unique_stream_id2vm_stream_.size(); i <= stream->unique_stream_id(); ++i) { + Symbol cur_stream = JUST(stream_mgr->GetStreamSymbol(i)); + CHECK_EQ_OR_RETURN(cur_stream->unique_stream_id(), i) + << "invalid Stream::unique_stream_id()"; + *unique_stream_id2vm_stream_.MutableOrAdd(cur_stream->unique_stream_id()) = + JUST(CreateStream(cur_stream->device(), cur_stream->stream_role())); + } + } + } + return JUST(VectorAt(unique_stream_id2vm_stream_, stream->unique_stream_id())); +} + +Maybe 
VirtualMachine::FindOrCreateThreadCtx(Symbol device, + StreamRole stream_role) { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + vm::ThreadCtx** thread_ctx_ptr = nullptr; + if (StreamOnIndependentThread::Visit(stream_role)) { + auto key = std::make_pair(device->enum_type(), stream_role); + thread_ctx_ptr = &devcie_type_stream_role_2independent_thread_ctx_[key]; + } else { + thread_ctx_ptr = &devcie_type2non_independent_thread_ctx_[device->enum_type()]; + } + if (*thread_ctx_ptr == nullptr) { *thread_ctx_ptr = JUST(CreateThreadCtx(device, stream_role)); } + return *thread_ctx_ptr; +} + +Maybe VirtualMachine::CreateThreadCtx(Symbol device, + StreamRole stream_role) { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + // thread_ctx_ptr may be used after timout. + auto thread_ctx_ptr = std::make_shared(nullptr); + { + auto bc = std::make_shared(1); + engine_->InsertProbe([thread_ctx_ptr, bc](vm::VirtualMachineEngine* engine) { + auto thread_ctx = intrusive::make_shared(); + engine->mut_thread_ctx_list()->PushBack(thread_ctx.Mutable()); + *thread_ctx_ptr = thread_ctx.Mutable(); + bc->Decrease(); + return true; + }); + JUST(NotifyOrRunScheduler()); + JUST(bc->WaitUntilCntEqualZero(VirtualMachine::GetPredicatorNoMoreInstructionsFinished())); + } + auto* thread_ctx = *thread_ctx_ptr; + { + const auto& WorkerInitializer = [device, stream_role](vm::ThreadCtx* thread_ctx) { + int device_type_value = static_cast(device->enum_type()); + CHECK_GT(device_type_value, 0); + std::string device_tag = *CHECK_JUST(DeviceTag4DeviceType(device->enum_type())); + if (!StreamOnIndependentThread::Visit(stream_role)) { + CHECK_JUST(InitThisThreadConsistentId(device_type_value + kThreadConsistentIdScheduler, + device_tag)); + } + OF_PROFILER_NAME_THIS_HOST_THREAD("_VM::Worker_" + device_tag); + }; + auto thread = std::make_unique(&WorkerLoop, thread_ctx, WorkerInitializer); + { + std::unique_lock lock(worker_threads_mutex_); + worker_threads_.push_back(std::move(thread)); + } + } + return thread_ctx; +} + +Maybe VirtualMachine::CreateStream(vm::ThreadCtx* thread_ctx, Symbol device, + StreamRole stream_role) { + std::unique_lock lock(creating_stream_and_thread_ctx_mutex_); + // stream_ptr may be used after timout. + auto stream_ptr = std::make_shared(nullptr); + auto bc = std::make_shared(1); + intrusive::shared_ptr schedule_local_dep_object = + FindOrCreateScheduleLocalDepObject(device, stream_role); + Optional> transport_local_dep_object; + if (IsCommNetStream::Visit(stream_role)) { + transport_local_dep_object = FindOrCreateTransportLocalDepObject(); + } + engine_->InsertProbe([stream_ptr, thread_ctx, device, stream_role, bc, schedule_local_dep_object, + transport_local_dep_object](vm::VirtualMachineEngine* engine) { + auto stream = intrusive::make_shared( + thread_ctx, device, stream_role, schedule_local_dep_object, transport_local_dep_object); + thread_ctx->mut_stream_list()->PushBack(stream.Mutable()); + *stream_ptr = stream.Mutable(); + bc->Decrease(); + return true; + }); + JUST(NotifyOrRunScheduler()); + JUST(bc->WaitUntilCntEqualZero(VirtualMachine::GetPredicatorNoMoreInstructionsFinished())); + return *stream_ptr; } } // namespace oneflow diff --git a/oneflow/core/vm/virtual_machine.h b/oneflow/core/vm/virtual_machine.h index 29e17f0aa3e..2f06401b2d2 100644 --- a/oneflow/core/vm/virtual_machine.h +++ b/oneflow/core/vm/virtual_machine.h @@ -16,47 +16,79 @@ limitations under the License. 
#ifndef ONEFLOW_CORE_VM_VIRTUAL_MACHINE_H_ #define ONEFLOW_CORE_VM_VIRTUAL_MACHINE_H_ +#include #include "oneflow/core/common/notifier.h" -#include "oneflow/core/vm/vm_desc.h" #include "oneflow/core/vm/virtual_machine_engine.h" #include "oneflow/core/thread/thread_pool.h" +#include "oneflow/core/common/stream_role.h" +#include "oneflow/core/common/steady_vector.h" namespace oneflow { class InstructionsBuilder; +class Device; class VirtualMachine final { public: VirtualMachine(const VirtualMachine&) = delete; VirtualMachine(VirtualMachine&&) = delete; - VirtualMachine(const Resource& resource, int64_t this_machine_id); + VirtualMachine(); ~VirtualMachine(); static std::function()> GetPredicatorNoMoreInstructionsFinished(); - bool NoMoreErasedInstructions(size_t* last_total_erased_instruction_cnt) const; + intrusive::shared_ptr FindOrCreateTransportLocalDepObject(); + std::string GetBlockingDebugString(); Maybe Receive(vm::InstructionMsgList* instr_list); - const vm::VirtualMachineEngine& vm() const { return *vm_; } - Maybe CloseVMThreads(); + Maybe GetVmStream(Symbol stream); + private: friend class InstructionsBuilder; void ScheduleLoop(const std::function& Initializer); - vm::VirtualMachineEngine* mut_vm() { return vm_.Mutable(); } + intrusive::shared_ptr FindOrCreateScheduleLocalDepObject( + Symbol device, StreamRole stream_role); + bool NoMoreErasedInstructions(size_t* last_total_erased_instruction_cnt) const; + + const vm::VirtualMachineEngine& engine() const { return *engine_; } + vm::VirtualMachineEngine* mut_engine() { return engine_.Mutable(); } + void ControlSync(); + Maybe FindOrCreateThreadCtx(Symbol device, StreamRole stream_role); + Maybe CreateThreadCtx(Symbol device, StreamRole stream_role); + Maybe CreateStream(Symbol device, StreamRole stream_role); + + Maybe CreateStream(vm::ThreadCtx* thread_ctx, Symbol device, + StreamRole stream_role); Maybe RunInCurrentThread(vm::InstructionMsgList* instr_list); - bool vm_threads_closed_; - intrusive::shared_ptr vm_; + Maybe NotifyOrRunScheduler(); + + bool disable_vm_threads_; + bool scheduler_stopped_; + intrusive::shared_ptr engine_; + // for asynchronized execution + std::mutex worker_threads_mutex_; std::list> worker_threads_; + + // for creating vm::Stream and vm::ThreadCtx + std::recursive_mutex creating_stream_and_thread_ctx_mutex_; + HashMap devcie_type2non_independent_thread_ctx_; + HashMap, vm::ThreadCtx*> + devcie_type_stream_role_2independent_thread_ctx_; + HashMap, StreamRole>, intrusive::shared_ptr> + device_stream_role2local_dep_object_; + intrusive::shared_ptr transport_local_dep_object_; + SteadyVector unique_stream_id2vm_stream_; + std::thread schedule_thread_; Notifier pending_notifier_; }; diff --git a/oneflow/core/vm/virtual_machine_engine.cpp b/oneflow/core/vm/virtual_machine_engine.cpp index 05052ce654a..5d2a4b157df 100644 --- a/oneflow/core/vm/virtual_machine_engine.cpp +++ b/oneflow/core/vm/virtual_machine_engine.cpp @@ -14,21 +14,20 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneflow/core/vm/virtual_machine_engine.h" -#include "oneflow/core/vm/vm_desc.h" #include "oneflow/core/vm/instruction_type.h" +#include "oneflow/core/vm/fuse_instruction_type.h" #include "oneflow/core/vm/fuse_phy_instr_operand.h" #include "oneflow/core/vm/barrier_phy_instr_operand.h" #include "oneflow/core/common/util.h" #include "oneflow/core/common/balanced_splitter.h" #include "oneflow/core/common/cpp_attribute.h" #include "oneflow/core/framework/device.h" -#include "oneflow/core/job/parallel_desc.h" #include "oneflow/core/platform/include/pthread_fork.h" #include "oneflow/core/profiler/profiler.h" #include "oneflow/core/common/cpp_attribute.h" #include "oneflow/core/common/global.h" +#include "oneflow/core/common/singleton_ptr.h" #include "oneflow/core/common/foreign_lock_helper.h" -#include namespace oneflow { namespace vm { @@ -80,16 +79,14 @@ namespace { bool FusableBetween(InstructionFuseType fuse_type, InstructionMsg* instr_msg, InstructionMsg* prev_instr_msg) { - if (unlikely(instr_msg->instr_type_id().instruction_type().fuse_type() != fuse_type)) { - return false; - } - auto* phy_instr_stream = instr_msg->phy_instr_stream(); - if (unlikely(phy_instr_stream == nullptr)) { return false; } + if (unlikely(instr_msg->instruction_type().fuse_type() != fuse_type)) { return false; } + auto* stream = instr_msg->mut_stream(); + if (unlikely(stream == nullptr)) { return false; } auto* sequential_dep = instr_msg->phy_instr_operand()->stream_sequential_dependence(); if (unlikely(sequential_dep == nullptr)) { return false; } if (unlikely(prev_instr_msg == nullptr)) { return true; } - if (unlikely(phy_instr_stream != prev_instr_msg->phy_instr_stream())) { return false; } + if (unlikely(stream != prev_instr_msg->mut_stream())) { return false; } if (unlikely(sequential_dep != prev_instr_msg->phy_instr_operand()->stream_sequential_dependence())) { return false; @@ -108,9 +105,8 @@ void VirtualMachineEngine::MakeAndAppendFusedInstruction( } auto* begin = fused_instr_msg_list.Begin(); auto phy_instr_operand = std::make_shared(std::move(fused_instr_msg_list)); - const auto* stream_tag = begin->phy_instr_stream()->stream_type().stream_tag(); auto instr_msg = intrusive::make_shared( - this, std::string(stream_tag) + ".Fuse", begin->phy_instr_parallel_desc(), phy_instr_operand); + begin->mut_stream(), SingletonPtr(), phy_instr_operand); pending_instr_msgs->EmplaceBack(std::move(instr_msg)); } @@ -190,18 +186,12 @@ void VirtualMachineEngine::ReleaseFinishedInstructions(const ScheduleCtx& schedu OF_PROFILER_RANGE_POP(); } -int64_t VirtualMachineEngine::this_machine_id() const { - CHECK_EQ(machine_id_range().size(), 1); - return machine_id_range().begin(); -} - void VirtualMachineEngine::MakeInstructions(InstructionMsg* instr_msg, /*out*/ InstructionList* new_instruction_list) { - const auto& instruction_type = instr_msg->instr_type_id().instruction_type(); - bool is_barrier_instruction = instruction_type.IsFrontSequential(); - Stream* stream = CHECK_NOTNULL(instr_msg->phy_instr_stream()); - const auto& pd = instr_msg->phy_instr_parallel_desc(); - intrusive::shared_ptr instr = stream->NewInstruction(instr_msg, pd); + const auto& instruction_type = instr_msg->instruction_type(); + bool is_barrier_instruction = instruction_type.IsBarrier(); + Stream* stream = CHECK_NOTNULL(instr_msg->mut_stream()); + intrusive::shared_ptr instr = stream->NewInstruction(instr_msg); LivelyInstructionListPushBack(instr.Mutable()); if (unlikely(is_barrier_instruction)) { 
mut_barrier_instruction_list()->PushBack(instr.Mutable()); @@ -324,58 +314,6 @@ void VirtualMachineEngine::DispatchInstruction(Instruction* instruction, } } -void VirtualMachineEngine::__Init__(const VmDesc& vm_desc) { - mut_vm_resource_desc()->CopyFrom(vm_desc.vm_resource_desc()); - CHECK_GT(vm_desc.machine_id_range().size(), 0); - *mut_machine_id_range() = vm_desc.machine_id_range(); - INTRUSIVE_UNSAFE_FOR_EACH_PTR(stream_desc, &vm_desc.stream_type2desc()) { - if (stream_desc->num_threads() == 0) { continue; } - auto stream_rt_desc = intrusive::make_shared(stream_desc); - mut_stream_type2stream_rt_desc()->Insert(stream_rt_desc.Mutable()); - BalancedSplitter bs(stream_desc->parallel_num(), stream_desc->num_threads()); - for (int64_t i = 0, rel_global_device_id = 0; i < stream_desc->num_threads(); ++i) { - auto thread_ctx = intrusive::make_shared(stream_rt_desc.Get()); - mut_thread_ctx_list()->PushBack(thread_ctx.Mutable()); - for (int j = bs.At(i).begin(); j < bs.At(i).end(); ++j, ++rel_global_device_id) { - StreamId stream_id; - stream_id.__Init__(&stream_desc->stream_type(), - this_start_global_device_id() + rel_global_device_id); - auto stream = intrusive::make_shared( - thread_ctx.Mutable(), stream_id, vm_resource_desc().max_device_num_per_machine()); - stream_rt_desc->add_stream(stream); - thread_ctx->mut_stream_list()->PushBack(stream.Mutable()); - } - } - } -} - -void VirtualMachineEngine::GetCachedInstrTypeIdAndPhyInstrStream(const std::string& instr_type_name, - int device_id, - InstrTypeId* instr_type_id, - Stream** stream) { - auto* cache = &instr_type_name2rt_instr_type_id_; - auto iter = cache->find(instr_type_name); - if (unlikely(iter == cache->end())) { - const auto& instr_type_id_val = LookupInstrTypeId(instr_type_name); - const auto* stream_type = &instr_type_id_val.stream_type(); - auto* stream_rt_desc = this->mut_stream_type2stream_rt_desc()->FindPtr(stream_type); - iter = cache->emplace(instr_type_name, RtInstrTypeId(instr_type_id_val, stream_rt_desc)).first; - } - instr_type_id->CopyFrom(iter->second.instr_type_id()); - *stream = iter->second.GetStream(device_id); -} - -void VirtualMachineEngine::GetInstrTypeIdAndSoleStream(const std::string& instr_type_name, - InstrTypeId* instr_type_id, - Stream** stream) { - instr_type_id->CopyFrom(LookupInstrTypeId(instr_type_name)); - const auto* stream_type = &instr_type_id->stream_type(); - auto* stream_rt_desc = this->mut_stream_type2stream_rt_desc()->FindPtr(stream_type); - *stream = stream_rt_desc->GetSoleStream(); -} - -int64_t InstructionMaxRunningSeconds() { return 60 * 5; } - // Returns true if old pending_instruction_list is empty Maybe VirtualMachineEngine::Receive(InstructionMsgList* compute_instr_msg_list) { OF_PROFILER_RANGE_GUARD("vm:Receive"); @@ -387,13 +325,6 @@ Maybe VirtualMachineEngine::Receive(InstructionMsgList* compute_instr_msg_ return old_list_empty; } -Maybe VirtualMachineEngine::Receive( - intrusive::shared_ptr&& compute_instr_msg) { - InstructionMsgList instr_msg_list; - instr_msg_list.EmplaceBack(std::move(compute_instr_msg)); - return Receive(&instr_msg_list); -} - bool VirtualMachineEngine::OnSchedulerThread(const StreamType& stream_type) { return stream_type.OnSchedulerThread() || pthread_fork::IsForkedSubProcess(); } @@ -456,7 +387,7 @@ bool VirtualMachineEngine::OnSchedulerThread(const StreamType& stream_type) { // instructions are scarcely received by vm, there is no need for vm to run // VirtualMachineEngine::TryRunBarrierInstruction every time VirtualMachineEngine::Schedule run. 
On // the other hand, `barrier_instruction_hook_.size() == 0` is more lightweight than -// `lively_instruction_list_.Begin()?->instr_msg().instr_type_id().instruction_type().IsFrontSequential()` +// `lively_instruction_list_.Begin()?->instr_msg().instruction_type().IsBarrier()` // void VirtualMachineEngine::TryRunBarrierInstruction(const ScheduleCtx& schedule_ctx) { auto* sequnential_instruction = mut_barrier_instruction_list()->Begin(); @@ -465,10 +396,9 @@ void VirtualMachineEngine::TryRunBarrierInstruction(const ScheduleCtx& schedule_ // All instructions before `sequnential_instruction` are handled now, it's time to handle // `sequnential_instruction`. OF_PROFILER_RANGE_GUARD("RunBarrierInstruction"); - const auto& instr_type_id = sequnential_instruction->instr_msg().instr_type_id(); - const auto& instruction_type = instr_type_id.instruction_type(); - CHECK(instruction_type.IsFrontSequential()); - const StreamType& stream_type = instr_type_id.stream_type(); + const auto& instruction_type = sequnential_instruction->instr_msg().instruction_type(); + CHECK(instruction_type.IsBarrier()); + const StreamType& stream_type = sequnential_instruction->instr_msg().stream().stream_type(); CHECK(OnSchedulerThread(stream_type)); stream_type.Run(sequnential_instruction); mut_barrier_instruction_list()->Erase(sequnential_instruction); diff --git a/oneflow/core/vm/virtual_machine_engine.h b/oneflow/core/vm/virtual_machine_engine.h index 000dc38ab49..4b7df3a182b 100644 --- a/oneflow/core/vm/virtual_machine_engine.h +++ b/oneflow/core/vm/virtual_machine_engine.h @@ -20,13 +20,10 @@ limitations under the License. #include "oneflow/core/common/maybe.h" #include "oneflow/core/vm/instruction.h" #include "oneflow/core/vm/stream.h" -#include "oneflow/core/vm/stream_runtime_desc.h" -#include "oneflow/core/vm/runtime_instr_type_id.h" #include "oneflow/core/vm/thread_ctx.h" #include "oneflow/core/vm/vm_object.h" #include "oneflow/core/vm/vm_resource_desc.h" #include "oneflow/core/common/range.h" -#include "oneflow/core/job/parallel_desc.h" #include "oneflow/core/intrusive/mutexed_list.h" #include "oneflow/core/intrusive/object_pool.h" #include "oneflow/core/vm/probe.h" @@ -45,7 +42,6 @@ class ScheduleCtx { virtual void OnWorkerLoadPending(vm::ThreadCtx* thread_ctx) const = 0; }; -class VmDesc; class VirtualMachineEngine final : public intrusive::Base { public: // types @@ -58,16 +54,8 @@ class VirtualMachineEngine final : public intrusive::Base { intrusive::List; using InstructionMsgMutexedList = intrusive::MutexedList; - using StreamType2StreamRtDesc = - intrusive::SkipList; // Getters - const VmResourceDesc& vm_resource_desc() const { - if (vm_resource_desc_) { return vm_resource_desc_.Get(); } - static const auto default_val = intrusive::make_shared(); - return default_val.Get(); - } - const Range& machine_id_range() const { return machine_id_range_; } std::size_t flying_instruction_cnt() const { return pending_msg_list().thread_unsafe_size() + local_pending_msg_list().size() + (total_inserted_instruction_cnt() - total_erased_instruction_cnt()); @@ -83,46 +71,22 @@ class VirtualMachineEngine final : public intrusive::Base { } const InstructionMsgMutexedList& pending_msg_list() const { return pending_msg_list_; } const InstructionMsgList& local_pending_msg_list() const { return local_pending_msg_list_; } - const StreamType2StreamRtDesc& stream_type2stream_rt_desc() const { - return stream_type2stream_rt_desc_; - } // Setters - VmResourceDesc* mut_vm_resource_desc() { - if (!vm_resource_desc_) { vm_resource_desc_ 
= intrusive::make_shared(); } - return vm_resource_desc_.Mutable(); - } - Range* mut_machine_id_range() { return &machine_id_range_; } ActiveStreamList* mut_active_stream_list() { return &active_stream_list_; } ThreadCtxList* mut_thread_ctx_list() { return &thread_ctx_list_; } LivelyInstructionList* mut_lively_instruction_list() { return &lively_instruction_list_; } BarrierInstructionList* mut_barrier_instruction_list() { return &barrier_instruction_list_; } InstructionMsgMutexedList* mut_pending_msg_list() { return &pending_msg_list_; } InstructionMsgList* mut_local_pending_msg_list() { return &local_pending_msg_list_; } - StreamType2StreamRtDesc* mut_stream_type2stream_rt_desc() { return &stream_type2stream_rt_desc_; } - // methods - void __Init__(const VmDesc& vm_desc); - // Returns true if old pending_instruction_list is empty - Maybe Receive(InstructionMsgList* instr_list); // Returns true if old pending_instruction_list is empty - Maybe Receive(intrusive::shared_ptr&& instruction_msg); + Maybe Receive(InstructionMsgList* compute_instr_msg_list); void Schedule(const ScheduleCtx& schedule_ctx); void Callback(); bool SchedulerThreadUnsafeEmpty() const; bool SchedulerEmpty() const; std::string GetLivelyInstructionListDebugString(int64_t debug_cnt); - int64_t this_machine_id() const; - int64_t this_start_global_device_id() const { - return this_machine_id() * vm_resource_desc().max_device_num_per_machine(); - } - - void GetCachedInstrTypeIdAndPhyInstrStream(const std::string& instr_type_name, int device_id, - InstrTypeId* instr_type_id, Stream** stream); - - void GetInstrTypeIdAndSoleStream(const std::string& instr_type_name, InstrTypeId* instr_type_id, - Stream** stream); - private: using ReadyInstructionList = intrusive::List; @@ -164,11 +128,8 @@ class VirtualMachineEngine final : public intrusive::Base { VirtualMachineEngine() : intrusive_ref_(), - vm_resource_desc_(), - machine_id_range_(), active_stream_list_(), thread_ctx_list_(), - stream_type2stream_rt_desc_(), pending_msg_mutex_(), pending_msg_list_(&pending_msg_mutex_), local_pending_msg_list_(), @@ -181,14 +142,10 @@ class VirtualMachineEngine final : public intrusive::Base { local_probe_list_(), barrier_instruction_list_() {} intrusive::Ref intrusive_ref_; - // fields - intrusive::shared_ptr vm_resource_desc_; - Range machine_id_range_; // lists or maps // Do not change the order of the following fields ActiveStreamList active_stream_list_; ThreadCtxList thread_ctx_list_; - StreamType2StreamRtDesc stream_type2stream_rt_desc_; std::mutex pending_msg_mutex_; InstructionMsgMutexedList pending_msg_list_; // local_pending_msg_list_ should be consider as the cache of pending_msg_list_. 
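[Editorial note, not part of the patch: pending_msg_list_ above is a mutex-guarded inbox filled by many producer threads, while local_pending_msg_list_ acts as the scheduler's private cache into which that inbox is drained so the scheduler can work without holding the lock. A rough stand-alone sketch of that pattern follows; MutexedInbox, Push and MoveAll are invented illustrative names, not the intrusive-list API used in this file.]

#include <deque>
#include <mutex>
#include <utility>

template<typename T>
class MutexedInbox {
 public:
  // Many producer threads append under the lock.
  void Push(T item) {
    std::lock_guard<std::mutex> lock(mutex_);
    inbox_.push_back(std::move(item));
  }
  // The scheduler thread takes the whole backlog in one lock acquisition and
  // then processes the returned container without holding the mutex.
  std::deque<T> MoveAll() {
    std::lock_guard<std::mutex> lock(mutex_);
    return std::exchange(inbox_, {});
  }

 private:
  std::mutex mutex_;
  std::deque<T> inbox_;
};

[End of editorial note; the patch continues below.]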
@@ -204,7 +161,6 @@ class VirtualMachineEngine final : public intrusive::Base { intrusive::List local_probe_list_; BarrierInstructionList barrier_instruction_list_; - std::map instr_type_name2rt_instr_type_id_; DependenceAccess::object_pool_type access_pool_; InstructionEdge::object_pool_type instruction_edge_pool_; }; diff --git a/oneflow/core/vm/virtual_machine_scope.cpp b/oneflow/core/vm/virtual_machine_scope.cpp index d326c4cee5c..0f6233a194a 100644 --- a/oneflow/core/vm/virtual_machine_scope.cpp +++ b/oneflow/core/vm/virtual_machine_scope.cpp @@ -22,7 +22,7 @@ namespace oneflow { namespace vm { VirtualMachineScope::VirtualMachineScope(const Resource& resource) { - Global::New(resource, GlobalProcessCtx::Rank()); + Global::New(); } VirtualMachineScope::~VirtualMachineScope() { Global::Delete(); } diff --git a/oneflow/core/vm/vm_desc.cpp b/oneflow/core/vm/vm_desc.cpp deleted file mode 100644 index f106d935b4a..00000000000 --- a/oneflow/core/vm/vm_desc.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/vm/vm_desc.h" -#include "oneflow/core/vm/stream_desc.h" -#include "oneflow/core/vm/stream_type.h" -#include "oneflow/core/vm/instruction_type.h" -#include "oneflow/core/common/util.h" - -namespace oneflow { -namespace vm { - -namespace { - -void SetMachineIdRange(Range* range, int64_t machine_num, int64_t this_machine_id) { - *range = Range(this_machine_id, this_machine_id + 1); -} - -intrusive::shared_ptr MakeVmDesc( - const Resource& resource, int64_t this_machine_id, - const std::function&)>& ForEachInstrTypeId) { - std::set stream_types; - ForEachInstrTypeId( - [&](const InstrTypeId& instr_type_id) { stream_types.insert(&instr_type_id.stream_type()); }); - auto vm_desc = - intrusive::make_shared(intrusive::make_shared(resource).Get()); - SetMachineIdRange(vm_desc->mut_machine_id_range(), resource.machine_num(), this_machine_id); - int cnt = 0; - for (const auto* stream_type : stream_types) { - auto stream_desc = stream_type->MakeStreamDesc(resource, this_machine_id); - if (stream_desc) { - ++cnt; - CHECK(vm_desc->mut_stream_type2desc()->Insert(stream_desc.Mutable()).second); - } - } - CHECK_EQ(vm_desc->stream_type2desc().size(), cnt); - return vm_desc; -} - -} // namespace - -intrusive::shared_ptr MakeVmDesc(const Resource& resource, int64_t this_machine_id) { - return MakeVmDesc(resource, this_machine_id, &ForEachInstrTypeId); -} - -intrusive::shared_ptr MakeVmDesc(const Resource& resource, int64_t this_machine_id, - const std::set& instr_type_names) { - const auto& ForEachInstrTypeId = [&](const std::function& Handler) { - for (const auto& instr_type_name : instr_type_names) { - Handler(LookupInstrTypeId(instr_type_name)); - Handler(LookupInstrTypeId(std::string("Infer-") + instr_type_name)); - } - }; - return MakeVmDesc(resource, this_machine_id, ForEachInstrTypeId); -} - -} // namespace vm -} // namespace oneflow diff --git a/oneflow/core/vm/vm_desc.h b/oneflow/core/vm/vm_desc.h 
deleted file mode 100644 index b28d29db00c..00000000000 --- a/oneflow/core/vm/vm_desc.h +++ /dev/null @@ -1,74 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_VM_MEM_ZONE_TYPE_DESC__H_ -#define ONEFLOW_CORE_VM_MEM_ZONE_TYPE_DESC__H_ - -#include "oneflow/core/vm/stream_desc.h" -#include "oneflow/core/vm/virtual_machine_engine.h" -#include "oneflow/core/vm/vm_resource_desc.h" -#include "oneflow/core/common/range.h" - -namespace oneflow { -namespace vm { - -class VmDesc final : public intrusive::Base { - public: - // types - using StreamType2StreamDesc = intrusive::SkipList; - // Getters - const VmResourceDesc& vm_resource_desc() const { - if (vm_resource_desc_) { return vm_resource_desc_.Get(); } - static const auto default_val = intrusive::make_shared(); - return default_val.Get(); - } - const Range& machine_id_range() const { return machine_id_range_; } - const StreamType2StreamDesc& stream_type2desc() const { return stream_type2desc_; } - // Setters - VmResourceDesc* mut_vm_resource_desc() { - if (!vm_resource_desc_) { vm_resource_desc_ = intrusive::make_shared(); } - return vm_resource_desc_.Mutable(); - } - Range* mut_machine_id_range() { return &machine_id_range_; } - StreamType2StreamDesc* mut_stream_type2desc() { return &stream_type2desc_; } - - // methods - void __Init__(const VmResourceDesc& vm_resource_desc) { __Init__(vm_resource_desc, Range(0, 1)); } - void __Init__(const VmResourceDesc& vm_resource_desc, const Range& machine_id_range) { - mut_vm_resource_desc()->CopyFrom(vm_resource_desc); - *mut_machine_id_range() = machine_id_range; - } - - private: - friend class intrusive::Ref; - intrusive::Ref* mut_intrusive_ref() { return &intrusive_ref_; } - - VmDesc() : intrusive_ref_(), vm_resource_desc_(), machine_id_range_(), stream_type2desc_() {} - intrusive::Ref intrusive_ref_; - // fields - intrusive::shared_ptr vm_resource_desc_; - Range machine_id_range_; - // maps - StreamType2StreamDesc stream_type2desc_; -}; - -intrusive::shared_ptr MakeVmDesc(const Resource& resource, int64_t this_machine_id); -intrusive::shared_ptr MakeVmDesc(const Resource& resource, int64_t this_machine_id, - const std::set& instr_type_names); - -} // namespace vm -} // namespace oneflow - -#endif // ONEFLOW_CORE_VM_MEM_ZONE_TYPE_DESC__H_ diff --git a/oneflow/core/vm/vm_object.h b/oneflow/core/vm/vm_object.h index cfc6b69a784..fae0c74bf38 100644 --- a/oneflow/core/vm/vm_object.h +++ b/oneflow/core/vm/vm_object.h @@ -20,9 +20,6 @@ limitations under the License. 
#include "oneflow/core/intrusive/flat_msg.h" #include "oneflow/core/intrusive/intrusive.h" #include "oneflow/core/intrusive/object_pool.h" -#include "oneflow/core/vm/id_util.h" -#include "oneflow/core/vm/stream_desc.h" -#include "oneflow/core/job/parallel_desc.h" namespace oneflow { diff --git a/oneflow/core/vm/vm_util.cpp b/oneflow/core/vm/vm_util.cpp index 3a39a93256c..d5ce990e0e6 100644 --- a/oneflow/core/vm/vm_util.cpp +++ b/oneflow/core/vm/vm_util.cpp @@ -20,7 +20,6 @@ limitations under the License. #include "oneflow/core/job/cluster_instruction.h" #include "oneflow/core/vm/vm_util.h" #include "oneflow/core/vm/virtual_machine.h" -#include "oneflow/core/vm/instruction.pb.h" #include "oneflow/core/vm/stream_type.h" #include "oneflow/core/vm/instruction_type.h" #include "oneflow/core/framework/instructions_builder.h" @@ -40,8 +39,8 @@ Maybe Run(vm::InstructionMsgList* instr_msg_list) { Maybe ClusterSync() { auto bc = std::make_shared(1); JUST(PhysicalRun([bc](InstructionsBuilder* builder) -> Maybe { - JUST(builder->ComputeGlobalFrontSeqBarrier()); - JUST(builder->ComputeRankFrontSeqCallback([bc]() { bc->Decrease(); })); + JUST(builder->GlobalSync()); + JUST(builder->Barrier([bc]() { bc->Decrease(); })); return Maybe::Ok(); })); JUST(bc->WaitUntilCntEqualZero(VirtualMachine::GetPredicatorNoMoreInstructionsFinished())); @@ -51,7 +50,7 @@ Maybe ClusterSync() { Maybe CurrentRankSync() { auto bc = std::make_shared(1); JUST(PhysicalRun([bc](InstructionsBuilder* builder) -> Maybe { - JUST(builder->ComputeRankFrontSeqCallback([bc]() { bc->Decrease(); })); + JUST(builder->Barrier([bc]() { bc->Decrease(); })); return Maybe::Ok(); })); JUST(bc->WaitUntilCntEqualZero(VirtualMachine::GetPredicatorNoMoreInstructionsFinished())); diff --git a/oneflow/extension/python/numpy.cpp b/oneflow/extension/python/numpy.cpp index 615636769ad..6cc9d61c0ee 100644 --- a/oneflow/extension/python/numpy.cpp +++ b/oneflow/extension/python/numpy.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "oneflow/core/common/stride.h" #include "oneflow/core/common/throw.h" #include "oneflow/core/common/registry_error.h" #include "oneflow/extension/python/numpy_internal.h" @@ -82,11 +83,11 @@ std::vector OFShapeToNumpyShape(const DimVector& fixed_vec) { } // NumPy strides use bytes. OneFlow strides use element counts. -std::vector OFStrideToNumpyStride(const DimVector& fixed_vec, const DataType data_type) { - size_t ndim = fixed_vec.size(); +std::vector OFStrideToNumpyStride(const Stride& stride, const DataType data_type) { + size_t ndim = stride.size(); auto result = std::vector(ndim); int byte_per_elem = GetSizeOfDataType(data_type); - for (int i = 0; i < ndim; i++) { result[i] = fixed_vec.at(i) * byte_per_elem; } + for (int i = 0; i < ndim; i++) { result[i] = stride.at(i) * byte_per_elem; } return result; } diff --git a/oneflow/extension/python/numpy_internal.h b/oneflow/extension/python/numpy_internal.h index 84590a38990..c55290c26df 100644 --- a/oneflow/extension/python/numpy_internal.h +++ b/oneflow/extension/python/numpy_internal.h @@ -34,6 +34,8 @@ limitations under the License. 
namespace oneflow { +class Stride; + namespace numpy { class NumPyArrayInternal final { @@ -60,7 +62,7 @@ Maybe GetOFDataTypeFromNpArray(PyArrayObject* array); std::vector OFShapeToNumpyShape(const DimVector& fixed_vec); -std::vector OFStrideToNumpyStride(const DimVector& fixed_vec, const DataType data_type); +std::vector OFStrideToNumpyStride(const Stride& stride, const DataType data_type); bool PyArrayCheckLongScalar(PyObject* obj); diff --git a/oneflow/extension/python/py_compute.cpp b/oneflow/extension/python/py_compute.cpp index eeb62754234..3910aca3657 100644 --- a/oneflow/extension/python/py_compute.cpp +++ b/oneflow/extension/python/py_compute.cpp @@ -58,9 +58,9 @@ void TensorToNumpy(const user_op::Tensor* tensor, PyObject** arg_ptr) { int type_num = CHECK_JUST(numpy::OFDataTypeToNumpyType(tensor->data_type())); VLOG(3) << "Tensor data type " << DataType_Name(tensor->data_type()) << " Numpy type " << type_num; - int dim_size = tensor->shape().NumAxes(); + int dim_size = tensor->shape_view().NumAxes(); npy_intp dims[dim_size]; - FOR_RANGE(size_t, i, 0, dim_size) { dims[i] = tensor->shape().At(i); } + FOR_RANGE(size_t, i, 0, dim_size) { dims[i] = tensor->shape_view().At(i); } void* data = TensorToMem(tensor); auto* np_array = @@ -105,9 +105,9 @@ void NumpyToTensor(PyObject* arg, user_op::Tensor* tensor) { int64_t array_elem_cnt = 1; FOR_RANGE(int, i, 0, PyArray_NDIM(array)) { array_elem_cnt *= PyArray_SHAPE(array)[i]; } - CHECK_EQ(array_elem_cnt, tensor->shape().elem_cnt()) + CHECK_EQ(array_elem_cnt, tensor->shape_view().elem_cnt()) << "Numpy array element count " << array_elem_cnt - << " is not equal to OneFlow tensor element count " << tensor->shape().elem_cnt(); + << " is not equal to OneFlow tensor element count " << tensor->shape_view().elem_cnt(); void* array_data_ptr = PyArray_DATA(array); MemToTensor(array_data_ptr, array_elem_cnt, tensor); diff --git a/oneflow/ir/include/OneFlow/OneFlowDialect.td b/oneflow/ir/include/OneFlow/OneFlowDialect.td index 10bfca306c0..94e4d31ac5b 100644 --- a/oneflow/ir/include/OneFlow/OneFlowDialect.td +++ b/oneflow/ir/include/OneFlow/OneFlowDialect.td @@ -14,6 +14,7 @@ def OneFlow_Dialect : Dialect { "func::FuncDialect" ]; let hasConstantMaterializer = 1; + let useDefaultTypePrinterParser = 1; } #endif // ONEFLOW_DIALECT diff --git a/oneflow/ir/include/OneFlow/OneFlowOps.td b/oneflow/ir/include/OneFlow/OneFlowOps.td index 405ff4499e0..c22a87143b3 100644 --- a/oneflow/ir/include/OneFlow/OneFlowOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowOps.td @@ -288,12 +288,6 @@ def LowerOneFlowToTosaPass : Pass<"lower-oneflow-to-tosa", "ModuleOp"> { ]; } -def MapSCFToGPUPass : Pass<"gpu-greedy-parallel-loop-mapping", "ModuleOp"> { - let summary = "Greedily maps all parallel loops to gpu hardware ids"; - let constructor = "mlir::oneflow::createMapSCFToGPUPass()"; - let dependentDialects = ["scf::SCFDialect"]; -} - def BufferHostRegisterPass : Pass<"buffer-host-register", "func::FuncOp"> { let summary = ""; let constructor = "mlir::oneflow::createBufferHostRegisterPass()"; diff --git a/oneflow/ir/include/OneFlow/OneFlowPatterns.td b/oneflow/ir/include/OneFlow/OneFlowPatterns.td index 5ea5d776f36..097d76c5fbb 100644 --- a/oneflow/ir/include/OneFlow/OneFlowPatterns.td +++ b/oneflow/ir/include/OneFlow/OneFlowPatterns.td @@ -5,7 +5,7 @@ include "mlir/IR/PatternBase.td" include "OneFlow/OneFlowOps.td" include "mlir/Dialect/MemRef/IR/MemRefOps.td" -include "mlir/Dialect/GPU/GPUOps.td" +include "mlir/Dialect/GPU/IR/GPUOps.td" def IsNotNestedInJit: 
ConstraintgetParentOfType<::mlir::oneflow::Job>())">, "">; def IsScalarTensor: Constraint, "">; diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 60d13342c1e..1305bfeb6c9 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -108,7 +108,7 @@ */ // Group: ASSIGN -// assign, assign_if, assign_if_not, logical_slice_assign +// assign, assign_if, assign_if_not // Total: 4 #ifdef GET_ONEFLOW_ASSIGN_OP_DEFINITIONS @@ -151,25 +151,6 @@ def OneFlow_AssignIfNotOp : OneFlow_BaseOp<"assign_if_not", [NoGrad, DeclareOpIn let has_input_arg_modify_fn = 1; } -def OneFlow_LogicalSliceAssignOp : OneFlow_BaseOp<"logical_slice_assign", [DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$ref, - OneFlow_Tensor:$value - ); - let output = (outs - OneFlow_Tensor:$y - ); - let attrs = (ins - SI64ArrayAttr:$start, - SI64ArrayAttr:$stop, - SI64ArrayAttr:$step - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - #endif // GET_ONEFLOW_ASSIGN_OP_DEFINITIONS // Group: BASE @@ -2859,7 +2840,7 @@ def OneFlow_ImageResizeToFixedOp : OneFlow_BaseOp<"image_resize_to_fixed", [NoSi #endif // GET_ONEFLOW_IMAGE_OP_DEFINITIONS // Group: INDICES -// arg_sort, argmax, argwhere, batch_gather, dim_gather, dim_scatter_add, dim_scatter_add_like, dim_scatter_add_scalar, dim_scatter_mul, dim_scatter_mul_scalar, dim_scatter_update, dim_scatter_update_scalar, embedding_renorm, embedding, embedding_grad, gather, gather_nd, generate_random_batch_permutation_indices, image_target_resize, logical_slice, scatter_nd, scatter_nd_like, slice, slice_grad, tensor_scatter_nd_add, tensor_scatter_nd_update, unsorted_batch_segment_sum, unsorted_segment_sum, unsorted_segment_sum_like, where, where_scalar_x, where_scalar_xy, where_scalar_y, median, searchsorted, searchsorted_scalar +// arg_sort, argmax, argwhere, batch_gather, dim_gather, dim_scatter_add, dim_scatter_add_like, dim_scatter_add_scalar, dim_scatter_mul, dim_scatter_mul_scalar, dim_scatter_update, dim_scatter_update_scalar, embedding_renorm, embedding, embedding_grad, gather, gather_nd, generate_random_batch_permutation_indices, image_target_resize, scatter_nd, scatter_nd_like, slice, slice_update, slice_grad, tensor_scatter_nd_add, tensor_scatter_nd_update, unsorted_batch_segment_sum, unsorted_segment_sum, unsorted_segment_sum_like, where, where_scalar_x, where_scalar_xy, where_scalar_y, median, searchsorted, searchsorted_scalar // Total: 36 #ifdef GET_ONEFLOW_INDICES_OP_DEFINITIONS @@ -3203,7 +3184,7 @@ def OneFlow_ImageTargetResizeOp : OneFlow_BaseOp<"image_target_resize", [NoSideE let has_data_type_infer_fn = 1; } -def OneFlow_LogicalSliceOp : OneFlow_BaseOp<"logical_slice", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_SliceOp : OneFlow_BaseOp<"slice", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$x ); @@ -3221,75 +3202,76 @@ def OneFlow_LogicalSliceOp : OneFlow_BaseOp<"logical_slice", [NoSideEffect, Decl let has_data_type_infer_fn = 1; } -def OneFlow_ScatterNdOp : OneFlow_BaseOp<"scatter_nd", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_SliceUpdateOp : OneFlow_BaseOp<"slice_update", [DeclareOpInterfaceMethods]> { let input = (ins - OneFlow_Tensor:$indices, - OneFlow_Tensor:$updates + OneFlow_Tensor:$ref, + OneFlow_Tensor:$value ); let output = (outs - OneFlow_Tensor:$out + OneFlow_Tensor:$y ); let 
attrs = (ins - ShapeAttr:$shape + SI64ArrayAttr:$start, + SI64ArrayAttr:$stop, + SI64ArrayAttr:$step ); let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; let has_data_type_infer_fn = 1; - let has_input_arg_modify_fn = 1; } -def OneFlow_ScatterNdLikeOp : OneFlow_BaseOp<"scatter_nd_like", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_SliceGradOp : OneFlow_BaseOp<"slice_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins - OneFlow_Tensor:$like, - OneFlow_Tensor:$indices, - OneFlow_Tensor:$updates + OneFlow_Tensor:$dy ); let output = (outs - OneFlow_Tensor:$out + OneFlow_Tensor:$dx + ); + let attrs = (ins + ShapeAttr:$like_shape, + SI64ArrayAttr:$start, + SI64ArrayAttr:$stop, + SI64ArrayAttr:$step ); let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; let has_data_type_infer_fn = 1; + let has_input_arg_modify_fn = 1; } -def OneFlow_SliceOp : OneFlow_BaseOp<"slice", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_ScatterNdOp : OneFlow_BaseOp<"scatter_nd", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins - OneFlow_Tensor:$x + OneFlow_Tensor:$indices, + OneFlow_Tensor:$updates ); let output = (outs - OneFlow_Tensor:$y + OneFlow_Tensor:$out ); let attrs = (ins - SI64ArrayAttr:$start, - SI64ArrayAttr:$stop, - SI64ArrayAttr:$step + ShapeAttr:$shape ); let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; let has_data_type_infer_fn = 1; + let has_input_arg_modify_fn = 1; } -def OneFlow_SliceGradOp : OneFlow_BaseOp<"slice_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_ScatterNdLikeOp : OneFlow_BaseOp<"scatter_nd_like", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins - OneFlow_Tensor:$dy + OneFlow_Tensor:$like, + OneFlow_Tensor:$indices, + OneFlow_Tensor:$updates ); let output = (outs - OneFlow_Tensor:$dx - ); - let attrs = (ins - ShapeAttr:$like_shape, - SI64ArrayAttr:$start, - SI64ArrayAttr:$stop, - SI64ArrayAttr:$step + OneFlow_Tensor:$out ); let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; let has_data_type_infer_fn = 1; - let has_input_arg_modify_fn = 1; } def OneFlow_TensorScatterNdAddOp : OneFlow_BaseOp<"tensor_scatter_nd_add", [NoSideEffect, DeclareOpInterfaceMethods]> { @@ -4969,44 +4951,41 @@ def OneFlow_LocalMultiReduceMinAbsOp : OneFlow_BaseOp<"local_multi_reduce_min_ab let has_get_sbp_fn = 1; } -def OneFlow_NllOp : OneFlow_BaseOp<"nll", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_NLLOp : OneFlow_BaseOp<"nll", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$input, OneFlow_Tensor:$target, Optional:$weight ); let output = (outs - OneFlow_Tensor:$out, - OneFlow_Tensor:$total_weight + OneFlow_Tensor:$output, + OneFlow_Tensor:$out_weight ); let attrs = (ins DefaultValuedAttr:$ignore_index ); + let has_data_type_infer_fn = 1; let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; let has_input_arg_modify_fn = 1; } -def OneFlow_NllGradOp : OneFlow_BaseOp<"nll_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { +def OneFlow_NLLGradOp : OneFlow_BaseOp<"nll_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins + OneFlow_Tensor:$out_grad, OneFlow_Tensor:$input, OneFlow_Tensor:$target, - OneFlow_Tensor:$total_weight, - Optional:$weight, - 
OneFlow_Tensor:$dy + Optional:$weight ); let output = (outs - OneFlow_Tensor:$dx + OneFlow_Tensor:$in_grad ); let attrs = (ins DefaultValuedAttr:$ignore_index ); + let has_data_type_infer_fn = 1; let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; } def OneFlow_PowXGradOp : OneFlow_BaseOp<"pow_x_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { @@ -5746,7 +5725,7 @@ def OneFlow_NormalizationGradOp : OneFlow_BaseOp<"normalization_grad", [NoSideEf #endif // GET_ONEFLOW_NORMALIZATION_OP_DEFINITIONS // Group: OPTIMIZER -// adagrad_update, adam_bias_correction_factor, adam_update, indexed_slices_adam_update, indexed_slices_momentum_update, indexed_slices_sgd_update, lamb_update, lars_update, momentum_update, rmsprop_update, sgd_update, slice_update, ftrl_update +// adagrad_update, adam_bias_correction_factor, adam_update, indexed_slices_adam_update, indexed_slices_momentum_update, indexed_slices_sgd_update, lamb_update, lars_update, momentum_update, rmsprop_update, sgd_update, ftrl_update // Total: 13 #ifdef GET_ONEFLOW_OPTIMIZER_OP_DEFINITIONS @@ -6046,25 +6025,6 @@ def OneFlow_SgdUpdateOp : OneFlow_BaseOp<"sgd_update", [NoGrad, AttrSizedOperand let has_input_arg_modify_fn = 1; } -def OneFlow_SliceUpdateOp : OneFlow_BaseOp<"slice_update", [DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$x, - OneFlow_Tensor:$update - ); - let output = (outs - OneFlow_Tensor:$y - ); - let attrs = (ins - SI64ArrayAttr:$start, - SI64ArrayAttr:$stop, - SI64ArrayAttr:$step - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - def OneFlow_FtrlUpdateOp : OneFlow_BaseOp<"ftrl_update", [NoGrad, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$model, diff --git a/oneflow/ir/include/OneFlow/Passes.h b/oneflow/ir/include/OneFlow/Passes.h index 59c05c42d34..7c46d8f3e59 100644 --- a/oneflow/ir/include/OneFlow/Passes.h +++ b/oneflow/ir/include/OneFlow/Passes.h @@ -19,13 +19,12 @@ limitations under the License. 
#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/SCF/SCF.h" -#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Pass/Pass.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "OneFlow/Conversion/OneFlowToTosa.h" -#include "OneFlow/Conversion/SCFToGPU.h" #include "OneFlow/Transform/BufferHostRegister.h" #include "OneFlow/Transform/ConvertInferenceOp.h" #include "OneFlow/Transform/OutlineAndFuse.h" diff --git a/oneflow/ir/install-llvm.cmake b/oneflow/ir/install-llvm.cmake index e01bba1b36d..d25b1911634 100644 --- a/oneflow/ir/install-llvm.cmake +++ b/oneflow/ir/install-llvm.cmake @@ -10,6 +10,7 @@ if(NOT llvm_monorepo_POPULATED) execute_process( COMMAND "${CMAKE_COMMAND}" ${llvm_monorepo_SOURCE_DIR}/llvm + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} # this is required in newer version of LLVM -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_CUDA_COMPILER_LAUNCHER=${CMAKE_CUDA_COMPILER_LAUNCHER} diff --git a/oneflow/ir/lib/OneFlow/CMakeLists.txt b/oneflow/ir/lib/OneFlow/CMakeLists.txt index cdc4ccbb55b..b8d0ce21d1f 100644 --- a/oneflow/ir/lib/OneFlow/CMakeLists.txt +++ b/oneflow/ir/lib/OneFlow/CMakeLists.txt @@ -1,7 +1,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) message(STATUS "MLIR_DIALECT_LIBS: ${dialect_libs}") if(WITH_MLIR_CUDA_CODEGEN) - set(MLIR_GPU_LIBS MLIRSCFToGPU MLIRGPUToNVVMTransforms MLIRNVVMToLLVMIRTranslation) + set(MLIR_GPU_LIBS MLIRGPUToNVVMTransforms MLIRNVVMToLLVMIRTranslation) endif(WITH_MLIR_CUDA_CODEGEN) set(ONEFLOW_OP_GROUPS @@ -24,7 +24,6 @@ oneflow_add_mlir_dialect_library( OneFlowSupport.cpp OneFlowOpFolders.cpp Conversion/OneFlowToTosa.cpp - Conversion/SCFToGPU.cpp Conversion/PTXToCubin.cpp Transform/BufferHostRegister.cpp Transform/OutlineAndFuse.cpp @@ -43,6 +42,7 @@ oneflow_add_mlir_dialect_library( MLIRTosaToLinalg MLIRMemRefToLLVM MLIRLinalgToLLVM + MLIRSCFToGPU MLIRReconcileUnrealizedCasts ${MLIR_GPU_LIBS} MLIRIR diff --git a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp index ec92bb352ec..912ac6c3e0b 100644 --- a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp +++ b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp @@ -144,7 +144,7 @@ struct InputOpLowering final : public OpConversionPattern { // TODO: more choices to passing data between tosa and oneflow const auto newValues = op.input(); const auto is_block_arg = newValues.dyn_cast() != nullptr; - if (!is_block_arg) op->emitError("input is not block arg"); + if (!is_block_arg) { return op->emitError("input is not block arg"); } rewriter.replaceOp(op, newValues); return success(); } @@ -168,10 +168,10 @@ struct VariableOpLowering final : public OpConversionPattern { LogicalResult matchAndRewrite(VariableOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const override { const auto mgr = ::oneflow::Global<::oneflow::VariableTensorMgr>::Get(); - if (!mgr) op->emitError("global variable tensor manager miss"); + if (!mgr) { return op->emitError("global variable tensor manager miss"); } const auto tensor = mgr->Get(op.op_name().str()); - if (!tensor) op->emitError("tensor is null"); + if (!tensor) { return op->emitError("tensor is null"); } const auto value = support::TensorToDenseElementsAttr(tensor, rewriter.getContext()); const auto output = 
op.output().getType(); @@ -204,7 +204,7 @@ struct VariableOpToConstLowering final : public OpConversionPattern rewriter.replaceOpWithNewOp(op, output, value); } else { - op->emitError( + return op->emitError( "OneFlow variable op lower to TOSA const op only support integer and float value now"); } @@ -327,7 +327,7 @@ struct MaxPool2DOpLowering final : public OpConversionPattern { return RankedTensorType::get(ranked_type, shape_type.getElementType()); }; // TODO: support return indice - if (op.return_indices()) op->emitError("not support return indices now"); + if (op.return_indices()) { return op->emitError("not support return indices now"); } auto stride_pairs = get_pair_int64_from_array(op.stride()); auto kernel_pairs = get_pair_int64_from_array(op.kernel_size()); auto pad_pairs = get_pair_int64_from_array(op.padding()); diff --git a/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp b/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp index 35ea2bd8b0e..8c22c3055de 100644 --- a/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp +++ b/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp @@ -17,7 +17,7 @@ limitations under the License. This file is ported from mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp */ -#include "mlir/Dialect/GPU/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" #ifdef WITH_MLIR_CUDA_CODEGEN #include "mlir/Pass/Pass.h" diff --git a/oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp b/oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp deleted file mode 100644 index 18cb2b4bd74..00000000000 --- a/oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "OneFlow/OneFlowOps.h" -#include -#include -#include "OneFlow/OneFlowDialect.h" -#include "OneFlow/Passes.h" -#include "llvm/ADT/STLExtras.h" -#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" -#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" -#include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Linalg/Passes.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/Passes.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Func/Transforms/Passes.h" -#include "mlir/Dialect/Tensor/Transforms/Passes.h" -#include "mlir/Dialect/Tosa/IR/TosaOps.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/OpImplementation.h" - -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/Passes.h" - -#include "mlir/Dialect/GPU/ParallelLoopMapper.h" -#include "mlir/Pass/Pass.h" - -using namespace mlir; - -namespace { -/// Simple pass for testing the mapping of parallel loops to hardware ids using -/// a greedy mapping strategy. 
-class GpuGreedyParallelLoopMappingPass - : public MapSCFToGPUPassBase { - void runOnOperation() override { - Operation* op = getOperation(); - for (Region& region : op->getRegions()) greedilyMapParallelSCFToGPU(region); - } -}; -} // namespace - -namespace mlir { - -namespace oneflow { - -std::unique_ptr createMapSCFToGPUPass() { - return std::make_unique(); -} - -} // namespace oneflow - -} // namespace mlir diff --git a/oneflow/ir/lib/OneFlow/OneFlowOpFolders.cpp b/oneflow/ir/lib/OneFlow/OneFlowOpFolders.cpp index f5bedf762c1..c3d491cf597 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowOpFolders.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowOpFolders.cpp @@ -51,10 +51,11 @@ OpFoldResult UnaryFold(MLIRContext* ctx, ArrayRef operands, const auto attr_dict = operands.front().cast(); auto attrs = NamedAttrList(attr_dict); const auto tensor = support::DenseElementsAttrToTensor( - attr_dict.get("value"), attr_dict.get("device_tag"), attr_dict.get("device_name")); + attr_dict.get("value"), attr_dict.get(OpTrait::IsOpConfCompatible::getDeviceTagAttr()), + attr_dict.get(OpTrait::IsOpConfCompatible::getDeviceNameAttr())); const auto result = f(tensor).GetPtrOrThrow(); attrs.set("value", support::TensorToDenseElementsAttr(result, ctx)); - attrs.set("op_name", GenNewVariableOpName(ctx)); + attrs.set(OpTrait::IsOpConfCompatible::getOpNameAttr(), GenNewVariableOpName(ctx)); return attrs.getDictionary(ctx); } @@ -67,17 +68,19 @@ OpFoldResult BinaryFold(MLIRContext* ctx, ArrayRef operands, auto rhs_attr_dict = operands.back().cast(); auto attrs = NamedAttrList(lhs_attr_dict); - const auto lhs_tensor = support::DenseElementsAttrToTensor(lhs_attr_dict.get("value"), - lhs_attr_dict.get("device_tag"), - lhs_attr_dict.get("device_name")); - const auto rhs_tensor = support::DenseElementsAttrToTensor(rhs_attr_dict.get("value"), - rhs_attr_dict.get("device_tag"), - rhs_attr_dict.get("device_name")); + const auto lhs_tensor = support::DenseElementsAttrToTensor( + lhs_attr_dict.get("value"), + lhs_attr_dict.get(OpTrait::IsOpConfCompatible::getDeviceTagAttr()), + lhs_attr_dict.get(OpTrait::IsOpConfCompatible::getDeviceNameAttr())); + const auto rhs_tensor = support::DenseElementsAttrToTensor( + rhs_attr_dict.get("value"), + rhs_attr_dict.get(OpTrait::IsOpConfCompatible::getDeviceTagAttr()), + rhs_attr_dict.get(OpTrait::IsOpConfCompatible::getDeviceNameAttr())); const auto result = f(lhs_tensor, rhs_tensor).GetPtrOrThrow(); attrs.set("value", support::TensorToDenseElementsAttr(result, ctx)); - attrs.set("op_name", GenNewVariableOpName(ctx)); + attrs.set(OpTrait::IsOpConfCompatible::getOpNameAttr(), GenNewVariableOpName(ctx)); return attrs.getDictionary(ctx); } diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index f76d9370109..b0f8c71bf57 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -62,7 +62,7 @@ limitations under the License. 
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" -#include "mlir/Dialect/GPU/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" #endif // WITH_MLIR_CUDA_CODEGEN @@ -349,9 +349,9 @@ ::llvm::SmallVector<::mlir::Value, 4> CreateConv2dAndErasePad(::mlir::PatternRew NamedAttrList GetUserOpCommonAttrs(MLIRContext* ctx, const std::string& op_name) { NamedAttrList attrs; - attrs.set("op_name", StringAttr::get(ctx, op_name)); - attrs.set("device_tag", StringAttr::get(ctx, "cpu")); - attrs.set("device_name", + attrs.set(OpTrait::IsOpConfCompatible::getOpNameAttr(), StringAttr::get(ctx, op_name)); + attrs.set(OpTrait::IsOpConfCompatible::getDeviceTagAttr(), StringAttr::get(ctx, "cpu")); + attrs.set(OpTrait::IsOpConfCompatible::getDeviceNameAttr(), ArrayAttr::get(ctx, llvm::to_vector<8>(llvm::map_range(ArrayRef({"@0:0"}), [&](StringRef v) -> Attribute { return StringAttr::get(ctx, v); @@ -569,7 +569,8 @@ llvm::SmallVector getInputOperandTransposeOp(NCHWCompatible op, PatternRewriter& rewriter) { std::string transpose_name = OpTrait::IsOpConfCompatible::getOpName(op).str() + "_transpose_input_" + std::to_string(num_transposed_operand); - transpose_attributes.set(llvm::StringRef("op_name"), rewriter.getStringAttr(transpose_name)); + transpose_attributes.set(llvm::StringRef(OpTrait::IsOpConfCompatible::getOpNameAttr()), + rewriter.getStringAttr(transpose_name)); SmallVector input_operands; input_operands.push_back(val); auto res = rewriter @@ -583,7 +584,8 @@ TransposeOp getResultTransposeOp(NCHWCompatible op, Value val, NamedAttrList tra int num_transposed_result, PatternRewriter& rewriter) { std::string transpose_name = OpTrait::IsOpConfCompatible::getOpName(op).str() + "_transpose_output_" + std::to_string(num_transposed_result); - transpose_attributes.set(llvm::StringRef("op_name"), rewriter.getStringAttr(transpose_name)); + transpose_attributes.set(llvm::StringRef(OpTrait::IsOpConfCompatible::getOpNameAttr()), + rewriter.getStringAttr(transpose_name)); SmallVector operands; operands.push_back(val); TransposeOp transpose_op = rewriter.create(op.getLoc(), val.getType(), @@ -767,9 +769,10 @@ LogicalResult LowerModuleToCUDALLVM(mlir::MLIRContext* context, ModuleOp module) AddLowerToLinalgMemRefPasses(pm); pm.addNestedPass( createConvertLinalgToParallelLoopsPass()); // convert-linalg-to-parallel-loops - pm.addPass(createMapSCFToGPUPass()); // gpu-greedy-parallel-loop-mapping - pm.addPass(createParallelLoopToGpuPass()); // convert-parallel-loops-to-gpu - pm.addPass(createGpuKernelOutliningPass()); // gpu-kernel-outlining + pm.addNestedPass(createGpuMapParallelLoopsPass()); // gpu-map-parallel-loops + pm.addPass(createParallelLoopToGpuPass()); // convert-parallel-loops-to-gpu + pm.addPass(createGpuLauchSinkIndexComputationsPass()); + pm.addPass(createGpuKernelOutliningPass()); // gpu-kernel-outlining pm.addNestedPass(createBufferHostRegisterPass()); // buffer-host-register pm.addPass(createCanonicalizerPass()); // canonicalize // -pass-pipeline='gpu.module([PASS1][PASS2]...)' @@ -779,6 +782,7 @@ LogicalResult LowerModuleToCUDALLVM(mlir::MLIRContext* context, ModuleOp module) pm.addNestedPass(createSerializeToCubinPass()); // out-of-tree-gpu-to-cubin pm.addNestedPass(createGpuCopyArgPass()); // buffer-host-register pm.addPass(createGpuToLLVMConversionPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); // 
reconcile-unrealized-casts if (enable_ir_printing) pm.enableIRPrinting(); return pm.run(module); } diff --git a/oneflow/ir/oneflow-extension/CMakeLists.txt b/oneflow/ir/oneflow-extension/CMakeLists.txt index 8a0b21aa8f3..e7e2f1fbd18 100644 --- a/oneflow/ir/oneflow-extension/CMakeLists.txt +++ b/oneflow/ir/oneflow-extension/CMakeLists.txt @@ -11,7 +11,7 @@ oneflow_add_mlir_library( MLIRIR MLIRParser MLIRPass - MLIRSPIRV + MLIRSPIRVDialect MLIRTranslateLib MLIRSupport MLIROneFlow diff --git a/oneflow/ir/oneflow-extension/extension.cpp b/oneflow/ir/oneflow-extension/extension.cpp index 130ea9b11f4..9954ed6dd8d 100644 --- a/oneflow/ir/oneflow-extension/extension.cpp +++ b/oneflow/ir/oneflow-extension/extension.cpp @@ -77,8 +77,8 @@ OpaqueMemRefDescriptor CreateMemRefDescriptor(user_op::Tensor* tensor) { auto desc = new MemRefType(); *desc = mlir::detail::makeStridedMemRefDescriptor( tensor->dptr(), tensor->dptr(), - {tensor->shape().ptr(), tensor->shape().ptr() + tensor->shape().NumAxes()}, - {tensor->shape().ptr(), tensor->shape().ptr() + tensor->shape().NumAxes()}); + {tensor->shape_view().ptr(), tensor->shape_view().ptr() + tensor->shape_view().NumAxes()}, + {tensor->shape_view().ptr(), tensor->shape_view().ptr() + tensor->shape_view().NumAxes()}); auto deleter = [](void const* data) { auto p = static_cast(data); delete p; @@ -92,8 +92,8 @@ OpaqueMemRefDescriptor CreateMutMemRefDescriptor(user_op::Tensor* tensor) { auto desc = new MemRefType(); *desc = mlir::detail::makeStridedMemRefDescriptor( tensor->mut_dptr(), tensor->mut_dptr(), - {tensor->shape().ptr(), tensor->shape().ptr() + tensor->shape().NumAxes()}, - {tensor->shape().ptr(), tensor->shape().ptr() + tensor->shape().NumAxes()}); + {tensor->shape_view().ptr(), tensor->shape_view().ptr() + tensor->shape_view().NumAxes()}, + {tensor->shape_view().ptr(), tensor->shape_view().ptr() + tensor->shape_view().NumAxes()}); auto deleter = [](void const* data) { auto p = static_cast(data); delete p; @@ -120,13 +120,13 @@ llvm::SmallVector GetMLIRCInterfaceArgs( for (auto& pair : ctx->inputs()) { auto tensor = ctx->Tensor4ArgNameAndIndex(pair.first, pair.second); auto ref = SwitchCreateMemRefDescriptor( - SwitchCase(tensor->shape().NumAxes(), tensor->data_type()), tensor); + SwitchCase(tensor->shape_view().NumAxes(), tensor->data_type()), tensor); args.push_back(ref); } for (auto& pair : ctx->outputs()) { auto tensor = ctx->Tensor4ArgNameAndIndex(pair.first, pair.second); auto ref = SwitchCreateMutMemRefDescriptor( - SwitchCase(tensor->shape().NumAxes(), tensor->data_type()), tensor); + SwitchCase(tensor->shape_view().NumAxes(), tensor->data_type()), tensor); args.push_back(ref); } return args; diff --git a/oneflow/ir/oneflow-opt/oneflow-opt.cpp b/oneflow/ir/oneflow-opt/oneflow-opt.cpp index 0496d741603..f8b35f58d59 100644 --- a/oneflow/ir/oneflow-opt/oneflow-opt.cpp +++ b/oneflow/ir/oneflow-opt/oneflow-opt.cpp @@ -47,7 +47,7 @@ int32_t main(int32_t argc, char** argv) { mlir::registerAllPasses(); mlir::registerTestOneFlowTraitsPass(); mlir::registerLowerOneFlowToTosaPassPass(); - mlir::registerMapSCFToGPUPassPass(); + mlir::registerGpuMapParallelLoopsPassPass(); mlir::registerBufferHostRegisterPassPass(); mlir::registerGpuCopyArgPassPass(); #ifdef WITH_MLIR_CUDA_CODEGEN diff --git a/oneflow/ir/oneflow-runner/CMakeLists.txt b/oneflow/ir/oneflow-runner/CMakeLists.txt index d594362192b..9c5a601af5f 100644 --- a/oneflow/ir/oneflow-runner/CMakeLists.txt +++ b/oneflow/ir/oneflow-runner/CMakeLists.txt @@ -16,7 +16,7 @@ target_link_libraries( 
MLIRExecutionEngine MLIRIR MLIRJitRunner - MLIRLLVMIR + MLIRLLVMIRTransforms MLIRLLVMToLLVMIRTranslation MLIRToLLVMIRTranslationRegistration MLIRParser diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt b/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt index 5ce5c097953..539021f8f54 100644 --- a/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt +++ b/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt @@ -14,7 +14,7 @@ oneflow_add_mlir_library( MLIRIR MLIRParser MLIRPass - MLIRSPIRV + MLIRSPIRVDialect MLIRTranslateLib MLIRSupport MLIROneFlow diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/Importer.cpp b/oneflow/ir/oneflow-translate/lib/OneFlow/Importer.cpp index 5386629fd00..97814d09633 100644 --- a/oneflow/ir/oneflow-translate/lib/OneFlow/Importer.cpp +++ b/oneflow/ir/oneflow-translate/lib/OneFlow/Importer.cpp @@ -492,10 +492,7 @@ LogicalResult ConvertCtrlInputs(Operation* op, ::oneflow::OperatorConf& op_conf) if (auto ctrl_ins = GetCtrlIntputOperands(op)) { for (auto ctrl_in : ctrl_ins.getValue()) { op_conf.add_ctrl_in_op_name( - ctrl_in.getDefiningOp() - ->getAttrOfType(OpTrait::IsOpConfCompatible::getOpNameAttr()) - .getValue() - .str()); + OpTrait::IsOpConfCompatible::getOpName(ctrl_in.getDefiningOp()).str()); } } return success(); @@ -675,9 +672,8 @@ llvm::Optional GetOutputLbn(OpResult result) { auto size = std::get<1>(name_size_tuple); if ((size_sum + size) > result_number) { const uint32_t bn_i = result_number - size_sum; - return def_op->getAttrOfType(OpTrait::IsOpConfCompatible::getOpNameAttr()) - .str() - + "/" + name + "_" + std::to_string(bn_i); + return OpTrait::IsOpConfCompatible::getOpName(def_op).str() + "/" + name + "_" + + std::to_string(bn_i); } size_sum += size; } @@ -946,7 +942,7 @@ LogicalResult ConvertVariableOpConf(VariableOp op, ::oneflow::OperatorConf* op_c // all operands are ctrl_inputs for (const auto& operand : op->getOperands()) { op_conf->add_ctrl_in_op_name( - operand.getDefiningOp()->getAttrOfType("op_name").getValue().str()); + OpTrait::IsOpConfCompatible::getOpName(operand.getDefiningOp()).str()); } if (auto floatInit = op.float_initializer()) { var_op_conf->mutable_initializer()->mutable_constant_conf()->set_value( @@ -1002,7 +998,7 @@ LogicalResult ConvertInputOpConf(InputOp op, ::oneflow::OperatorConf* op_conf) { // operand 0 is block argument, others are ctrl_inputs for (size_t i = 1; i < op->getNumOperands(); ++i) { op_conf->add_ctrl_in_op_name( - op->getOperand(i).getDefiningOp()->getAttrOfType("op_name").getValue().str()); + OpTrait::IsOpConfCompatible::getOpName(op->getOperand(i).getDefiningOp()).str()); } return success(); @@ -1054,7 +1050,7 @@ LogicalResult ConvertOutputOpConf(OutputOp op, ::oneflow::OperatorConf* op_conf) output_op_conf->set_in(output_lbn); for (size_t i = 1; i < op->getNumOperands(); ++i) { op_conf->add_ctrl_in_op_name( - op->getOperand(i).getDefiningOp()->getAttrOfType("op_name").getValue().str()); + OpTrait::IsOpConfCompatible::getOpName(op->getOperand(i).getDefiningOp()).str()); } return success(); } diff --git a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir index 34ee5b499dc..3115bad55c6 100644 --- a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir +++ b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir @@ -4,7 +4,7 @@ // RUN: -tensor-bufferize -func-bufferize -buffer-results-to-out-params \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm \ // RUN: -convert-func-to-llvm -convert-memref-to-llvm 
-reconcile-unrealized-casts --print-after-all \ -// RUN: | oneflow-translate -mlir-to-llvmir | clang -x ir - -c -o test.o +// RUN: | oneflow-translate -mlir-to-llvmir builtin.module { func.func @Graph_0(%arg0: tensor<2xf32>) -> tensor<2xf32> { diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir b/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir index a6a7db89b1b..9eaf154ac6f 100644 --- a/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir +++ b/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir @@ -1,4 +1,4 @@ -// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-greedy-parallel-loop-mapping \ +// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-map-parallel-loops \ // RUN: -convert-parallel-loops-to-gpu -gpu-kernel-outlining -buffer-host-register -canonicalize \ // RUN: -pass-pipeline='gpu.module(strip-debuginfo,lower-affine,convert-gpu-to-nvvm,out-of-tree-gpu-to-cubin)' \ // RUN: --func-bufferize -buffer-results-to-out-params -gpu-copy-arg --tensor-bufferize \ @@ -12,7 +12,7 @@ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext \ // RUN: --entry-point-result=void -// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-greedy-parallel-loop-mapping \ +// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-map-parallel-loops \ // RUN: -convert-parallel-loops-to-gpu -gpu-kernel-outlining -buffer-host-register -canonicalize \ // RUN: -pass-pipeline='gpu.module(strip-debuginfo,lower-affine,convert-gpu-to-nvvm,out-of-tree-gpu-to-cubin)' \ // RUN: --func-bufferize --tensor-bufferize \ @@ -25,13 +25,13 @@ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void -func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> { +func.func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> { %0 = "oneflow.cast"(%arg0) {device_name = ["@0:0"], device_tag = "cuda", dtype = 2 : i32, hierarchy = [1], op_name = "Cast_289", output_lbns = ["Cast_289/out_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xi64>) -> tensor<3x3xf32> %1 = "oneflow.scalar_mul_by_tensor"(%0, %arg1) {device_name = ["@0:0"], device_tag = "cuda", hierarchy = [1], op_name = "ScalarMulByTensor_290", output_lbns = ["ScalarMulByTensor_290/y_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xf32>, tensor<1xf32>) -> tensor<3x3xf32> return %1 : tensor<3x3xf32> } -func @main() { +func.func @main() { %a_data = memref.alloc() : memref<3x3xi64> %b_data = memref.alloc() : memref<1xf32> %a = bufferization.to_tensor %a_data : memref<3x3xi64> @@ -40,15 +40,15 @@ func @main() { %c = call @Cast_289__FUSE__ScalarMulByTensor_290(%a, %b) : (tensor<3x3xi64>, tensor<1xf32>) -> (tensor<3x3xf32>) %c_buffer = bufferization.to_memref %c : memref<3x3xf32> %cast_c_buffer = memref.cast %c_buffer : memref<3x3xf32> to memref<*xf32> - call @print_memref_f32(%cast_c_buffer) : (memref<*xf32>) -> () + call 
@printMemrefF32(%cast_c_buffer) : (memref<*xf32>) -> () // TODO: use real number // CHECK: [3, 3] %cast_a_data = memref.cast %a_data : memref<3x3xi64> to memref<*xi64> %cast_b_data = memref.cast %b_data : memref<1xf32> to memref<*xf32> - call @print_memref_i64(%cast_a_data) : (memref<*xi64>) -> () - call @print_memref_f32(%cast_b_data) : (memref<*xf32>) -> () + call @printMemrefI64(%cast_a_data) : (memref<*xi64>) -> () + call @printMemrefF32(%cast_b_data) : (memref<*xf32>) -> () return } -func private @print_memref_f32(memref<*xf32>) -func private @print_memref_i64(memref<*xi64>) +func.func private @printMemrefF32(memref<*xf32>) +func.func private @printMemrefI64(memref<*xi64>) diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir index 3371acad706..f63e65b7431 100644 --- a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir +++ b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir @@ -1,8 +1,8 @@ -// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-greedy-parallel-loop-mapping \ +// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-map-parallel-loops \ // RUN: -convert-parallel-loops-to-gpu -gpu-kernel-outlining -buffer-host-register -canonicalize \ // RUN: -pass-pipeline='gpu.module(strip-debuginfo,lower-affine,convert-gpu-to-nvvm,out-of-tree-gpu-to-cubin)' \ // RUN: --func-bufferize -buffer-results-to-out-params -gpu-copy-arg -func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> { +func.func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> { %0 = "oneflow.cast"(%arg0) {device_name = ["@0:0"], device_tag = "cuda", dtype = 2 : i32, hierarchy = [1], op_name = "Cast_289", output_lbns = ["Cast_289/out_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xi64>) -> tensor<3x3xf32> %1 = "oneflow.scalar_mul_by_tensor"(%0, %arg1) {device_name = ["@0:0"], device_tag = "cuda", hierarchy = [1], op_name = "ScalarMulByTensor_290", output_lbns = ["ScalarMulByTensor_290/y_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xf32>, tensor<1xf32>) -> tensor<3x3xf32> return %1 : tensor<3x3xf32> diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir index c5aac6f8e94..6f3d14cf212 100644 --- a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir +++ b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir @@ -8,7 +8,7 @@ // RUN: --entry-point-result=void \ // RUN: | FileCheck %s // CHECK: [{{(35, ){34}35}}] -func @main() { +func.func @main() { %arg = memref.alloc() : memref<35xf32> %dst = memref.cast %arg : memref<35xf32> to memref %one = arith.constant 1 : index @@ -28,8 +28,8 @@ func @main() { memref.store %res, %dst[%tx] : memref gpu.terminator } - call @print_memref_f32(%cast_dst) : (memref<*xf32>) -> () + call @printMemrefF32(%cast_dst) : (memref<*xf32>) -> () return } -func private @print_memref_f32(memref<*xf32>) +func.func private @printMemrefF32(memref<*xf32>) diff --git a/oneflow/ir/test/OneFlow/folding/test_conv_bn.py b/oneflow/ir/test/OneFlow/folding/test_conv_bn.py index 1b939a891c0..f7c448ce404 100644 --- a/oneflow/ir/test/OneFlow/folding/test_conv_bn.py +++ 
b/oneflow/ir/test/OneFlow/folding/test_conv_bn.py @@ -31,7 +31,7 @@ def _test_fuse_conv_bn(test_case): data = flow.randn(1, 3, 224, 224) - model = resnet50(pretrained=True, progress=True) + model = resnet50(pretrained=False, progress=True) model.eval() eager_res = model(data) @@ -47,7 +47,7 @@ def build(self, *input): lazy_res = graph(data) test_case.assertTrue( - np.allclose(eager_res.numpy(), lazy_res.numpy(), rtol=1e-5, atol=1e-5) + np.allclose(eager_res.numpy(), lazy_res.numpy(), rtol=1e-2, atol=1e-2) ) diff --git a/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py b/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py index 085d72f5c93..c07e307f822 100644 --- a/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py +++ b/oneflow/ir/test/OneFlow/folding/test_simple_multiply.py @@ -87,10 +87,16 @@ def build(self, *args): class TestFoldMultiply(oneflow.unittest.TestCase): def test_fold_multiply(test_case): _test_fold_multiply(test_case, MultiplyModel, with_cuda=False) + + @unittest.skipUnless(oneflow.sysconfig.with_cuda(), "only test cpu cases") + def test_fold_multiply_cuda(test_case): _test_fold_multiply(test_case, MultiplyModel, with_cuda=True) def test_fold_multiply_complex(test_case): _test_fold_multiply(test_case, MultiplyModelComplex, with_cuda=False) + + @unittest.skipUnless(oneflow.sysconfig.with_cuda(), "only test cpu cases") + def test_fold_multiply_complex_cuda(test_case): _test_fold_multiply(test_case, MultiplyModelComplex, with_cuda=True) def test_fold_multiply_with_input(test_case): @@ -98,8 +104,10 @@ def test_fold_multiply_with_input(test_case): b = flow.tensor([9, -1], dtype=flow.float32) _test_fold_multiply(test_case, MultiplyModelWithInput, False, a, b) - a = a.to("cuda") - b = b.to("cuda") + @unittest.skipUnless(oneflow.sysconfig.with_cuda(), "only test cpu cases") + def test_fold_multiply_with_input_cuda(test_case): + a = flow.tensor([3, 7], dtype=flow.float32, device="cuda") + b = flow.tensor([9, -1], dtype=flow.float32, device="cuda") _test_fold_multiply(test_case, MultiplyModelWithInput, True, a, b) diff --git a/oneflow/ir/test/OneFlow/lower_to_tosa.mlir b/oneflow/ir/test/OneFlow/lower_to_tosa.mlir index df5f91c3129..f65ed33275c 100644 --- a/oneflow/ir/test/OneFlow/lower_to_tosa.mlir +++ b/oneflow/ir/test/OneFlow/lower_to_tosa.mlir @@ -1,8 +1,7 @@ // RUN: oneflow-opt -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -tensor-bufferize -func-bufferize -buffer-results-to-out-params -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -convert-func-to-llvm -convert-memref-to-llvm -reconcile-unrealized-casts --print-after-all %s -// RUN: oneflow-opt -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -tensor-bufferize -func-bufferize -buffer-results-to-out-params -finalizing-bufferize -canonicalize %s module { - func @Cast_1__FUSE__ScalarMulByTensor_2(%arg0: tensor<96x96xi64>, %arg1: tensor<1xf32>) -> tensor<96x96xf32> { + func.func @Cast_1__FUSE__ScalarMulByTensor_2(%arg0: tensor<96x96xi64>, %arg1: tensor<1xf32>) -> tensor<96x96xf32> { %0 = "oneflow.cast"(%arg0) {device_name = ["0:0"], device_tag = "cpu", dtype = 2 : i32, hierarchy = [1], op_name = "Cast_1", op_type_name = "cast", scope_symbol_id = 4611686018427416574 : i64} : (tensor<96x96xi64>) -> tensor<96x96xf32> %1 = "oneflow.scalar_mul_by_tensor"(%0, %arg1) {device_name = ["0:0"], device_tag = "cpu", hierarchy = [1], op_name = "ScalarMulByTensor_2", 
op_type_name = "scalar_mul_by_tensor", scope_symbol_id = 4611686018427416574 : i64} : (tensor<96x96xf32>, tensor<1xf32>) -> tensor<96x96xf32> return %1 : tensor<96x96xf32> diff --git a/oneflow/ir/test/OneFlow/traits.mlir b/oneflow/ir/test/OneFlow/traits.mlir index ed8eb3a5678..55506828b84 100644 --- a/oneflow/ir/test/OneFlow/traits.mlir +++ b/oneflow/ir/test/OneFlow/traits.mlir @@ -1,17 +1,17 @@ // RUN: oneflow-opt -test-oneflow-trait-folder %s | FileCheck %s -// CHECK-LABEL: func @testSingleIdempotent +// CHECK-LABEL: func.func @testSingleIdempotent // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testSingleIdempotent(%arg0 : tensor) -> tensor { +func.func @testSingleIdempotent(%arg0 : tensor) -> tensor { // CHECK: [[IDEMPOTENT:%.+]] = "oneflow.relu"([[ARG0]]) %0 = "oneflow.relu"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor // CHECK: return [[IDEMPOTENT]] return %0: tensor } -// CHECK-LABEL: func @testDoubleIdempotent +// CHECK-LABEL: func.func @testDoubleIdempotent // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testDoubleIdempotent(%arg0: tensor) -> tensor { +func.func @testDoubleIdempotent(%arg0: tensor) -> tensor { // CHECK: [[IDEMPOTENT:%.+]] = "oneflow.relu"([[ARG0]]) %0 = "oneflow.relu"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor %1 = "oneflow.relu"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor @@ -19,9 +19,9 @@ func @testDoubleIdempotent(%arg0: tensor) -> tensor { return %1: tensor } -// CHECK-LABEL: func @testTripleIdempotent +// CHECK-LABEL: func.func @testTripleIdempotent // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testTripleIdempotent(%arg0: tensor) -> tensor { +func.func @testTripleIdempotent(%arg0: tensor) -> tensor { // CHECK: [[IDEMPOTENT:%.+]] = "oneflow.relu"([[ARG0]]) %0 = "oneflow.relu"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor %1 = "oneflow.relu"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor @@ -30,18 +30,18 @@ func @testTripleIdempotent(%arg0: tensor) -> tensor { return %2: tensor } -// CHECK-LABEL: func @testDoubleInvolution +// CHECK-LABEL: func.func @testDoubleInvolution // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testDoubleInvolution(%arg0: tensor) -> tensor { +func.func @testDoubleInvolution(%arg0: tensor) -> tensor { %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor %1 = "oneflow.negative"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor // CHECK: return [[ARG0]] return %1: tensor } -// CHECK-LABEL: func @testTripleInvolution +// CHECK-LABEL: func.func @testTripleInvolution // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testTripleInvolution(%arg0: tensor) -> tensor { +func.func @testTripleInvolution(%arg0: tensor) -> tensor { // CHECK: [[INVOLUTION:%.+]] = "oneflow.negative"([[ARG0]]) %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = 
"Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor %1 = "oneflow.negative"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor @@ -50,9 +50,9 @@ func @testTripleInvolution(%arg0: tensor) -> tensor { return %2: tensor } -// CHECK-LABEL: func @testFailedInvolutionFoldDueToDifferentPlacement +// CHECK-LABEL: func.func @testFailedInvolutionFoldDueToDifferentPlacement // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testFailedInvolutionFoldDueToDifferentPlacement(%arg0: tensor) -> tensor { +func.func @testFailedInvolutionFoldDueToDifferentPlacement(%arg0: tensor) -> tensor { %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor %1 = "oneflow.negative"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["1:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor // CHECK: [[INVOLUTION:%.+]] = "oneflow.negative"(%1) @@ -61,9 +61,9 @@ func @testFailedInvolutionFoldDueToDifferentPlacement(%arg0: tensor) -> ten return %2: tensor } -// CHECK-LABEL: func @testFailedInvolutionFoldDueToDifferentDevice +// CHECK-LABEL: func.func @testFailedInvolutionFoldDueToDifferentDevice // CHECK-SAME: ([[ARG0:%.+]]: tensor) -func @testFailedInvolutionFoldDueToDifferentDevice(%arg0: tensor) -> tensor { +func.func @testFailedInvolutionFoldDueToDifferentDevice(%arg0: tensor) -> tensor { %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor %1 = "oneflow.negative"(%0) {device_tag = "cpu", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor) -> tensor // CHECK: [[INVOLUTION:%.+]] = "oneflow.negative"(%1) diff --git a/oneflow/ir/test/OneFlow/folding/test_conv_bn_auto_nhwc.py b/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py similarity index 78% rename from oneflow/ir/test/OneFlow/folding/test_conv_bn_auto_nhwc.py rename to oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py index 1028592acee..8202c49ae89 100644 --- a/oneflow/ir/test/OneFlow/folding/test_conv_bn_auto_nhwc.py +++ b/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py @@ -29,11 +29,14 @@ os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] = "1" -def _test_fuse_conv_bn(test_case): - data = flow.randn(1, 3, 224, 224).to("cuda") - - model = resnet50(pretrained=True, progress=True) - model.to("cuda") +def _test_fuse_conv_bn(test_case, with_cuda): + data = flow.randn(1, 3, 224, 224) + if with_cuda: + data = data.to("cuda") + + model = resnet50(pretrained=False, progress=True) + if with_cuda: + model.to("cuda") model.eval() eager_res = model(data) @@ -49,14 +52,15 @@ def build(self, *input): lazy_res = graph(data) test_case.assertTrue( - np.allclose(eager_res.numpy(), lazy_res.numpy(), rtol=1e-5, atol=1e-5) + np.allclose(eager_res.numpy(), lazy_res.numpy(), rtol=1e-2, atol=1e-2) ) @flow.unittest.skip_unless_1n1d() class TestFuseConvBn(oneflow.unittest.TestCase): - def test_fuse_conv_bn(test_case): - _test_fuse_conv_bn(test_case) + @unittest.skipUnless(oneflow.sysconfig.with_cuda(), "only test cpu cases") + def test_fuse_conv_bn_cuda(test_case): + _test_fuse_conv_bn(test_case, True) 
if __name__ == "__main__": diff --git a/oneflow/user/data/coco_parser.cpp b/oneflow/user/data/coco_parser.cpp index 484073b2703..69e13e29f50 100644 --- a/oneflow/user/data/coco_parser.cpp +++ b/oneflow/user/data/coco_parser.cpp @@ -64,30 +64,31 @@ void COCOParser::Parse(BatchType& batch_data, user_op::KernelComputeContext* ctx } }); // dynamic batch size - if (image_tensor->shape().elem_cnt() != batch_data.size()) { - CHECK_EQ(image_tensor->shape().NumAxes(), 1); - image_tensor->mut_shape().Set(0, batch_data.size()); + if (image_tensor->shape_view().elem_cnt() != batch_data.size()) { + CHECK_EQ(image_tensor->shape_view().NumAxes(), 1); + image_tensor->mut_shape_view().Set(0, batch_data.size()); } - if (image_id_tensor && image_id_tensor->shape().At(0) != batch_data.size()) { - image_id_tensor->mut_shape().Set(0, batch_data.size()); + if (image_id_tensor && image_id_tensor->shape_view().At(0) != batch_data.size()) { + image_id_tensor->mut_shape_view().Set(0, batch_data.size()); } - if (image_size_tensor && image_size_tensor->shape().At(0) != batch_data.size()) { - image_size_tensor->mut_shape().Set(0, batch_data.size()); + if (image_size_tensor && image_size_tensor->shape_view().At(0) != batch_data.size()) { + image_size_tensor->mut_shape_view().Set(0, batch_data.size()); } - if (bbox_tensor && bbox_tensor->shape().elem_cnt() != batch_data.size()) { - CHECK_EQ(bbox_tensor->shape().NumAxes(), 1); - bbox_tensor->mut_shape().Set(0, batch_data.size()); + if (bbox_tensor && bbox_tensor->shape_view().elem_cnt() != batch_data.size()) { + CHECK_EQ(bbox_tensor->shape_view().NumAxes(), 1); + bbox_tensor->mut_shape_view().Set(0, batch_data.size()); } - if (label_tensor && label_tensor->shape().elem_cnt() != batch_data.size()) { - CHECK_EQ(label_tensor->shape().NumAxes(), 1); - label_tensor->mut_shape().Set(0, batch_data.size()); + if (label_tensor && label_tensor->shape_view().elem_cnt() != batch_data.size()) { + CHECK_EQ(label_tensor->shape_view().NumAxes(), 1); + label_tensor->mut_shape_view().Set(0, batch_data.size()); } - if (segm_tensor && segm_index_tensor && segm_tensor->shape().elem_cnt() != batch_data.size()) { - CHECK_EQ(segm_tensor->shape().NumAxes(), 1); - CHECK_EQ(segm_index_tensor->shape().NumAxes(), 1); - CHECK_EQ(segm_tensor->shape().elem_cnt(), segm_index_tensor->shape().elem_cnt()); - segm_tensor->mut_shape().Set(0, batch_data.size()); - segm_index_tensor->mut_shape().Set(0, batch_data.size()); + if (segm_tensor && segm_index_tensor + && segm_tensor->shape_view().elem_cnt() != batch_data.size()) { + CHECK_EQ(segm_tensor->shape_view().NumAxes(), 1); + CHECK_EQ(segm_index_tensor->shape_view().NumAxes(), 1); + CHECK_EQ(segm_tensor->shape_view().elem_cnt(), segm_index_tensor->shape_view().elem_cnt()); + segm_tensor->mut_shape_view().Set(0, batch_data.size()); + segm_index_tensor->mut_shape_view().Set(0, batch_data.size()); } } diff --git a/oneflow/user/data/ofrecord_image_classification_dataset.cpp b/oneflow/user/data/ofrecord_image_classification_dataset.cpp index 979acd56365..1cefd7e1a0a 100644 --- a/oneflow/user/data/ofrecord_image_classification_dataset.cpp +++ b/oneflow/user/data/ofrecord_image_classification_dataset.cpp @@ -103,7 +103,7 @@ void DecodeWorker(const std::string& image_feature_name, const std::string& labe CHECK(receive_status == kBufferStatusSuccess); OFRecord record; CHECK(record.ParseFromArray(serialized_record.data(), - serialized_record.shape().elem_cnt())); + serialized_record.shape_view().elem_cnt())); ImageClassificationDataInstance instance; 
DecodeImageFromOFRecord(record, image_feature_name, color_space, &instance.image); DecodeLabelFromFromOFRecord(record, label_feature_name, &instance.label); diff --git a/oneflow/user/data/ofrecord_image_classification_parser.h b/oneflow/user/data/ofrecord_image_classification_parser.h index 54cae5741b0..c961c8c3b2e 100644 --- a/oneflow/user/data/ofrecord_image_classification_parser.h +++ b/oneflow/user/data/ofrecord_image_classification_parser.h @@ -38,12 +38,12 @@ class OFRecordImageClassificationParser final : public ParserTensor4ArgNameAndIndex("image", 0); - CHECK_EQ(image_tensor->shape().NumAxes(), 1); - CHECK_EQ(image_tensor->shape().At(0), batch_size); + CHECK_EQ(image_tensor->shape_view().NumAxes(), 1); + CHECK_EQ(image_tensor->shape_view().At(0), batch_size); auto* image_buffers = image_tensor->mut_dptr(); user_op::Tensor* label_tensor = ctx->Tensor4ArgNameAndIndex("label", 0); - CHECK_EQ(label_tensor->shape().NumAxes(), 1); - CHECK_EQ(label_tensor->shape().At(0), batch_size); + CHECK_EQ(label_tensor->shape_view().NumAxes(), 1); + CHECK_EQ(label_tensor->shape_view().At(0), batch_size); auto* label_buffers = label_tensor->mut_dptr(); for (size_t i = 0; i < batch_data.size(); ++i) { auto& instance = batch_data[i]; diff --git a/oneflow/user/data/ofrecord_parser.h b/oneflow/user/data/ofrecord_parser.h index dc2e20ea3a2..fe313e19724 100644 --- a/oneflow/user/data/ofrecord_parser.h +++ b/oneflow/user/data/ofrecord_parser.h @@ -40,9 +40,9 @@ class OFRecordParser final : public Parser { auto& sample = batch_data[i]; CHECK(dptr[i].ParseFromArray(sample.data(), sample.nbytes())); }); - if (batch_data.size() != out_tensor->shape().elem_cnt()) { - CHECK_EQ(out_tensor->mut_shape().NumAxes(), 1); - out_tensor->mut_shape().Set(0, batch_data.size()); + if (batch_data.size() != out_tensor->shape_view().elem_cnt()) { + CHECK_EQ(out_tensor->mut_shape_view().NumAxes(), 1); + out_tensor->mut_shape_view().Set(0, batch_data.size()); } } }; diff --git a/oneflow/user/image/image_util.cpp b/oneflow/user/image/image_util.cpp index 6ad6dc83305..a69d877213f 100644 --- a/oneflow/user/image/image_util.cpp +++ b/oneflow/user/image/image_util.cpp @@ -39,10 +39,10 @@ void ImageUtil::ConvertColor(const std::string& input_color, const cv::Mat& inpu } cv::Mat GenCvMat4ImageBuffer(const TensorBuffer& image_buffer) { - CHECK_EQ(image_buffer.shape().NumAxes(), 3); - int h = image_buffer.shape().At(0); - int w = image_buffer.shape().At(1); - int channels = image_buffer.shape().At(2); + CHECK_EQ(image_buffer.shape_view().NumAxes(), 3); + int h = image_buffer.shape_view().At(0); + int w = image_buffer.shape_view().At(1); + int channels = image_buffer.shape_view().At(2); DataType data_type = image_buffer.data_type(); if (channels == 1 && data_type == DataType::kUInt8) { return CreateMatWithPtr(h, w, CV_8UC1, image_buffer.data()); @@ -60,19 +60,19 @@ cv::Mat GenCvMat4ImageBuffer(const TensorBuffer& image_buffer) { cv::Mat GenCvMat4ImageTensor(const user_op::Tensor* image_tensor, int image_offset) { int has_batch_dim = 0; - if (image_tensor->shape().NumAxes() == 3) { + if (image_tensor->shape_view().NumAxes() == 3) { has_batch_dim = 0; image_offset = 0; - } else if (image_tensor->shape().NumAxes() == 4) { + } else if (image_tensor->shape_view().NumAxes() == 4) { has_batch_dim = 1; CHECK_GE(image_offset, 0); - CHECK_LT(image_offset, image_tensor->shape().At(0)); + CHECK_LT(image_offset, image_tensor->shape_view().At(0)); } else { UNIMPLEMENTED(); } - int h = image_tensor->shape().At(0 + has_batch_dim); - int w = 
image_tensor->shape().At(1 + has_batch_dim); - int c = image_tensor->shape().At(2 + has_batch_dim); + int h = image_tensor->shape_view().At(0 + has_batch_dim); + int w = image_tensor->shape_view().At(1 + has_batch_dim); + int c = image_tensor->shape_view().At(2 + has_batch_dim); int elem_offset = image_offset * h * w * c; DataType data_type = image_tensor->data_type(); if (c == 1 && data_type == DataType::kUInt8) { diff --git a/oneflow/user/kernels/acc_kernel.cpp b/oneflow/user/kernels/acc_kernel.cpp index 1773bc5d1bd..cbc718a6188 100644 --- a/oneflow/user/kernels/acc_kernel.cpp +++ b/oneflow/user/kernels/acc_kernel.cpp @@ -31,13 +31,13 @@ class AccKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()); CHECK_EQ(in->data_type(), out->data_type()); std::unique_ptr primitive = ep::primitive::NewPrimitive(ctx->device_type(), in->data_type()); CHECK(primitive); primitive->Launch(ctx->stream(), out->dptr(), in->dptr(), out->mut_dptr(), - in->shape().elem_cnt()); + in->shape_view().elem_cnt()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp b/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp index d26c3a8541f..ba4ab9544eb 100644 --- a/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp +++ b/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp @@ -108,7 +108,7 @@ void AvgBackwardCompute(user_op::KernelComputeContext* ctx, const int32_t& dim) const T* out_ptr = grad_output->dptr(); T* in_ptr = grad_input->mut_dptr(); - std::fill(in_ptr, in_ptr + grad_input->shape().elem_cnt(), static_cast(0)); + std::fill(in_ptr, in_ptr + grad_input->shape_view().elem_cnt(), static_cast(0)); const int64_t input_width = in.Count(4); const int64_t output_width = out.Count(4); diff --git a/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu b/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu index 3310183babf..8648576c513 100644 --- a/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu +++ b/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu @@ -150,7 +150,7 @@ void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { const Shape& in = GetShape5D(x_shape, data_format, dim); const Shape& out = GetShape5D(y_shape, data_format, dim); - const int out_elems = out_tensor->shape().elem_cnt(); + const int out_elems = out_tensor->shape_view().elem_cnt(); RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); @@ -171,8 +171,8 @@ void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { const Shape& in = GetShape5D(dx_shape, data_format, dim); const Shape& out = GetShape5D(dy_shape, data_format, dim); - const int in_elems = in_tensor->shape().elem_cnt(); - const int out_elems = out_tensor->shape().elem_cnt(); + const int in_elems = in_tensor->shape_view().elem_cnt(); + const int out_elems = out_tensor->shape_view().elem_cnt(); RUN_CUDA_KERNEL((InitPtr), ctx->stream(), in_elems, in_elems, in_ptr); RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, diff --git a/oneflow/user/kernels/add_n_kernel.cpp b/oneflow/user/kernels/add_n_kernel.cpp index db382a549dc..ca0c396e88a 
100644 --- a/oneflow/user/kernels/add_n_kernel.cpp +++ b/oneflow/user/kernels/add_n_kernel.cpp @@ -44,13 +44,13 @@ class AddNKernel : public OpKernel, public CudaGraphSupport { CHECK(primitive); Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); const DataType data_type = out->data_type(); - const size_t count = out->shape().elem_cnt(); + const size_t count = out->shape_view().elem_cnt(); if (count == 0) { return; } size_t in_num = ctx->inputs().size(); std::vector srcs(in_num); for (size_t i = 0; i < in_num; ++i) { const Tensor* in_i = ctx->Tensor4ArgNameAndIndex("in", i); - CHECK_EQ(in_i->shape().elem_cnt(), count); + CHECK_EQ(in_i->shape_view().elem_cnt(), count); CHECK_EQ(in_i->data_type(), data_type); srcs[i] = in_i->template dptr(); } diff --git a/oneflow/user/kernels/affine_grid_kernel.cpp b/oneflow/user/kernels/affine_grid_kernel.cpp index dcd4122de37..c33dfe8ce5b 100644 --- a/oneflow/user/kernels/affine_grid_kernel.cpp +++ b/oneflow/user/kernels/affine_grid_kernel.cpp @@ -38,9 +38,9 @@ class AffineGridKernel final : public user_op::OpKernel { bool is_2d_grid = true; if (size.NumAxes() == 5) { is_2d_grid = false; } - int64_t N = theta->shape().At(0); - int64_t theta_h = theta->shape().At(1); - int64_t theta_w = theta->shape().At(2); + int64_t N = theta->shape_view().At(0); + int64_t theta_h = theta->shape_view().At(1); + int64_t theta_w = theta->shape_view().At(2); if (is_2d_grid) { int64_t H = size.At(2); @@ -108,9 +108,9 @@ class AffineGridGradKernel final : public user_op::OpKernel { bool is_2d_grid = true; if (size.NumAxes() == 5) { is_2d_grid = false; } - int64_t N = dtheta->shape().At(0); - int64_t dtheta_h = dtheta->shape().At(1); - int64_t dtheta_w = dtheta->shape().At(2); + int64_t N = dtheta->shape_view().At(0); + int64_t dtheta_h = dtheta->shape_view().At(1); + int64_t dtheta_w = dtheta->shape_view().At(2); if (is_2d_grid) { int64_t H = size.At(2); diff --git a/oneflow/user/kernels/arg_sort_kernel.cpp b/oneflow/user/kernels/arg_sort_kernel.cpp index 9b2eb69bab0..b9db027324d 100644 --- a/oneflow/user/kernels/arg_sort_kernel.cpp +++ b/oneflow/user/kernels/arg_sort_kernel.cpp @@ -29,8 +29,8 @@ class CpuArgSortKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t instance_size = in->shape().At(in->shape().NumAxes() - 1); - const int32_t instance_num = in->shape().elem_cnt() / instance_size; + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; const std::string& direction = ctx->Attr("direction"); const bool is_ascending = direction == "ASCENDING"; const bool is_descending = direction == "DESCENDING"; diff --git a/oneflow/user/kernels/arg_sort_kernel.cu b/oneflow/user/kernels/arg_sort_kernel.cu index c0259ec4b86..9d898089926 100644 --- a/oneflow/user/kernels/arg_sort_kernel.cu +++ b/oneflow/user/kernels/arg_sort_kernel.cu @@ -78,11 +78,11 @@ class GpuArgSortKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(static_cast(tmp_buffer->shape().elem_cnt()), - tmp_buffer->mut_dptr(), in->shape()); + TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr(), in->shape_view()); - 
const int32_t elem_cnt = in->shape().elem_cnt(); - const int32_t instance_size = in->shape().At(in->shape().NumAxes() - 1); + const int32_t elem_cnt = in->shape_view().elem_cnt(); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); const int32_t instance_num = elem_cnt / instance_size; const std::string& direction = ctx->Attr("direction"); InitializeIndices<<Tensor4ArgNameAndIndex("input", 0)->shape().NumAxes(); + int64_t ndims = ctx->Tensor4ArgNameAndIndex("input", 0)->shape_view().NumAxes(); if (ndims == 0) { return; } SwitchNdimCompute(SwitchCase(ndims), ctx); } @@ -47,9 +47,9 @@ class ArgWhereKernel final : public user_op::OpKernel { user_op::Tensor* output_size = ctx->Tensor4ArgNameAndIndex("output_size", 0); user_op::Tensor* tmp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); void* tmp_ptr = tmp ? tmp->mut_dptr() : nullptr; - size_t tmp_size = tmp ? tmp->shape().elem_cnt() * GetSizeOfDataType(tmp->data_type()) : 0; + size_t tmp_size = tmp ? tmp->shape_view().elem_cnt() * GetSizeOfDataType(tmp->data_type()) : 0; ArgWhereKernelUtil::ArgWhere( - ctx->stream(), input->shape(), input->dptr(), tmp_ptr, tmp_size, + ctx->stream(), input->shape_view(), input->dptr(), tmp_ptr, tmp_size, output->mut_dptr(), output_size->mut_dptr()); } }; diff --git a/oneflow/user/kernels/argmax_kernel.cpp b/oneflow/user/kernels/argmax_kernel.cpp index 85d3657a27f..893e8d14159 100644 --- a/oneflow/user/kernels/argmax_kernel.cpp +++ b/oneflow/user/kernels/argmax_kernel.cpp @@ -32,8 +32,8 @@ class CpuArgMaxKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); int64_t* out_ptr = out->mut_dptr(); - const int64_t instance_size = in->shape().At(in->shape().NumAxes() - 1); - const int64_t instance_num = in->shape().elem_cnt() / instance_size; + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; const int64_t num_thread = std::min(instance_num, (int64_t)Global::Get()->thread_num()); const BalancedSplitter bs(instance_num, num_thread); diff --git a/oneflow/user/kernels/argmax_kernel.cu b/oneflow/user/kernels/argmax_kernel.cu index ea36b2f695f..eacd32531cb 100644 --- a/oneflow/user/kernels/argmax_kernel.cu +++ b/oneflow/user/kernels/argmax_kernel.cu @@ -130,11 +130,11 @@ class GpuArgMaxKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t elem_cnt = in->shape().elem_cnt(); - const int32_t instance_size = in->shape().At(in->shape().NumAxes() - 1); + const int32_t elem_cnt = in->shape_view().elem_cnt(); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); const int32_t instance_num = elem_cnt / instance_size; - TmpBufferManager buffer_manager(tmp_buffer->shape().elem_cnt(), tmp_buffer->mut_dptr(), - instance_num); + TmpBufferManager buffer_manager(tmp_buffer->shape_view().elem_cnt(), + tmp_buffer->mut_dptr(), instance_num); ArgMax(in->dptr(), instance_num, instance_size, buffer_manager.TempStoragePtr(), buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), diff --git a/oneflow/user/kernels/as_strided_kernel.cpp b/oneflow/user/kernels/as_strided_kernel.cpp index 808bc9d0650..1a822b51678 100644 --- a/oneflow/user/kernels/as_strided_kernel.cpp +++ b/oneflow/user/kernels/as_strided_kernel.cpp @@ -79,10 +79,10 @@ class CpuAsStridedKernel final : public user_op::OpKernel { const auto 
stride = ctx->Attr>("stride"); const int32_t storage_offset = ctx->Attr("storage_offset"); - size_t dest_num_dims = output->shape().NumAxes(); - const int64_t* dest_dims = output->shape().ptr(); - const size_t input_num = input->shape().Count(0); - const size_t output_num = output->shape().Count(0); + size_t dest_num_dims = output->shape_view().NumAxes(); + const int64_t* dest_dims = output->shape_view().ptr(); + const size_t input_num = input->shape_view().Count(0); + const size_t output_num = output->shape_view().Count(0); AsStridedFunctor()(ctx->stream(), input->dptr(), output->mut_dptr(), dest_dims, stride.data(), dest_num_dims, storage_offset, input_num, output_num); @@ -105,12 +105,13 @@ class CpuAsStridedGradKernel final : public user_op::OpKernel { const auto stride = ctx->Attr>("stride"); const int32_t storage_offset = ctx->Attr("storage_offset"); - size_t dy_num_dims = dy->shape().NumAxes(); - const int64_t* dy_dims = dy->shape().ptr(); - const size_t dx_num = dx->shape().Count(0); - const size_t dy_num = dy->shape().Count(0); + size_t dy_num_dims = dy->shape_view().NumAxes(); + const int64_t* dy_dims = dy->shape_view().ptr(); + const size_t dx_num = dx->shape_view().Count(0); + const size_t dy_num = dy->shape_view().Count(0); - Memset(ctx->stream(), dx->mut_dptr(), 0, dx->shape().Count(0) * sizeof(T)); + Memset(ctx->stream(), dx->mut_dptr(), 0, + dx->shape_view().Count(0) * sizeof(T)); AsStridedGradFunctor()(ctx->stream(), dy->dptr(), dx->mut_dptr(), dy_dims, stride.data(), dy_num_dims, storage_offset, dx_num, dy_num); diff --git a/oneflow/user/kernels/as_strided_kernel.cu b/oneflow/user/kernels/as_strided_kernel.cu index 60df107ef84..2f528e00a0b 100644 --- a/oneflow/user/kernels/as_strided_kernel.cu +++ b/oneflow/user/kernels/as_strided_kernel.cu @@ -134,10 +134,10 @@ class GpuAsStridedKernel final : public user_op::OpKernel { const auto stride = ctx->Attr>("stride"); const int32_t storage_offset = ctx->Attr("storage_offset"); - size_t dest_num_dims = output->shape().NumAxes(); - const int64_t* dest_dims = output->shape().ptr(); - const size_t input_num = input->shape().Count(0); - const size_t output_num = output->shape().Count(0); + size_t dest_num_dims = output->shape_view().NumAxes(); + const int64_t* dest_dims = output->shape_view().ptr(); + const size_t input_num = input->shape_view().Count(0); + const size_t output_num = output->shape_view().Count(0); if (input_num == 0) { // 0-size tensor return; @@ -164,12 +164,13 @@ class GpuAsStridedGradKernel final : public user_op::OpKernel { const auto stride = ctx->Attr>("stride"); const int32_t storage_offset = ctx->Attr("storage_offset"); - size_t dy_num_dims = dy->shape().NumAxes(); - const int64_t* dy_dims = dy->shape().ptr(); - const size_t dx_num = dx->shape().Count(0); - const size_t dy_num = dy->shape().Count(0); + size_t dy_num_dims = dy->shape_view().NumAxes(); + const int64_t* dy_dims = dy->shape_view().ptr(); + const size_t dx_num = dx->shape_view().Count(0); + const size_t dy_num = dy->shape_view().Count(0); - Memset(ctx->stream(), dx->mut_dptr(), 0, dx->shape().Count(0) * sizeof(T)); + Memset(ctx->stream(), dx->mut_dptr(), 0, + dx->shape_view().Count(0) * sizeof(T)); AsStridedGradFunctor()(ctx->stream(), dy->dptr(), dx->mut_dptr(), dy_dims, stride.data(), dy_num_dims, storage_offset, dx_num, dy_num); diff --git a/oneflow/user/kernels/assign_if_kernel.cpp b/oneflow/user/kernels/assign_if_kernel.cpp index eb6e515f52d..2e476d2ec3f 100644 --- a/oneflow/user/kernels/assign_if_kernel.cpp +++ 
b/oneflow/user/kernels/assign_if_kernel.cpp @@ -33,9 +33,10 @@ class AssignIfCPUKernel final : public user_op::OpKernel { const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); if (value->dptr() == ref->dptr()) { return; } - CHECK_EQ(value->shape(), ref->shape()); + CHECK_EQ(value->shape_view(), ref->shape_view()); CHECK_EQ(value->data_type(), ref->data_type()); - const size_t tensor_bytes_size = ref->shape().elem_cnt() * GetSizeOfDataType(ref->data_type()); + const size_t tensor_bytes_size = + ref->shape_view().elem_cnt() * GetSizeOfDataType(ref->data_type()); AutoMemcpy(ctx->stream(), ref->mut_dptr(), value->dptr(), tensor_bytes_size, ref->mem_case(), value->mem_case()); } diff --git a/oneflow/user/kernels/assign_if_kernel.cu b/oneflow/user/kernels/assign_if_kernel.cu index fc79eab85ba..e581b9f577b 100644 --- a/oneflow/user/kernels/assign_if_kernel.cu +++ b/oneflow/user/kernels/assign_if_kernel.cu @@ -37,14 +37,14 @@ class AssignIfGPUKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0); - CHECK_EQ(condition->shape().NumAxes(), 1); - CHECK_EQ(condition->shape().At(0), 1); + CHECK_EQ(condition->shape_view().NumAxes(), 1); + CHECK_EQ(condition->shape_view().At(0), 1); const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); if (value->dptr() == ref->dptr()) { return; } - CHECK_EQ(value->shape(), ref->shape()); + CHECK_EQ(value->shape_view(), ref->shape_view()); CHECK_EQ(value->data_type(), ref->data_type()); - const size_t elem_cnt = ref->shape().elem_cnt(); + const size_t elem_cnt = ref->shape_view().elem_cnt(); AssignGpu<<stream()->As()->cuda_stream()>>>( elem_cnt, condition->dptr(), value->dptr(), ref->mut_dptr()); diff --git a/oneflow/user/kernels/assign_kernel.cpp b/oneflow/user/kernels/assign_kernel.cpp index 583814e8cba..9bd449cd552 100644 --- a/oneflow/user/kernels/assign_kernel.cpp +++ b/oneflow/user/kernels/assign_kernel.cpp @@ -31,9 +31,9 @@ class AssignKernel final : public user_op::OpKernel { user_op::Tensor* ref_tensor = ctx->Tensor4ArgNameAndIndex("ref", 0); if (value_tensor->dptr() == ref_tensor->dptr()) { return; } size_t tensor_bytes_size = - ref_tensor->shape().elem_cnt() * GetSizeOfDataType(ref_tensor->data_type()); + ref_tensor->shape_view().elem_cnt() * GetSizeOfDataType(ref_tensor->data_type()); size_t val_tensor_bytes_size = - value_tensor->shape().elem_cnt() * GetSizeOfDataType(value_tensor->data_type()); + value_tensor->shape_view().elem_cnt() * GetSizeOfDataType(value_tensor->data_type()); CHECK_EQ(tensor_bytes_size, val_tensor_bytes_size); AutoMemcpy(ctx->stream(), ref_tensor->mut_dptr(), value_tensor->dptr(), tensor_bytes_size, ref_tensor->mem_case(), value_tensor->mem_case()); diff --git a/oneflow/user/kernels/avg_pool_kernel.cpp b/oneflow/user/kernels/avg_pool_kernel.cpp index d582f7d2cb3..70915f4c1be 100644 --- a/oneflow/user/kernels/avg_pool_kernel.cpp +++ b/oneflow/user/kernels/avg_pool_kernel.cpp @@ -129,13 +129,13 @@ class AvgPool1dKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const AvgPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape_view().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); 
DimVector y_vector(2); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); + y_vector.at(0) = y->shape_view().At(0) * y->shape_view().At(1); + y_vector.at(1) = y->shape_view().At(2); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); AvgPoolKernelUtil::Avgpool1dForward(ctx->stream(), index_helper, @@ -169,15 +169,15 @@ class AvgPool1dGradKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const AvgPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = dy->shape().elem_cnt(); + const int64_t elem_num = dy->shape_view().elem_cnt(); const T* src = dy->dptr(); T* dest = dx->mut_dptr(); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); DimVector dy_vector(2); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); + dy_vector.at(0) = dy->shape_view().At(0) * dy->shape_view().At(1); + dy_vector.at(1) = dy->shape_view().At(2); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); AvgPoolKernelUtil::Avgpool1dBackward(ctx->stream(), index_helper, @@ -211,14 +211,14 @@ class AvgPool2dKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const AvgPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape_view().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); DimVector y_vector(3); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - y_vector.at(2) = y->shape().At(3); + y_vector.at(0) = y->shape_view().At(0) * y->shape_view().At(1); + y_vector.at(1) = y->shape_view().At(2); + y_vector.at(2) = y->shape_view().At(3); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); AvgPoolKernelUtil::Avgpool2dForward(ctx->stream(), index_helper, @@ -252,17 +252,17 @@ class AvgPool2dGradKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const AvgPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = dy->shape().elem_cnt(); + const int64_t elem_num = dy->shape_view().elem_cnt(); const T* src = dy->dptr(); T* dest = dx->mut_dptr(); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); DimVector dy_vector(3); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); - dy_vector.at(2) = dy->shape().At(3); + dy_vector.at(0) = dy->shape_view().At(0) * dy->shape_view().At(1); + dy_vector.at(1) = dy->shape_view().At(2); + dy_vector.at(2) = dy->shape_view().At(3); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); AvgPoolKernelUtil::Avgpool2dBackward(ctx->stream(), index_helper, @@ -296,15 +296,15 @@ class AvgPool3dKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const AvgPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape_view().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); DimVector y_vector(4); - y_vector.at(0) = 
y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - y_vector.at(2) = y->shape().At(3); - y_vector.at(3) = y->shape().At(4); + y_vector.at(0) = y->shape_view().At(0) * y->shape_view().At(1); + y_vector.at(1) = y->shape_view().At(2); + y_vector.at(2) = y->shape_view().At(3); + y_vector.at(3) = y->shape_view().At(4); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); AvgPoolKernelUtil::Avgpool3dForward(ctx->stream(), index_helper, @@ -338,18 +338,18 @@ class AvgPool3dGradKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const AvgPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = dy->shape().elem_cnt(); + const int64_t elem_num = dy->shape_view().elem_cnt(); const T* src = dy->dptr(); T* dest = dx->mut_dptr(); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); DimVector dy_vector(4); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); - dy_vector.at(2) = dy->shape().At(3); - dy_vector.at(3) = dy->shape().At(4); + dy_vector.at(0) = dy->shape_view().At(0) * dy->shape_view().At(1); + dy_vector.at(1) = dy->shape_view().At(2); + dy_vector.at(2) = dy->shape_view().At(3); + dy_vector.at(3) = dy->shape_view().At(4); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); AvgPoolKernelUtil::Avgpool3dBackward(ctx->stream(), index_helper, diff --git a/oneflow/user/kernels/batch_gather_kernel.cpp b/oneflow/user/kernels/batch_gather_kernel.cpp index 859d7a81c26..6dec116cbdc 100644 --- a/oneflow/user/kernels/batch_gather_kernel.cpp +++ b/oneflow/user/kernels/batch_gather_kernel.cpp @@ -32,12 +32,13 @@ class BatchGatherKernel final : public user_op::OpKernel, public user_op::CudaGr const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t axis = indices->shape().NumAxes() - 1; + const int64_t axis = indices->shape_view().NumAxes() - 1; const Shape flat_out_shape = - Shape({out->shape().Count(0, axis), out->shape().At(axis), out->shape().Count(axis + 1)}); - BatchGatherKernelUtilImpl::Forward(ctx->stream(), in->dptr(), - indices->dptr(), flat_out_shape, - in->shape().At(axis), out->mut_dptr()); + Shape({out->shape_view().Count(0, axis), out->shape_view().At(axis), + out->shape_view().Count(axis + 1)}); + BatchGatherKernelUtilImpl::Forward( + ctx->stream(), in->dptr(), indices->dptr(), flat_out_shape, in->shape_view().At(axis), + out->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/batch_gather_kernel_util.cpp b/oneflow/user/kernels/batch_gather_kernel_util.cpp index 3167c395edc..7b51f901b8f 100644 --- a/oneflow/user/kernels/batch_gather_kernel_util.cpp +++ b/oneflow/user/kernels/batch_gather_kernel_util.cpp @@ -28,10 +28,10 @@ Shape GetFlatShape(const ShapeView& shape, const int64_t axis) { template void BatchGatherForward(ep::Stream* stream, const Blob* in, const Blob* indices, Blob* out) { - const int64_t axis = indices->shape().NumAxes() - 1; - const Shape flat_out_shape = GetFlatShape(out->shape(), axis); + const int64_t axis = indices->shape_view().NumAxes() - 1; + const Shape flat_out_shape = 
GetFlatShape(out->shape_view(), axis); BatchGatherKernelUtilImpl::Forward(stream, in->dptr(), indices->dptr(), - flat_out_shape, in->shape().At(axis), + flat_out_shape, in->shape_view().At(axis), out->mut_dptr()); } @@ -39,11 +39,11 @@ template void BatchGatherBackward(ep::Stream* stream, const Blob* out_diff, const Blob* indices, Blob* in_diff) { Memset(stream, in_diff->mut_dptr(), 0, in_diff->ByteSizeOfBlobBody()); - const int64_t axis = indices->shape().NumAxes() - 1; - const Shape flat_out_diff_shape = GetFlatShape(out_diff->shape(), axis); + const int64_t axis = indices->shape_view().NumAxes() - 1; + const Shape flat_out_diff_shape = GetFlatShape(out_diff->shape_view(), axis); BatchGatherKernelUtilImpl::Backward( stream, out_diff->dptr(), indices->dptr(), flat_out_diff_shape, - in_diff->shape().At(axis), in_diff->mut_dptr()); + in_diff->shape_view().At(axis), in_diff->mut_dptr()); } template diff --git a/oneflow/user/kernels/bernoulli_kernel.cpp b/oneflow/user/kernels/bernoulli_kernel.cpp index 3fa324dc958..1a72325921c 100644 --- a/oneflow/user/kernels/bernoulli_kernel.cpp +++ b/oneflow/user/kernels/bernoulli_kernel.cpp @@ -43,7 +43,7 @@ class BernoulliKerenl final : public user_op::OpKernel { K* out_dptr = out_blob->mut_dptr(); CHECK_EQ(GetDataType(), in_blob->data_type()); CHECK_EQ(GetDataType(), out_blob->data_type()); - CHECK_EQ(in_blob->shape().elem_cnt(), out_blob->shape().elem_cnt()); + CHECK_EQ(in_blob->shape_view().elem_cnt(), out_blob->shape_view().elem_cnt()); auto* kernel_state = dynamic_cast(state); CHECK_NOTNULL(kernel_state); @@ -51,7 +51,7 @@ class BernoulliKerenl final : public user_op::OpKernel { CHECK_NOTNULL(generator); const auto& cpu_generator = CHECK_JUST(generator->Get()); - for (int32_t i = 0; i < out_blob->shape().elem_cnt(); ++i) { + for (int32_t i = 0; i < out_blob->shape_view().elem_cnt(); ++i) { double prob = static_cast(*(in_dptr + i)); CHECK(prob >= 0.0 && prob <= 1.0); std::bernoulli_distribution dis(prob); diff --git a/oneflow/user/kernels/bias_add_kernel.h b/oneflow/user/kernels/bias_add_kernel.h index 96ad83b8a46..c644e441b38 100644 --- a/oneflow/user/kernels/bias_add_kernel.h +++ b/oneflow/user/kernels/bias_add_kernel.h @@ -38,13 +38,15 @@ class BiasAddUserKernel final : public user_op::OpKernel, public user_op::CudaGr void Compute(user_op::KernelComputeContext* ctx) const override { const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - if (a_tensor->shape().elem_cnt() == 0 || b_tensor->shape().elem_cnt() == 0) { return; } + if (a_tensor->shape_view().elem_cnt() == 0 || b_tensor->shape_view().elem_cnt() == 0) { + return; + } auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape().Count(bias_add_axis + 1); - const auto n = a_tensor->shape().elem_cnt(); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); if (IsKernelSafeInt32(n)) { BiasAddCalculation::Invoke( ctx->stream(), outer_size, bias_size, inner_size, a_tensor->dptr(), diff --git a/oneflow/user/kernels/binary_cross_entropy_kernel.cpp 
b/oneflow/user/kernels/binary_cross_entropy_kernel.cpp index 06865f3d09c..c9a008b8d28 100644 --- a/oneflow/user/kernels/binary_cross_entropy_kernel.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_kernel.cpp @@ -63,7 +63,7 @@ class BinaryCrossEntropyKernel final : public user_op::OpKernel { const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* input = input_blob->dptr(); const T* target = target_blob->dptr(); @@ -90,7 +90,7 @@ class BinaryCrossEntropyGradKernel final : public user_op::OpKernel { const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* dy = dy_blob->dptr(); const T* input = input_blob->dptr(); diff --git a/oneflow/user/kernels/binary_cross_entropy_kernel.cu b/oneflow/user/kernels/binary_cross_entropy_kernel.cu index 96c163bac09..933d48aed7f 100644 --- a/oneflow/user/kernels/binary_cross_entropy_kernel.cu +++ b/oneflow/user/kernels/binary_cross_entropy_kernel.cu @@ -116,7 +116,7 @@ class BinaryCrossEntropyKernel final : public user_op::OpKernel { const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* input = input_blob->dptr(); const T* target = target_blob->dptr(); @@ -150,7 +150,7 @@ class BinaryCrossEntropyGradKernel final : public user_op::OpKernel { const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* dy = dy_blob->dptr(); const T* input = input_blob->dptr(); diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cpp b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cpp index 949f488e764..33cd3f95638 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cpp @@ -93,7 +93,7 @@ class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* input = input_blob->dptr(); const T* target = target_blob->dptr(); @@ -108,13 +108,13 @@ class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { pos_weight_processed = tmp_buffer_blob->mut_dptr(); const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - Shape pos_weight_shape = Shape::Ones(target_blob->shape().NumAxes()); + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape().elem_cnt()); + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape(), pos_weight_processed), + ctx->stream(), 
XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape(), target)); + XpuVarNdarray(target_blob->shape_view(), target)); } ComputeBinaryCrossEntropyWithLogitsOut(elem_cnt, input, target, out, weight, pos_weight_processed); @@ -137,7 +137,7 @@ class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* dy = dy_blob->dptr(); const T* input = input_blob->dptr(); @@ -152,13 +152,13 @@ class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { pos_weight_processed = tmp_buffer_blob->mut_dptr(); const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - Shape pos_weight_shape = Shape::Ones(target_blob->shape().NumAxes()); + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape().elem_cnt()); + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape(), pos_weight_processed), + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape(), target)); + XpuVarNdarray(target_blob->shape_view(), target)); } ComputeBinaryCrossEntropyWithLogitsGradOut(elem_cnt, input, target, dy, dx, weight, pos_weight_processed); diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu index c2b5c94c433..97422f6db34 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu +++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu @@ -208,7 +208,7 @@ class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* input = input_blob->dptr(); const T* target = target_blob->dptr(); @@ -218,13 +218,13 @@ class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - Shape pos_weight_shape = Shape::Ones(target_blob->shape().NumAxes()); + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape().elem_cnt()); + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape(), pos_weight_processed), + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape(), target)); + XpuVarNdarray(target_blob->shape_view(), target)); if (ctx->has_input("weight", 0)) { const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); using FunctorT = BinaryCrossEntropyWithLogitsFunctor; @@ -269,7 
+269,7 @@ class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* dy = dy_blob->dptr(); const T* input = input_blob->dptr(); @@ -280,13 +280,13 @@ class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - Shape pos_weight_shape = Shape::Ones(target_blob->shape().NumAxes()); + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape().elem_cnt()); + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape(), pos_weight_processed), + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape(), target)); + XpuVarNdarray(target_blob->shape_view(), target)); if (ctx->has_input("weight", 0)) { const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); diff --git a/oneflow/user/kernels/broadcast_div_grad_kernel.cpp b/oneflow/user/kernels/broadcast_div_grad_kernel.cpp index edd7b6b0ba2..7a786212989 100644 --- a/oneflow/user/kernels/broadcast_div_grad_kernel.cpp +++ b/oneflow/user/kernels/broadcast_div_grad_kernel.cpp @@ -35,21 +35,23 @@ class BroadcastDivGradKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t num_axes = dz_tensor->shape().NumAxes(); - XpuVarNdarray dz(dz_tensor->shape(), dz_tensor->dptr(), num_axes); + const int64_t num_axes = dz_tensor->shape_view().NumAxes(); + XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); NdarrayUtil::BroadcastDiv( ctx->stream(), tmp, - XpuVarNdarray(z_tensor->shape(), z_tensor->dptr(), num_axes), - XpuVarNdarray(y_tensor->shape(), y_tensor->dptr(), num_axes)); + XpuVarNdarray(z_tensor->shape_view(), z_tensor->dptr(), num_axes), + XpuVarNdarray(y_tensor->shape_view(), y_tensor->dptr(), num_axes)); NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(dy_tensor->shape(), dy_tensor->mut_dptr(), num_axes), - const_tmp, tmp); + ctx->stream(), + XpuVarNdarray(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes), const_tmp, + tmp); NdarrayUtil::InplaceNegative( - ctx->stream(), XpuVarNdarray(dy_tensor->shape(), dy_tensor->mut_dptr(), num_axes)); + ctx->stream(), + XpuVarNdarray(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes)); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/broadcast_like_kernel.cpp b/oneflow/user/kernels/broadcast_like_kernel.cpp index f44a704e00b..919509e66fc 100644 --- a/oneflow/user/kernels/broadcast_like_kernel.cpp +++ b/oneflow/user/kernels/broadcast_like_kernel.cpp @@ -35,9 +35,9 @@ class BroadcastLikeKernel final : public user_op::OpKernel, public user_op::Cuda 
user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); const auto& axis = ctx->Attr>("broadcast_axes"); const Shape& reduced_shape = - CreateReducedShapeOrOnesShape(like_tensor->shape(), {axis.begin(), axis.end()}); + CreateReducedShapeOrOnesShape(like_tensor->shape_view(), {axis.begin(), axis.end()}); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out_tensor->shape(), out_tensor->mut_dptr()), + ctx->stream(), XpuVarNdarray(out_tensor->shape_view(), out_tensor->mut_dptr()), XpuVarNdarray(reduced_shape, in_tensor->dptr())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/broadcast_pow_grad_kernel.cpp b/oneflow/user/kernels/broadcast_pow_grad_kernel.cpp index b2a531b041b..c4cf0570935 100644 --- a/oneflow/user/kernels/broadcast_pow_grad_kernel.cpp +++ b/oneflow/user/kernels/broadcast_pow_grad_kernel.cpp @@ -37,21 +37,22 @@ class BroadcastPowXGradKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_axes = dz_tensor->shape().NumAxes(); - XpuVarNdarray dz(dz_tensor->shape(), dz_tensor->dptr(), num_axes); - XpuVarNdarray y(y_tensor->shape(), y_tensor->dptr(), num_axes); + const int64_t num_axes = dz_tensor->shape_view().NumAxes(); + XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); + XpuVarNdarray y(y_tensor->shape_view(), y_tensor->dptr(), num_axes); XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); NdarrayUtil::BroadcastDiv( ctx->stream(), tmp, - XpuVarNdarray(z_tensor->shape(), z_tensor->dptr(), num_axes), - XpuVarNdarray(x_tensor->shape(), x_tensor->dptr(), num_axes)); + XpuVarNdarray(z_tensor->shape_view(), z_tensor->dptr(), num_axes), + XpuVarNdarray(x_tensor->shape_view(), x_tensor->dptr(), num_axes)); NdarrayUtil::BroadcastMul(ctx->stream(), tmp, y, const_tmp); NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(dx_tensor->shape(), dx_tensor->mut_dptr(), num_axes), - const_tmp, tmp); + ctx->stream(), + XpuVarNdarray(dx_tensor->shape_view(), dx_tensor->mut_dptr(), num_axes), const_tmp, + tmp); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -70,17 +71,17 @@ class BroadcastPowYGradKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t num_axes = dz_tensor->shape().NumAxes(); - const int64_t elem_cnt = z_tensor->shape().elem_cnt(); + const int64_t num_axes = dz_tensor->shape_view().NumAxes(); + const int64_t elem_cnt = z_tensor->shape_view().elem_cnt(); Memset(ctx->stream(), tmp_buffer->mut_dptr(), 0, GetCudaAlignedSize(elem_cnt * sizeof(T))); T* tmp_ptr = tmp_buffer->mut_dptr(); - XpuVarNdarray z(z_tensor->shape(), z_tensor->dptr(), num_axes); - XpuVarNdarray dz(dz_tensor->shape(), dz_tensor->dptr(), num_axes); + XpuVarNdarray z(z_tensor->shape_view(), z_tensor->dptr(), num_axes); + XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); - XpuVarNdarray x(x_tensor->shape(), x_tensor->dptr(), num_axes); - XpuVarNdarray dy(dy_tensor->shape(), dy_tensor->mut_dptr(), num_axes); + XpuVarNdarray 
x(x_tensor->shape_view(), x_tensor->dptr(), num_axes); + XpuVarNdarray dy(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes); NdarrayUtil::BroadcastAdd(ctx->stream(), tmp, x, const_tmp); FOR_RANGE(int64_t, i, 0, elem_cnt) { tmp_ptr[i] = SafeLog(tmp_ptr[i]); } NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); diff --git a/oneflow/user/kernels/broadcast_pow_grad_kernel.cu b/oneflow/user/kernels/broadcast_pow_grad_kernel.cu index 30f1e150d05..1471f2383c4 100644 --- a/oneflow/user/kernels/broadcast_pow_grad_kernel.cu +++ b/oneflow/user/kernels/broadcast_pow_grad_kernel.cu @@ -48,16 +48,16 @@ class BroadcastPowYGradKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t num_axes = dz_tensor->shape().NumAxes(); - const int64_t elem_cnt = z_tensor->shape().elem_cnt(); + const int64_t num_axes = dz_tensor->shape_view().NumAxes(); + const int64_t elem_cnt = z_tensor->shape_view().elem_cnt(); Memset(ctx->stream(), tmp_buffer->mut_dptr(), 0, GetCudaAlignedSize(elem_cnt * sizeof(T))); - XpuVarNdarray z(z_tensor->shape(), z_tensor->dptr(), num_axes); - XpuVarNdarray dz(dz_tensor->shape(), dz_tensor->dptr(), num_axes); + XpuVarNdarray z(z_tensor->shape_view(), z_tensor->dptr(), num_axes); + XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); - XpuVarNdarray x(x_tensor->shape(), x_tensor->dptr(), num_axes); - XpuVarNdarray dy(dy_tensor->shape(), dy_tensor->mut_dptr(), num_axes); + XpuVarNdarray x(x_tensor->shape_view(), x_tensor->dptr(), num_axes); + XpuVarNdarray dy(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes); NdarrayUtil::BroadcastAdd(ctx->stream(), tmp, x, const_tmp); ComputeLogGpu<<stream()->As()->cuda_stream()>>>( diff --git a/oneflow/user/kernels/cast_kernel.cpp b/oneflow/user/kernels/cast_kernel.cpp index 3e6a1b2a489..d76dd4bb85e 100644 --- a/oneflow/user/kernels/cast_kernel.cpp +++ b/oneflow/user/kernels/cast_kernel.cpp @@ -41,8 +41,8 @@ class CastKernel final : public OpKernel, public user_op::CudaGraphSupport { void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); Tensor* output_tenor = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t elem_cnt = input_tensor->shape().elem_cnt(); - CHECK_EQ(output_tenor->shape().elem_cnt(), elem_cnt); + const int64_t elem_cnt = input_tensor->shape_view().elem_cnt(); + CHECK_EQ(output_tenor->shape_view().elem_cnt(), elem_cnt); if (input_tensor->data_type() == output_tenor->data_type() && input_tensor->dptr() == output_tenor->dptr()) { return; diff --git a/oneflow/user/kernels/cast_to_static_shape_kernel.cpp b/oneflow/user/kernels/cast_to_static_shape_kernel.cpp index 840e86ff034..dd43379407c 100644 --- a/oneflow/user/kernels/cast_to_static_shape_kernel.cpp +++ b/oneflow/user/kernels/cast_to_static_shape_kernel.cpp @@ -31,10 +31,10 @@ class CastToStaticShapeKernel final : public user_op::OpKernel { const user_op::Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); const Shape& input_static_shape = ctx->TensorDesc4ArgNameAndIndex("input", 0)->shape(); user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); - CHECK(input_tensor->shape() == ShapeView(input_static_shape)); - CHECK_EQ(output_tensor->shape(), input_tensor->shape()); + 
CHECK(input_tensor->shape_view() == ShapeView(input_static_shape)); + CHECK_EQ(output_tensor->shape_view(), input_tensor->shape_view()); size_t output_tensor_size = - output_tensor->shape().elem_cnt() * GetSizeOfDataType(output_tensor->data_type()); + output_tensor->shape_view().elem_cnt() * GetSizeOfDataType(output_tensor->data_type()); Memcpy(ctx->stream(), output_tensor->mut_dptr(), input_tensor->dptr(), output_tensor_size); } diff --git a/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp b/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp index f7f0a822bd0..36d77a0cce2 100644 --- a/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp +++ b/oneflow/user/kernels/categorical_ordinal_encode_kernel.cpp @@ -32,12 +32,12 @@ class CategoricalOrdinalEncodeKernel final : public user_op::OpKernel { user_op::Tensor* table = ctx->Tensor4ArgNameAndIndex("table", 0); user_op::Tensor* size = ctx->Tensor4ArgNameAndIndex("size", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t table_elem_cnt = table->shape().elem_cnt(); + const int64_t table_elem_cnt = table->shape_view().elem_cnt(); CHECK_EQ(table_elem_cnt % 2, 0); const int64_t capacity = table_elem_cnt / 2; CategoricalOrdinalEncodeKernelUtil::Encode( - ctx->stream(), capacity, table->mut_dptr(), size->mut_dptr(), in->shape().elem_cnt(), - in->dptr(), out->mut_dptr()); + ctx->stream(), capacity, table->mut_dptr(), size->mut_dptr(), + in->shape_view().elem_cnt(), in->dptr(), out->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; diff --git a/oneflow/user/kernels/clip_by_value_kernel.cpp b/oneflow/user/kernels/clip_by_value_kernel.cpp index b8269c007e8..eb4e016e94d 100644 --- a/oneflow/user/kernels/clip_by_value_kernel.cpp +++ b/oneflow/user/kernels/clip_by_value_kernel.cpp @@ -80,7 +80,7 @@ class ClipByScalarKernel final : public user_op::OpKernel { int64_t integral_max = ctx->Attr("integral_max"); ClipByMinMaxFunctor clip_func(GetDtypeMatchedValue(floating_min, integral_min), GetDtypeMatchedValue(floating_max, integral_max)); - ClipKernelUtil::Forward(ctx->stream(), clip_func, y->shape().elem_cnt(), + ClipKernelUtil::Forward(ctx->stream(), clip_func, y->shape_view().elem_cnt(), x->dptr(), y->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -99,7 +99,7 @@ class ClipByScalarMinKernel final : public user_op::OpKernel { double floating_min = ctx->Attr("floating_min"); int64_t integral_min = ctx->Attr("integral_min"); ClipByMinFunctor clip_func(GetDtypeMatchedValue(floating_min, integral_min)); - ClipKernelUtil::Forward(ctx->stream(), clip_func, y->shape().elem_cnt(), + ClipKernelUtil::Forward(ctx->stream(), clip_func, y->shape_view().elem_cnt(), x->dptr(), y->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -118,7 +118,7 @@ class ClipByScalarMaxKernel final : public user_op::OpKernel { double floating_max = ctx->Attr("floating_max"); int64_t integral_max = ctx->Attr("integral_max"); ClipByMaxFunctor clip_func(GetDtypeMatchedValue(floating_max, integral_max)); - ClipKernelUtil::Forward(ctx->stream(), clip_func, y->shape().elem_cnt(), + ClipKernelUtil::Forward(ctx->stream(), clip_func, y->shape_view().elem_cnt(), x->dptr(), y->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -141,7 +141,7 @@ class ClipByScalarGradKernel final : public user_op::OpKernel { int64_t integral_max = ctx->Attr("integral_max"); ClipByMinMaxGradFunctor 
clip_func(GetDtypeMatchedValue(floating_min, integral_min), GetDtypeMatchedValue(floating_max, integral_max)); - ClipKernelUtil::Backward(ctx->stream(), clip_func, dx->shape().elem_cnt(), + ClipKernelUtil::Backward(ctx->stream(), clip_func, dx->shape_view().elem_cnt(), x->dptr(), dy->dptr(), dx->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -161,7 +161,7 @@ class ClipByScalarMinGradKernel final : public user_op::OpKernel { double floating_min = ctx->Attr("floating_min"); int64_t integral_min = ctx->Attr("integral_min"); ClipByMinGradFunctor clip_func(GetDtypeMatchedValue(floating_min, integral_min)); - ClipKernelUtil::Backward(ctx->stream(), clip_func, dx->shape().elem_cnt(), + ClipKernelUtil::Backward(ctx->stream(), clip_func, dx->shape_view().elem_cnt(), x->dptr(), dy->dptr(), dx->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -181,7 +181,7 @@ class ClipByScalarMaxGradKernel final : public user_op::OpKernel { double floating_max = ctx->Attr("floating_max"); int64_t integral_max = ctx->Attr("integral_max"); ClipByMaxGradFunctor clip_func(GetDtypeMatchedValue(floating_max, integral_max)); - ClipKernelUtil::Backward(ctx->stream(), clip_func, dx->shape().elem_cnt(), + ClipKernelUtil::Backward(ctx->stream(), clip_func, dx->shape_view().elem_cnt(), x->dptr(), dy->dptr(), dx->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/combined_margin_loss_kernel.cpp b/oneflow/user/kernels/combined_margin_loss_kernel.cpp index 5b1d1c1b571..e21b55e7eb7 100644 --- a/oneflow/user/kernels/combined_margin_loss_kernel.cpp +++ b/oneflow/user/kernels/combined_margin_loss_kernel.cpp @@ -84,11 +84,11 @@ class CombinedMarginLossCpuKernel final : public user_op::OpKernel { if (cache != nullptr) { auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(x->shape().Count(1), kernel_cache->upper() - kernel_cache->lower()); + CHECK_EQ(x->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); lower_bound = kernel_cache->lower(); } - const int64_t num_classes = x->shape().Count(1); - FOR_RANGE(int32_t, i, 0, x->shape().elem_cnt()) { + const int64_t num_classes = x->shape_view().Count(1); + FOR_RANGE(int32_t, i, 0, x->shape_view().elem_cnt()) { const int32_t row_id = i / num_classes; const int32_t col_id = i - row_id * num_classes; const T in_data = x_ptr[i]; @@ -144,12 +144,12 @@ class CombinedMarginLossGradCpuKernel final : public user_op::OpKernel { if (cache != nullptr) { auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(dy->shape().Count(1), kernel_cache->upper() - kernel_cache->lower()); + CHECK_EQ(dy->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); lower_bound = kernel_cache->lower(); } - const int64_t num_classes = dy->shape().Count(1); - FOR_RANGE(int32_t, i, 0, dy->shape().elem_cnt()) { + const int64_t num_classes = dy->shape_view().Count(1); + FOR_RANGE(int32_t, i, 0, dy->shape_view().elem_cnt()) { const int32_t row_id = i / num_classes; const int32_t col_id = i - row_id * num_classes; K label = label_ptr[row_id] - lower_bound; diff --git a/oneflow/user/kernels/combined_margin_loss_kernel.cu b/oneflow/user/kernels/combined_margin_loss_kernel.cu index b0824ebb4b5..dcecd6b9bb3 100644 --- a/oneflow/user/kernels/combined_margin_loss_kernel.cu +++ b/oneflow/user/kernels/combined_margin_loss_kernel.cu @@ -129,20 +129,21 @@ class CombinedMarginLossGpuKernel final : 
public user_op::OpKernel { if (cache != nullptr) { auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(x->shape().Count(1), kernel_cache->upper() - kernel_cache->lower()); + CHECK_EQ(x->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); lower_bound = kernel_cache->lower(); } if (m1 == 1.0 && m2 == 0.0) { - GpuForward<<shape().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - x->shape().elem_cnt(), x->shape().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), y->mut_dptr(), - theta->mut_dptr()); + GpuForward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), + static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), + y->mut_dptr(), theta->mut_dptr()); } else { GpuForward - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, ctx->stream()->As()->cuda_stream()>>>( - x->shape().elem_cnt(), x->shape().Count(1), lower_bound, static_cast(m1), + x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), y->mut_dptr(), theta->mut_dptr()); } @@ -187,23 +188,23 @@ class CombinedMarginLossGradGpuKernel final : public user_op::OpKernel { if (cache != nullptr) { auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(dy->shape().Count(1), kernel_cache->upper() - kernel_cache->lower()); + CHECK_EQ(dy->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); lower_bound = kernel_cache->lower(); } if (m1 == 1.0 && m2 == 0.0) { GpuBackward - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, ctx->stream()->As()->cuda_stream()>>>( - dy->shape().elem_cnt(), dy->shape().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), dy->dptr(), label->dptr(), - theta->dptr(), dx->mut_dptr()); + dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, + static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), + label->dptr(), theta->dptr(), dx->mut_dptr()); } else { GpuBackward - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, ctx->stream()->As()->cuda_stream()>>>( - dy->shape().elem_cnt(), dy->shape().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), dy->dptr(), label->dptr(), - theta->dptr(), dx->mut_dptr()); + dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, + static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), + label->dptr(), theta->dptr(), dx->mut_dptr()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/concat_kernel.cpp b/oneflow/user/kernels/concat_kernel.cpp index 10b77629611..18a7f2c006b 100644 --- a/oneflow/user/kernels/concat_kernel.cpp +++ b/oneflow/user/kernels/concat_kernel.cpp @@ -55,10 +55,10 @@ class ConcatKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - if (out_tensor->shape().elem_cnt() == 0) { return; } + if (out_tensor->shape_view().elem_cnt() == 0) { return; } const int64_t axis = ctx->Attr("axis"); - const int64_t out_cols = 
out_tensor->shape().Count(axis); - const int64_t rows = out_tensor->shape().elem_cnt() / out_cols; + const int64_t out_cols = out_tensor->shape_view().Count(axis); + const int64_t rows = out_tensor->shape_view().elem_cnt() / out_cols; CHECK_GT(rows, 0); auto primitive = NewCopyNdPrimitive(ctx); @@ -67,9 +67,9 @@ class ConcatKernel final : public user_op::OpKernel { for (const auto& in_arg_pair : ctx->inputs()) { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex(in_arg_pair.first, in_arg_pair.second); - if (in_tensor->shape().elem_cnt() == 0) { continue; } - const int64_t in_cols = in_tensor->shape().Count(axis); - CHECK_EQ(in_tensor->shape().elem_cnt(), rows * in_cols); + if (in_tensor->shape_view().elem_cnt() == 0) { continue; } + const int64_t in_cols = in_tensor->shape_view().Count(axis); + CHECK_EQ(in_tensor->shape_view().elem_cnt(), rows * in_cols); if (in_cols > 0) { DimVector dst_shape = {rows, out_cols}; DimVector dst_pos_vec = {0, out_col_offset}; diff --git a/oneflow/user/kernels/constant_kernel.cpp b/oneflow/user/kernels/constant_kernel.cpp index 662367d8451..b76671eff60 100644 --- a/oneflow/user/kernels/constant_kernel.cpp +++ b/oneflow/user/kernels/constant_kernel.cpp @@ -38,7 +38,7 @@ class ConstantKernel final : public OpKernel { bool is_floating_value = ctx->Attr("is_floating_value"); const Scalar value = is_floating_value ? Scalar(ctx->Attr("floating_value")) : Scalar(ctx->Attr("integer_value")); - const int64_t elem_cnt = out_tensor->shape().elem_cnt(); + const int64_t elem_cnt = out_tensor->shape_view().elem_cnt(); CHECK_GE(elem_cnt, 0); if (elem_cnt == 0) { return; } std::unique_ptr fill = NewFillPrimitive(ctx); diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp index df04b81aa6e..6a99d796c82 100644 --- a/oneflow/user/kernels/conv_cudnn_kernels.cpp +++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp @@ -38,8 +38,8 @@ struct CudnnConvArgsAndAlgo final { CudnnConvArgsAndAlgo(const user_op::Tensor* x, const user_op::Tensor* w, const user_op::Tensor* y, user_op::Tensor* buf, const user_op::KernelComputeContext* ctx, ep::Stream* stream, bool has_forced_algo, int32_t forced_algo) - : args(*ctx, x->data_type(), x->shape(), w->data_type(), w->shape(), y->data_type(), - y->shape(), ctx->Attr("data_format"), buf->shape().elem_cnt(), + : args(*ctx, x->data_type(), x->shape_view(), w->data_type(), w->shape_view(), y->data_type(), + y->shape_view(), ctx->Attr("data_format"), buf->shape_view().elem_cnt(), Global::Get() ->resource() .cudnn_conf() @@ -54,7 +54,7 @@ struct CudnnConvArgsAndAlgo final { .cudnn_conv_enable_pseudo_half() || (ctx->Attr("data_format") == "channels_last" && std::is_same::value)) { - size_t byte_size_of_buf = buf->shape().elem_cnt(); + size_t byte_size_of_buf = buf->shape_view().elem_cnt(); AllocatedCudnnConvResource res(stream->As()->cudnn_handle(), const_cast(x->dptr()), const_cast(w->dptr()), const_cast(y->dptr()), buf->mut_dptr()); @@ -175,7 +175,7 @@ class ConvGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphS void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, const user_op::OpKernelCache* cache) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape().elem_cnt() == 0) return; + if (in->shape_view().elem_cnt() == 0) return; const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* out = 
ctx->Tensor4ArgNameAndIndex("out", 0); @@ -252,7 +252,7 @@ class ConvDataGradGpuKernel final : public user_op::OpKernel, public user_op::Cu const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* filter = ctx->Tensor4ArgNameAndIndex("filter", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - if (dx->shape().elem_cnt() == 0) return; + if (dx->shape_view().elem_cnt() == 0) return; user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); @@ -267,10 +267,10 @@ class ConvDataGradGpuKernel final : public user_op::OpKernel, public user_op::Cu if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), dx->data_type()); - CHECK_EQ(add_to_output->shape(), dx->shape()); + CHECK_EQ(add_to_output->shape_view(), dx->shape_view()); Memcpy( ctx->stream(), dx->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); beta = CudnnSPOnePtr(); } else { beta = CudnnSPZeroPtr(); @@ -332,9 +332,9 @@ class ConvFilterGradGpuKernel final : public user_op::OpKernel, public user_op:: const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* filter_diff = ctx->Tensor4ArgNameAndIndex("filter_diff", 0); - if (x->shape().elem_cnt() == 0) { + if (x->shape_view().elem_cnt() == 0) { Memset(ctx->stream(), filter_diff->mut_dptr(), 0, - filter_diff->shape().elem_cnt() * sizeof(T)); + filter_diff->shape_view().elem_cnt() * sizeof(T)); return; } user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); @@ -420,14 +420,14 @@ class ConvBiasGradGpuKernel final : public user_op::OpKernel, public user_op::Cu void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* bias_diff = ctx->Tensor4ArgNameAndIndex("bias_diff", 0); - CHECK_EQ(bias_diff->shape().NumAxes(), 1); - CHECK_GE(dy->shape().NumAxes(), 3); - CHECK_LE(dy->shape().NumAxes(), 5); + CHECK_EQ(bias_diff->shape_view().NumAxes(), 1); + CHECK_GE(dy->shape_view().NumAxes(), 3); + CHECK_LE(dy->shape_view().NumAxes(), 5); const std::string& data_format = ctx->Attr("data_format"); std::unique_ptr dy_desc; - dy_desc.reset(new CudnnTensorDesc(dy->data_type(), dy->shape(), data_format)); + dy_desc.reset(new CudnnTensorDesc(dy->data_type(), dy->shape_view(), data_format)); const auto& bias_grad_state = CreateConvBiasGradState(ctx); CHECK_NOTNULL(bias_grad_state.get()); OF_CUDNN_CHECK(cudnnConvolutionBackwardBias( diff --git a/oneflow/user/kernels/conv_kernels.cpp b/oneflow/user/kernels/conv_kernels.cpp index 24694773ac9..9750a9156fb 100644 --- a/oneflow/user/kernels/conv_kernels.cpp +++ b/oneflow/user/kernels/conv_kernels.cpp @@ -58,12 +58,12 @@ void Gemm4ChannelLast(ep::Stream* stream, enum CBLAS_TRANSPOSE trans_a, template T* GetImgMutDptr(user_op::Tensor* tensor, int64_t idx) { - return tensor->mut_dptr() + tensor->shape().Count(1) * idx; + return tensor->mut_dptr() + tensor->shape_view().Count(1) * idx; } template const T* GetImgDptr(const user_op::Tensor* tensor, int64_t idx) { - return tensor->dptr() + tensor->shape().Count(1) * idx; + return tensor->dptr() + tensor->shape_view().Count(1) * idx; } size_t 
CalcElemNumOfColBuf(const ShapeView& out_shape, const ShapeView& weight_shape, @@ -401,7 +401,7 @@ class ConvCpuKernel final : public user_op::OpKernel { T* col_buf_dptr = tmp_buffer->mut_dptr(); bool is_bias_mul_inited = false; - for (int64_t i = 0; i < in->shape().At(0); ++i) { + for (int64_t i = 0; i < in->shape_view().At(0); ++i) { conv_cache->im2col_func_(GetImgDptr(in, i), ShapeView(conv_cache->in_5d_shape_), ShapeView(conv_cache->weight_5d_shape_), ShapeView(conv_cache->out_5d_shape_), conv_cache->strides_3d_.data(), @@ -421,9 +421,10 @@ class ConvCpuKernel final : public user_op::OpKernel { const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); if (bias != nullptr) { - int64_t num_of_col_buf = CalcElemNumOfColBuf(out->shape(), weight->shape(), idx_offset); + int64_t num_of_col_buf = + CalcElemNumOfColBuf(out->shape_view(), weight->shape_view(), idx_offset); int64_t num_of_bias_mul = - (tmp_buffer->shape().elem_cnt() - num_of_col_buf * sizeof(T)) / sizeof(T); + (tmp_buffer->shape_view().elem_cnt() - num_of_col_buf * sizeof(T)) / sizeof(T); CHECK_GT(num_of_bias_mul, 0); T* bias_mul_dptr = col_buf_dptr + num_of_col_buf; if (!is_bias_mul_inited) { @@ -501,10 +502,10 @@ class ConvDataGradCpuKernel final : public user_op::OpKernel { user_op::Tensor* col_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); Memset(ctx->stream(), dx->mut_dptr(), 0, - dx->shape().elem_cnt() * sizeof(T)); + dx->shape_view().elem_cnt() * sizeof(T)); int32_t idx_offset = conv_cache->idx_offset_; - FOR_RANGE(int64_t, i, 0, dy->shape().At(0)) { + FOR_RANGE(int64_t, i, 0, dy->shape_view().At(0)) { // channels first: col_buf' = weight(T) * out[i]' // channels last : col_buf' = weight(T) * out[i]'(T) NewKernelUtil::OFGemm( @@ -525,13 +526,13 @@ class ConvDataGradCpuKernel final : public user_op::OpKernel { if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), dx->data_type()); - CHECK_EQ(add_to_output->shape(), dx->shape()); + CHECK_EQ(add_to_output->shape_view(), dx->shape_view()); std::unique_ptr primitive = ep::primitive::NewPrimitive(DeviceType::kCPU, add_to_output->data_type()); CHECK(primitive); primitive->Launch(ctx->stream(), dx->dptr(), add_to_output->dptr(), dx->mut_dptr(), - add_to_output->shape().elem_cnt()); + add_to_output->shape_view().elem_cnt()); } } }; @@ -582,9 +583,9 @@ class ConvFilterGradCpuKernel final : public user_op::OpKernel { user_op::Tensor* col_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); Memset(ctx->stream(), filter_diff->mut_dptr(), 0, - filter_diff->shape().elem_cnt() * sizeof(T)); + filter_diff->shape_view().elem_cnt() * sizeof(T)); int32_t idx_offset = conv_cache->idx_offset_; - FOR_RANGE(int64_t, i, 0, dy->shape().At(0)) { + FOR_RANGE(int64_t, i, 0, dy->shape_view().At(0)) { conv_cache->im2col_func_(GetImgDptr(x, i), ShapeView(conv_cache->in_5d_shape_), ShapeView(conv_cache->weight_5d_shape_), ShapeView(conv_cache->out_5d_shape_), conv_cache->strides_3d_.data(), @@ -639,9 +640,9 @@ class ConvBiasGradCpuKernel final : public user_op::OpKernel { user_op::Tensor* bias_diff = ctx->Tensor4ArgNameAndIndex("bias_diff", 0); user_op::Tensor* bias_mul_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - InitBiasMulBuf(bias_mul_buf->mut_dptr(), bias_mul_buf->shape().elem_cnt() / sizeof(T)); + InitBiasMulBuf(bias_mul_buf->mut_dptr(), bias_mul_buf->shape_view().elem_cnt() / sizeof(T)); Memset(ctx->stream(), bias_diff->mut_dptr(), 0, - 
bias_diff->shape().elem_cnt() * sizeof(T)); + bias_diff->shape_view().elem_cnt() * sizeof(T)); const auto& data_format = ctx->Attr("data_format"); int32_t idx_offset; @@ -650,21 +651,21 @@ class ConvBiasGradCpuKernel final : public user_op::OpKernel { if (data_format == "channels_first") { idx_offset = 2; is_out_diff_need_trans = CblasNoTrans; - filter = dy->shape().At(1); + filter = dy->shape_view().At(1); } else { idx_offset = 1; is_out_diff_need_trans = CblasTrans; - filter = dy->shape().At(dy->shape().NumAxes() - 1); + filter = dy->shape_view().At(dy->shape_view().NumAxes() - 1); } - int ndims = dy->shape().NumAxes() - 2; - FOR_RANGE(int64_t, i, 0, dy->shape().At(0)) { + int ndims = dy->shape_view().NumAxes() - 2; + FOR_RANGE(int64_t, i, 0, dy->shape_view().At(0)) { // channels first: bias' += out' * bias_mul // channels last: bias' += out'(T) * bias_mul NewKernelUtil::OFGemm( ctx->stream(), is_out_diff_need_trans, CblasNoTrans, - filter, // filter - 1, // 1 - dy->shape().Count(idx_offset, idx_offset + ndims), // od * oh * ow + filter, // filter + 1, // 1 + dy->shape_view().Count(idx_offset, idx_offset + ndims), // od * oh * ow static_cast(1), GetImgDptr(dy, i), bias_mul_buf->dptr(), static_cast(1), bias_diff->mut_dptr()); } diff --git a/oneflow/user/kernels/copy_data_content_kernel.h b/oneflow/user/kernels/copy_data_content_kernel.h index d6ec57e4c42..e4f763077cb 100644 --- a/oneflow/user/kernels/copy_data_content_kernel.h +++ b/oneflow/user/kernels/copy_data_content_kernel.h @@ -29,10 +29,10 @@ class CopyDataContentKernel final : public user_op::OpKernel, public user_op::Cu void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()); CHECK_EQ(in->data_type(), out->data_type()); Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in->shape().elem_cnt() * GetSizeOfDataType(in->data_type())); + in->shape_view().elem_cnt() * GetSizeOfDataType(in->data_type())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/copy_kernel.cpp b/oneflow/user/kernels/copy_kernel.cpp index 02e3d8db141..3e0b5ea2096 100644 --- a/oneflow/user/kernels/copy_kernel.cpp +++ b/oneflow/user/kernels/copy_kernel.cpp @@ -29,8 +29,8 @@ class CopyKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); - CHECK_EQ(out->shape(), in_shape); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); const DataType in_data_type = in->data_type(); CHECK_EQ(out->data_type(), in_data_type); if (in_shape.elem_cnt() == 0) { diff --git a/oneflow/user/kernels/count_not_finite_kernel.cpp b/oneflow/user/kernels/count_not_finite_kernel.cpp index 202b8d69ed9..93086946aa7 100644 --- a/oneflow/user/kernels/count_not_finite_kernel.cpp +++ b/oneflow/user/kernels/count_not_finite_kernel.cpp @@ -31,7 +31,7 @@ class MultiCountNotFiniteCpuKernel final : public user_op::OpKernel { FOR_RANGE(int32_t, i, 0, ctx->inputs().size()) { user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", i); const T* x_ptr = x->dptr(); - FOR_RANGE(int32_t, j, 0, x->shape().elem_cnt()) { + 
FOR_RANGE(int32_t, j, 0, x->shape_view().elem_cnt()) { if (!std::isfinite(x_ptr[j])) { count++; } } } diff --git a/oneflow/user/kernels/count_not_finite_kernel.cu b/oneflow/user/kernels/count_not_finite_kernel.cu index 649c5755c7d..b3425fa24d0 100644 --- a/oneflow/user/kernels/count_not_finite_kernel.cu +++ b/oneflow/user/kernels/count_not_finite_kernel.cu @@ -97,9 +97,9 @@ class CountNotFiniteGpuKernel final : public user_op::OpKernel, public user_op:: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t elem_cnt = x->shape().elem_cnt(); + const int64_t elem_cnt = x->shape_view().elem_cnt(); Memset(ctx->stream(), y->mut_dptr(), 0, - y->shape().elem_cnt() * sizeof(int64_t)); + y->shape_view().elem_cnt() * sizeof(int64_t)); CountNotFiniteGpu<<stream()->As()->cuda_stream()>>>( elem_cnt, x->dptr(), y->mut_dptr()); @@ -130,7 +130,7 @@ class MultiCountNotFiniteGpuKernel final : public user_op::OpKernel, user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); Param para; Memset(ctx->stream(), y->mut_dptr(), 0, - y->shape().elem_cnt() * sizeof(int64_t)); + y->shape_view().elem_cnt() * sizeof(int64_t)); para.y = y->mut_dptr(); int64_t remain_size = ctx->inputs().size(); @@ -148,8 +148,8 @@ class MultiCountNotFiniteGpuKernel final : public user_op::OpKernel, const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", input_id); input_id++; para.x[i] = x->dptr(); - para.x_elem_cnt[i] = x->shape().elem_cnt(); - max_elem_cnt = std::max(max_elem_cnt, x->shape().elem_cnt()); + para.x_elem_cnt[i] = x->shape_view().elem_cnt(); + max_elem_cnt = std::max(max_elem_cnt, x->shape_view().elem_cnt()); } MultiCountNotFiniteGpu <<dptr(); const int64_t* input_lengths_ptr = input_lengths->dptr(); const bool merge_repeated = ctx->Attr("merge_repeated"); - const int64_t max_input_length = log_probs->shape().At(0); - const int64_t batch_size = log_probs->shape().At(1); - const int64_t num_labels = log_probs->shape().At(2); - CHECK_EQ(batch_size, input_lengths->shape().At(0)); + const int64_t max_input_length = log_probs->shape_view().At(0); + const int64_t batch_size = log_probs->shape_view().At(1); + const int64_t num_labels = log_probs->shape_view().At(2); + CHECK_EQ(batch_size, input_lengths->shape_view().At(0)); int64_t* decoded_ptr = decoded->mut_dptr(); T* neg_sum_logits_ptr = neg_sum_logits->mut_dptr(); diff --git a/oneflow/user/kernels/ctc_loss_kernel.cpp b/oneflow/user/kernels/ctc_loss_kernel.cpp index 67d16d77942..92c630c5d45 100644 --- a/oneflow/user/kernels/ctc_loss_kernel.cpp +++ b/oneflow/user/kernels/ctc_loss_kernel.cpp @@ -38,11 +38,11 @@ class CtcLossKernel final : public user_op::OpKernel { const IDX* input_lengths_ptr = input_lengths->dptr(); const IDX* target_lengths_ptr = target_lengths->dptr(); const int32_t blank = ctx->Attr("blank"); - const int64_t max_input_length = log_probs->shape().At(0); - const int64_t batch_size = log_probs->shape().At(1); - const int64_t num_labels = log_probs->shape().At(2); + const int64_t max_input_length = log_probs->shape_view().At(0); + const int64_t batch_size = log_probs->shape_view().At(1); + const int64_t num_labels = log_probs->shape_view().At(2); const int64_t max_target_length = ctx->Attr("max_target_length"); - const int32_t targets_ndim = targets->shape().NumAxes(); + const int32_t targets_ndim = targets->shape_view().NumAxes(); NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); 
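Note on the indexing above: the CTC kernels read the runtime dimensions of log_probs through shape_view() and then drive their loops with an NdIndexOffsetHelper built over the (max_input_length, batch_size, num_labels) layout. A minimal, self-contained sketch of what that offset arithmetic amounts to for a dense row-major tensor (the helper's real template parameters and API are not reproduced here; FlatOffset3D is a hypothetical stand-in):

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the 3-D (t, b, c) -> flat offset mapping used by the CTC
// kernels, with log_probs stored row-major as (max_input_length, batch_size, num_labels).
static int64_t FlatOffset3D(int64_t t, int64_t b, int64_t c, int64_t batch_size,
                            int64_t num_labels) {
  return (t * batch_size + b) * num_labels + c;
}

int main() {
  const int64_t max_input_length = 4, batch_size = 2, num_labels = 3;
  // For a dense row-major layout the last (t, b, c) index must map to elem_cnt - 1.
  assert(FlatOffset3D(max_input_length - 1, batch_size - 1, num_labels - 1, batch_size,
                      num_labels)
         == max_input_length * batch_size * num_labels - 1);
  return 0;
}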
NdIndexOffsetHelper alpha_helper(batch_size, max_input_length, @@ -95,11 +95,11 @@ class CtcLossGradKernel final : public user_op::OpKernel { const IDX* target_lengths_ptr = target_lengths->dptr(); const int32_t blank = ctx->Attr("blank"); const bool zero_infinity = ctx->Attr("zero_infinity"); - const int64_t batch_size = log_probs->shape().At(1); - const int64_t num_labels = log_probs->shape().At(2); - const int64_t max_input_length = log_probs->shape().At(0); + const int64_t batch_size = log_probs->shape_view().At(1); + const int64_t num_labels = log_probs->shape_view().At(2); + const int64_t max_input_length = log_probs->shape_view().At(0); const int64_t max_target_length = ctx->Attr("max_target_length"); - const int32_t targets_ndim = targets->shape().NumAxes(); + const int32_t targets_ndim = targets->shape_view().NumAxes(); NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); NdIndexOffsetHelper beta_helper(batch_size, max_input_length, diff --git a/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu b/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu index 6ba3e0e8d09..b94d0d08ef2 100644 --- a/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu +++ b/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu @@ -60,9 +60,9 @@ class CublasBiasAddReluMatmulGradKernel final : public user_op::OpKernel, // currently only support 2D matmul. DimVector dy_shape(2); - dy->shape().ToDimVector(&dy_shape); + dy->shape_view().ToDimVector(&dy_shape); DimVector weight_shape(2); - weight->shape().ToDimVector(&weight_shape); + weight->shape_view().ToDimVector(&weight_shape); cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DRELU_BGRAD; InferMatmulCublasMNK(dy_shape, weight_shape, diff --git a/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu b/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu index 95a25fcd525..6254d5128c8 100644 --- a/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu +++ b/oneflow/user/kernels/cublas_fused_matmul_bias_add_grad.cu @@ -74,9 +74,9 @@ class CublasMatmulBiasAddGradKernel final : public user_op::OpKernel, // currently only support 2D matmul. DimVector dy_shape(2); - dy->shape().ToDimVector(&dy_shape); + dy->shape_view().ToDimVector(&dy_shape); DimVector x_shape(2); - x->shape().ToDimVector(&x_shape); + x->shape_view().ToDimVector(&x_shape); cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADB; InferMatmulCublasMNK(dy_shape, x_shape, diff --git a/oneflow/user/kernels/cublas_fused_mlp_kernel.cu b/oneflow/user/kernels/cublas_fused_mlp_kernel.cu index 8755c514ebd..50d2a75c731 100644 --- a/oneflow/user/kernels/cublas_fused_mlp_kernel.cu +++ b/oneflow/user/kernels/cublas_fused_mlp_kernel.cu @@ -68,7 +68,7 @@ class CublasFusedMLPKernel final : public user_op::OpKernel, public user_op::Cud // Currently only support 2D matmul. 
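The fused-MLP kernel in this hunk treats each layer as a 2-D matmul: the activation x has shape (batch, in_features) and, judging from out_feature being read from weight->shape_view().At(0) below, each weight is stored as (out_features, in_features) in the usual Linear-layer convention. Under that assumption (the real InferMatmulCublasMNK helper is not reproduced here), the m/n/k bookkeeping handed to cuBLAS reduces to the following sketch:

#include <array>
#include <cstdint>

struct MatmulDims {
  int64_t m;  // rows of the output    = batch
  int64_t n;  // columns of the output = out_features
  int64_t k;  // reduction dimension   = in_features
};

// Hypothetical helper mirroring the y = x * W^T shape bookkeeping of a Linear layer,
// with x of shape (batch, in_features) and W of shape (out_features, in_features).
static MatmulDims InferDims(const std::array<int64_t, 2>& x_shape,
                            const std::array<int64_t, 2>& w_shape) {
  return MatmulDims{x_shape[0], w_shape[0], x_shape[1]};
}

int main() {
  // A (16, 128) activation against a (64, 128) weight gives a (16, 64) output.
  const MatmulDims d = InferDims({16, 128}, {64, 128});
  return (d.m == 16 && d.n == 64 && d.k == 128) ? 0 : 1;
}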
DimVector in_shape(2); - x->shape().ToDimVector(&in_shape); + x->shape_view().ToDimVector(&in_shape); DimVector weight_shape(2); @@ -78,8 +78,8 @@ class CublasFusedMLPKernel final : public user_op::OpKernel, public user_op::Cud const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("biases", idx); user_op::Tensor* cublas_aux = ctx->Tensor4ArgNameAndIndex("cublas_aux", idx); - int64_t out_feature = weight->shape().At(0); - weight->shape().ToDimVector(&weight_shape); + int64_t out_feature = weight->shape_view().At(0); + weight->shape_view().ToDimVector(&weight_shape); InferMatmulCublasMNK(in_shape, weight_shape, /*transpose_a=*/ep::primitive::BlasTransposeType::N, diff --git a/oneflow/user/kernels/cum_backward_kernel.cpp b/oneflow/user/kernels/cum_backward_kernel.cpp index 6e6967d1daa..69d1b472372 100644 --- a/oneflow/user/kernels/cum_backward_kernel.cpp +++ b/oneflow/user/kernels/cum_backward_kernel.cpp @@ -98,7 +98,7 @@ class CpuCumProdGradKernel final : public user_op::OpKernel { const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t elem_cnt = dy->shape().elem_cnt(); + const int64_t elem_cnt = dy->shape_view().elem_cnt(); if (elem_cnt == 0) { return; } const auto* output_ptr = output->dptr(); @@ -108,9 +108,9 @@ class CpuCumProdGradKernel final : public user_op::OpKernel { // data partition: up_space|space|down_space auto dim = ctx->Attr("dim"); - auto up_space = elem_cnt / dx->shape().Count(dim); - auto space = dx->shape().At(dim); - auto down_space = dx->shape().Count(dim + 1); + auto up_space = elem_cnt / dx->shape_view().Count(dim); + auto space = dx->shape_view().At(dim); + auto down_space = dx->shape_view().Count(dim + 1); if (space == 1) { Memcpy(ctx->stream(), dx_ptr, dy_ptr, elem_cnt * sizeof(T)); return; diff --git a/oneflow/user/kernels/cum_backward_kernel.cu b/oneflow/user/kernels/cum_backward_kernel.cu index 6dfee037471..c3d4bc717bb 100644 --- a/oneflow/user/kernels/cum_backward_kernel.cu +++ b/oneflow/user/kernels/cum_backward_kernel.cu @@ -95,7 +95,7 @@ class GpuCumProdGradKernel final : public user_op::OpKernel { const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto elem_cnt = dy->shape().elem_cnt(); + const auto elem_cnt = dy->shape_view().elem_cnt(); if (!elem_cnt) { return; } const auto* output_ptr = output->dptr(); @@ -105,9 +105,9 @@ class GpuCumProdGradKernel final : public user_op::OpKernel { // Data partition: up_space|space|down_space auto dim = ctx->Attr("dim"); - const auto up_space = elem_cnt / dx->shape().Count(dim); - const auto space = dx->shape().At(dim); - const auto down_space = dx->shape().Count(dim + 1); + const auto up_space = elem_cnt / dx->shape_view().Count(dim); + const auto space = dx->shape_view().At(dim); + const auto down_space = dx->shape_view().Count(dim + 1); const size_t thread_num = up_space * down_space; if (space == 1) { diff --git a/oneflow/user/kernels/cum_forward_kernel.cpp b/oneflow/user/kernels/cum_forward_kernel.cpp index d2c2e8de646..add96f69d4d 100644 --- a/oneflow/user/kernels/cum_forward_kernel.cpp +++ b/oneflow/user/kernels/cum_forward_kernel.cpp @@ -47,7 +47,7 @@ class CpuCumKernel : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const auto* in = ctx->Tensor4ArgNameAndIndex("x", 0); - auto elem_cnt = 
in->shape().elem_cnt(); + auto elem_cnt = in->shape_view().elem_cnt(); // judge whether tensor has 0 size dimension first if (!elem_cnt) { return; } @@ -57,9 +57,9 @@ class CpuCumKernel : public user_op::OpKernel { auto* out_ptr = out->mut_dptr(); // data partition: up_space|space|down_space - auto up_space = elem_cnt / in->shape().Count(dim); - auto space = in->shape().At(dim); - auto down_space = in->shape().Count(dim + 1); + auto up_space = elem_cnt / in->shape_view().Count(dim); + auto space = in->shape_view().At(dim); + auto down_space = in->shape_view().Count(dim + 1); CumForward(in_ptr, out_ptr, up_space, space, down_space, elem_cnt); } diff --git a/oneflow/user/kernels/cum_forward_kernel.cu b/oneflow/user/kernels/cum_forward_kernel.cu index a1ae58e51cb..32d725868e1 100644 --- a/oneflow/user/kernels/cum_forward_kernel.cu +++ b/oneflow/user/kernels/cum_forward_kernel.cu @@ -101,7 +101,7 @@ class GpuCumKernel : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { // judge whether tensor has 0 size dimension first const auto* in = ctx->Tensor4ArgNameAndIndex("x", 0); - auto elem_cnt = in->shape().elem_cnt(); + auto elem_cnt = in->shape_view().elem_cnt(); if (!elem_cnt) { return; } auto* out = ctx->Tensor4ArgNameAndIndex("y", 0); @@ -110,9 +110,9 @@ class GpuCumKernel : public user_op::OpKernel { auto* out_ptr = out->mut_dptr(); // data partition: up_space|space|down_space - auto up_space = elem_cnt / in->shape().Count(dim); - auto space = in->shape().At(dim); - auto down_space = in->shape().Count(dim + 1); + auto up_space = elem_cnt / in->shape_view().Count(dim); + auto space = in->shape_view().At(dim); + auto down_space = in->shape_view().Count(dim + 1); auto thread_num = up_space * down_space; if (up_space == 1) { diff --git a/oneflow/user/kernels/data_shuffle_kernel.cu b/oneflow/user/kernels/data_shuffle_kernel.cu index 348d69ba669..0821f57438d 100644 --- a/oneflow/user/kernels/data_shuffle_kernel.cu +++ b/oneflow/user/kernels/data_shuffle_kernel.cu @@ -245,11 +245,10 @@ class DataShuffleKernelState final : public user_op::OpKernelState { public: explicit DataShuffleKernelState(user_op::KernelInitContext* ctx) : device_index_(-1), - has_independent_stream_(ctx->op_conf().has_stream_name_hint()), - stream_name_(""), + stream_name_(EagerNcclCommMgr::kDefaultStreamName), parallel_desc_(ctx->parallel_desc()) { OF_CUDA_CHECK(cudaGetDevice(&device_index_)); - if (has_independent_stream_) { stream_name_ = ctx->op_conf().stream_name_hint(); } + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } OF_CUDA_CHECK(cudaMallocHost( &host_num_unique_matrix_, parallel_desc_.parallel_num() * parallel_desc_.parallel_num() * sizeof(IDX))); @@ -283,11 +282,7 @@ class DataShuffleKernelState final : public user_op::OpKernelState { } EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); ncclComm_t comm; - if (has_independent_stream_) { - comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - } else { - comm = comm_mgr->GetCommForDevice(device_set); - } + comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); comm_.reset(new Comm(comm)); } @@ -333,13 +328,13 @@ class IdShuffleKernel final : public user_op::OpKernel { const bool has_table_ids = ctx->has_input("table_ids", 0); const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); const bool need_process_table_ids = (has_table_ids || num_tables > 1); - const int64_t num_ids = ids->shape().elem_cnt(); + const int64_t 
num_ids = ids->shape_view().elem_cnt(); const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); cudaStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); IdShuffleTmpBufferManager buffer_manager( tmp_buffer->mut_dptr(), num_ids, parallel_num, need_gen_table_ids, need_process_table_ids); - CHECK_GE(tmp_buffer->shape().elem_cnt(), buffer_manager.TotalBufferSize()); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.TotalBufferSize()); const U* table_ids_ptr; if (has_table_ids) { @@ -874,10 +869,10 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); ncclComm_t comm = kernel_state->comm(); using ComputeType = typename DefaultComputeType::type; - const int64_t embedding_size = cur_rank_embeddings->shape().At(1); + const int64_t embedding_size = cur_rank_embeddings->shape_view().At(1); IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); DataType data_type = cur_rank_embeddings->data_type(); - const int64_t num_ids = inverse_unique_partition_indices->shape().elem_cnt(); + const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); bool enable_quantized_comm_env_var = @@ -897,13 +892,13 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; } size_t full_elem_cnt = parallel_num * num_ids * embedding_size; - CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape().elem_cnt()); + CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape_view().elem_cnt()); if (!enable_quantized_comm) { size_t reverse_unique_cur_rank_embeddings_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - CHECK_GE(tmp_buffer->shape().elem_cnt(), + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), reverse_unique_cur_rank_embeddings_size + received_embeddings_size); T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); @@ -913,7 +908,7 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { GatherKernelUtilImpl::Forward( ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), cur_rank_num_ids, cur_rank_embeddings->dptr(), - Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, embedding_size}), + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), reverse_unique_cur_rank_embeddings, 0); ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, @@ -923,7 +918,7 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { // reverse unique_partition GatherKernelUtilImpl::Forward( ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape().elem_cnt(), received_embeddings, + inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); } else { size_t reverse_unique_cur_rank_embeddings_size = @@ -933,11 +928,11 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { size_t reverse_recv_quantize_cur_rank_embeddings_size = reverse_unique_cur_rank_embeddings_size; size_t cur_rank_quantize_factor_size = - 
GetCudaAlignedSize(cur_rank_embeddings->shape().At(0) * sizeof(T)); + GetCudaAlignedSize(cur_rank_embeddings->shape_view().At(0) * sizeof(T)); size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; - CHECK_GE(tmp_buffer->shape().elem_cnt(), + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), reverse_unique_cur_rank_embeddings_size + received_embeddings_size + quantize_cur_rank_embeddings_size + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size @@ -978,14 +973,14 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { GatherKernelUtilImpl::Forward( ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), cur_rank_num_ids, quantize_cur_rank_embeddings, - Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, embedding_size}), + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), reverse_unique_cur_rank_embeddings, 0); // reverse cur_rank quantize factor unique GatherKernelUtilImpl::Forward( ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), cur_rank_num_ids, cur_rank_quantize_factor, - Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, 1}), + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, 1}), reverse_cur_rank_quantize_factor, 0); ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, @@ -996,16 +991,16 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { // reverse unique_partition GatherKernelUtilImpl::Forward( ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape().elem_cnt(), received_embeddings, + inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, Shape({1, parallel_num * num_ids, embedding_size}), reverse_recv_quantize_cur_rank_embeddings, 0); GatherKernelUtilImpl::Forward( ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape().elem_cnt(), recv_quantize_factor, + inverse_unique_partition_indices->shape_view().elem_cnt(), recv_quantize_factor, Shape({1, parallel_num * num_ids, 1}), reverse_recv_quantize_factor, 0); - int32_t dequantize_row_size = inverse_unique_partition_indices->shape().elem_cnt(); + int32_t dequantize_row_size = inverse_unique_partition_indices->shape_view().elem_cnt(); IDX dequantize_elem_cnt = dequantize_row_size * embedding_size; OF_CUDA_CHECK((LaunchDequantizeKernel( cuda_stream, reverse_recv_quantize_cur_rank_embeddings, reverse_recv_quantize_factor, @@ -1252,10 +1247,10 @@ class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); user_op::Tensor* cur_rank_unique_embedding_grad = ctx->Tensor4ArgNameAndIndex("cur_rank_unique_embedding_grad", 0); - const int64_t embedding_size = cur_rank_unique_embedding_grad->shape().At(1); + const int64_t embedding_size = cur_rank_unique_embedding_grad->shape_view().At(1); IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); DataType data_type = embedding_grad->data_type(); - const int64_t num_ids = inverse_unique_partition_indices->shape().elem_cnt(); + const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); const int64_t 
parallel_id = ctx->parallel_ctx().parallel_id(); const int64_t padded_embedding_size = GetPaddedEmbeddingSize(data_type, embedding_size); @@ -1289,7 +1284,7 @@ class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); T* received_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - CHECK_GE(tmp_buffer->shape().elem_cnt(), + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), unique_partition_embedding_grad_size + received_embedding_grad_size); UniquePartitionEmbeddingGrad( @@ -1315,7 +1310,7 @@ class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; size_t dequantize_cur_rank_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - CHECK_GE(tmp_buffer->shape().elem_cnt(), + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), unique_partition_embedding_grad_size + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size + received_cur_rank_quantize_factor_size @@ -1457,11 +1452,11 @@ class UniqueKeyValuePairKernel final : public user_op::OpKernel { const bool has_values = ctx->has_input("values", 0); const bool need_values_buffer = (!has_values && num_tables > 1); size_t values_buffer_bytes = - need_values_buffer ? GetCudaAlignedSize(keys->shape().elem_cnt() * sizeof(V)) : 0; - const int64_t num_keys = keys->shape().elem_cnt(); + need_values_buffer ? GetCudaAlignedSize(keys->shape_view().elem_cnt() * sizeof(V)) : 0; + const int64_t num_keys = keys->shape_view().elem_cnt(); const int64_t hash_capacity = num_keys; const size_t workspace_bytes = GetCudaAlignedSize(hash_capacity * sizeof(TableEntry)); - CHECK_LE(values_buffer_bytes + workspace_bytes, tmp_buffer->shape().elem_cnt()); + CHECK_LE(values_buffer_bytes + workspace_bytes, tmp_buffer->shape_view().elem_cnt()); cudaStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); const V* values_ptr; if (has_values) { @@ -1517,4 +1512,9 @@ class UniqueKeyValuePairKernel final : public user_op::OpKernel { OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL, ID_DATA_TYPE_SEQ, ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("id_shuffle"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_shuffle"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_gradient_shuffle"); + } // namespace oneflow diff --git a/oneflow/user/kernels/deconv_cpu_kernel.cpp b/oneflow/user/kernels/deconv_cpu_kernel.cpp index ed897f2f4ff..95b3bac1228 100644 --- a/oneflow/user/kernels/deconv_cpu_kernel.cpp +++ b/oneflow/user/kernels/deconv_cpu_kernel.cpp @@ -47,12 +47,12 @@ void Gemm4ChannelLast(ep::Stream* stream, enum CBLAS_TRANSPOSE trans_a, template T* GetImgMutDptr(user_op::Tensor* tensor, int64_t idx) { - return tensor->mut_dptr() + tensor->shape().Count(1) * idx; + return tensor->mut_dptr() + tensor->shape_view().Count(1) * idx; } template const T* GetImgDptr(const user_op::Tensor* tensor, int64_t idx) { - return tensor->dptr() + tensor->shape().Count(1) * idx; + return tensor->dptr() + tensor->shape_view().Count(1) * idx; } size_t CalcElemNumOfColBuf(const ShapeView& out_shape, const ShapeView& weight_shape, @@ -349,9 +349,9 @@ class DeconvCpuKernel final : public user_op::OpKernel { user_op::Tensor* col_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); Memset(ctx->stream(), out->mut_dptr(), 0, - out->shape().elem_cnt() 
* sizeof(T)); + out->shape_view().elem_cnt() * sizeof(T)); - FOR_RANGE(int64_t, i, 0, in->shape().At(0)) { + FOR_RANGE(int64_t, i, 0, in->shape_view().At(0)) { // channels first: col_buf' = weight(T) * in[i]' // channels last : col_buf' = weight(T) * in[i]'(T) // m, n, k diff --git a/oneflow/user/kernels/deconv_cudnn_kernel.cpp b/oneflow/user/kernels/deconv_cudnn_kernel.cpp index 1706170b4dd..440ad995c3c 100644 --- a/oneflow/user/kernels/deconv_cudnn_kernel.cpp +++ b/oneflow/user/kernels/deconv_cudnn_kernel.cpp @@ -37,8 +37,8 @@ struct CudnnDeConvArgsAndAlgo final { const user_op::Tensor* y, user_op::Tensor* buf, const user_op::KernelComputeContext* ctx, ep::Stream* stream, bool has_forced_algo, int32_t forced_algo) - : args(*ctx, x->data_type(), x->shape(), w->data_type(), w->shape(), y->data_type(), - y->shape(), ctx->Attr("data_format"), buf->shape().elem_cnt(), + : args(*ctx, x->data_type(), x->shape_view(), w->data_type(), w->shape_view(), y->data_type(), + y->shape_view(), ctx->Attr("data_format"), buf->shape_view().elem_cnt(), Global::Get() ->resource() .cudnn_conf() @@ -51,7 +51,7 @@ struct CudnnDeConvArgsAndAlgo final { ->resource() .cudnn_conf() .cudnn_conv_enable_pseudo_half()) { - size_t byte_size_of_buf = buf->shape().elem_cnt(); + size_t byte_size_of_buf = buf->shape_view().elem_cnt(); AllocatedCudnnConvResource res(stream->As()->cudnn_handle(), const_cast(x->dptr()), const_cast(w->dptr()), const_cast(y->dptr()), buf->mut_dptr()); @@ -120,7 +120,7 @@ class DeConvGpuKernel final : public user_op::OpKernel { const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - if (in->shape().elem_cnt() == 0) return; + if (in->shape_view().elem_cnt() == 0) return; const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); CudnnDeConvArgsAndAlgo args_and_algo( diff --git a/oneflow/user/kernels/diag_kernel.h b/oneflow/user/kernels/diag_kernel.h index aa8e2d8d922..000bbaa9a60 100644 --- a/oneflow/user/kernels/diag_kernel.h +++ b/oneflow/user/kernels/diag_kernel.h @@ -46,8 +46,8 @@ class DiagKernel final : public user_op::OpKernel { const int32_t diagonal = ctx->Attr("diagonal"); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& out_shape = out->shape(); - const ShapeView& in_shape = in->shape(); + const ShapeView& out_shape = out->shape_view(); + const ShapeView& in_shape = in->shape_view(); int32_t in_dim = in_shape.NumAxes(); const T* in_buf = in->dptr(); T* out_buf = out->mut_dptr(); @@ -86,8 +86,8 @@ class DiagBackwardKernel final : public user_op::OpKernel { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); int32_t diagonal = ctx->Attr("diagonal"); - const ShapeView& dx_shape = dx->shape(); - const ShapeView& dy_shape = dy->shape(); + const ShapeView& dx_shape = dx->shape_view(); + const ShapeView& dy_shape = dy->shape_view(); int32_t in_dim = dx_shape.NumAxes(); int32_t dy_cnt = dy_shape.Count(0); int32_t dx_cnt = dx_shape.Count(0); diff --git a/oneflow/user/kernels/diagonal_kernel.cpp b/oneflow/user/kernels/diagonal_kernel.cpp index d7895dc5adf..77e888bbc8d 100644 --- a/oneflow/user/kernels/diagonal_kernel.cpp +++ b/oneflow/user/kernels/diagonal_kernel.cpp @@ -63,8 +63,8 @@ class CpuDiagonalKernel final : public user_op::OpKernel { const int32_t offset = 
ctx->Attr("offset"); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& out_shape = out->shape(); - const ShapeView& in_shape = in->shape(); + const ShapeView& out_shape = out->shape_view(); + const ShapeView& in_shape = in->shape_view(); const T* in_buf = in->dptr(); T* out_buf = out->mut_dptr(); @@ -96,8 +96,8 @@ class CpuDiagonalBackwardKernel final : public user_op::OpKernel { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); int32_t offset = ctx->Attr("offset"); - const ShapeView& dx_shape = dx->shape(); - const ShapeView& dy_shape = dy->shape(); + const ShapeView& dx_shape = dx->shape_view(); + const ShapeView& dy_shape = dy->shape_view(); T* dx_buf = dx->mut_dptr(); const T* dy_buf = dy->dptr(); diff --git a/oneflow/user/kernels/diagonal_kernel.cu b/oneflow/user/kernels/diagonal_kernel.cu index dd56c9f00a1..f1ddf0ec9d7 100644 --- a/oneflow/user/kernels/diagonal_kernel.cu +++ b/oneflow/user/kernels/diagonal_kernel.cu @@ -83,8 +83,8 @@ class GpuDiagonalKernel final : public user_op::OpKernel { const int32_t offset = ctx->Attr("offset"); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& out_shape = out->shape(); - const ShapeView& in_shape = in->shape(); + const ShapeView& out_shape = out->shape_view(); + const ShapeView& in_shape = in->shape_view(); const T* in_buf = in->dptr(); T* out_buf = out->mut_dptr(); @@ -117,8 +117,8 @@ class GpuDiagonalBackwardKernel final : public user_op::OpKernel { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); int32_t offset = ctx->Attr("offset"); - const ShapeView& dx_shape = dx->shape(); - const ShapeView& dy_shape = dy->shape(); + const ShapeView& dx_shape = dx->shape_view(); + const ShapeView& dy_shape = dy->shape_view(); T* dx_buf = dx->mut_dptr(); const T* dy_buf = dy->dptr(); diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index efe197e4bc8..6812b774ebc 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -40,7 +40,7 @@ class DimGatherKernel final : public user_op::OpKernel { private: void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); - if (input_tensor->shape().elem_cnt() == 0) { return; } + if (input_tensor->shape_view().elem_cnt() == 0) { return; } const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); const int32_t dim = ctx->Attr("dim"); @@ -49,22 +49,15 @@ class DimGatherKernel final : public user_op::OpKernel { const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); - const int& ndim = input_tensor->shape().NumAxes(); - int dim_value = 0; - if (ndim > 0) { dim_value = input_tensor->shape().At(dim); } - - small_vector shape_vec(ndim); - auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { - std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int64_t dim) -> IDX_T { return static_cast(dim); }); - }; - shape2dims(input_tensor->shape()); - DimOpIndexNdHelper input_nd_helper(shape_vec.data(), ndim); - shape2dims(index_tensor->shape()); - DimOpIndexNdHelper 
index_nd_helper(shape_vec.data(), ndim); + const Shape in_shape = ExpandDimIf0D(input_tensor->shape_view()); + const auto ndim = in_shape.NumAxes(); + const auto dim_length = in_shape.At(dim); + + DimOpIndexNdHelper input_nd_helper(in_shape.data(), ndim); + DimOpIndexNdHelper index_nd_helper(index_tensor->shape_view().data(), ndim); DimGatherFunctor()(ctx->stream(), input_nd_helper, index_nd_helper, - ndim, index_tensor->shape().elem_cnt(), dim_value, - dim, index, input, output); + ndim, index_tensor->shape_view().elem_cnt(), + dim_length, dim, index, input, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index df4721b6c3f..a6392c84dc0 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -37,7 +37,7 @@ class DimScatterKernel final : public user_op::OpKernel { const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + out_tensor->shape_view().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); const IN_T* src = src_tensor->dptr(); @@ -50,29 +50,26 @@ class DimScatterKernel final : public user_op::OpKernel { UNIMPLEMENTED() << "Input tensor and like tensor cannot be empty simultaneously."; } - const int ndim = src_tensor->shape().NumAxes(); - small_vector shape_vec(ndim); - auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { - std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int32_t dim) -> IDX_T { return static_cast(dim); }); - }; - shape2dims(src_tensor->shape()); - DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); - shape2dims(index_tensor->shape()); - DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); - shape2dims(out_tensor->shape()); - DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - - int64_t upper_bound = 0; - if (input_tensor) { - upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound - } else { - upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound - } + const Shape src_shape = ExpandDimIf0D(src_tensor->shape_view()); + const Shape index_shape = ExpandDimIf0D(index_tensor->shape_view()); + const int ndim = src_shape.NumAxes(); + DimOpIndexNdHelper src_nd_helper(src_shape.data(), ndim); + DimOpIndexNdHelper idx_nd_helper(index_shape.data(), ndim); + DimOpIndexNdHelper output_nd_helper(out_tensor->shape_view().data(), ndim); + + const int64_t upper_bound = [&]() { + if (input_tensor) { + const Shape input_shape = ExpandDimIf0D(input_tensor->shape_view()); + return input_shape.At(dim); + } else { + const Shape like_shape = ExpandDimIf0D(like_tensor->shape_view()); + return like_shape.At(dim); + } + }(); DimScatterFunctor()( - ctx->stream(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); + ctx->stream(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, index_shape.elem_cnt(), + dim, upper_bound, index, src, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index 34fab14c90c..0aea4238e05 100644 
--- a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -35,7 +35,7 @@ class DimScatterScalarKernel final : public user_op::OpKernel { const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + out_tensor->shape_view().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); const IN_T src_scalar = static_cast(ctx->Attr("src_scalar")); @@ -48,27 +48,28 @@ class DimScatterScalarKernel final : public user_op::OpKernel { UNIMPLEMENTED() << "Input tensor and like tensor cannot be empty simultaneously."; } - const int ndim = out_tensor->shape().NumAxes(); + const int ndim = out_tensor->shape_view().NumAxes(); small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); }; - shape2dims(index_tensor->shape()); + shape2dims(index_tensor->shape_view()); DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); - shape2dims(out_tensor->shape()); + shape2dims(out_tensor->shape_view()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); int64_t upper_bound = 0; if (input_tensor) { - upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + upper_bound = + input_tensor->shape_view().At(dim); // ensure the idx is smaller than upperbound } else { - upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + upper_bound = like_tensor->shape_view().At(dim); // ensure the idx is smaller than upperbound } DimScatterScalarFunctor()( - ctx->stream(), idx_nd_helper, output_nd_helper, ndim, index_tensor->shape().elem_cnt(), dim, - upper_bound, index, src_scalar, output); + ctx->stream(), idx_nd_helper, output_nd_helper, ndim, index_tensor->shape_view().elem_cnt(), + dim, upper_bound, index, src_scalar, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/distributions/normal_kernel.h b/oneflow/user/kernels/distributions/normal_kernel.h index d5358b2c8c6..efd407435a1 100644 --- a/oneflow/user/kernels/distributions/normal_kernel.h +++ b/oneflow/user/kernels/distributions/normal_kernel.h @@ -47,7 +47,7 @@ class NormalKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); const double mean = ctx->Attr("mean"); const double std = ctx->Attr("std"); - int64_t elem_cnt = out->shape().elem_cnt(); + int64_t elem_cnt = out->shape_view().elem_cnt(); T* out_dptr = out->mut_dptr(); auto* distribution_state = dynamic_cast(state); CHECK_NOTNULL(distribution_state); diff --git a/oneflow/user/kernels/distributions/uniform_int_kernel.h b/oneflow/user/kernels/distributions/uniform_int_kernel.h index a57ccc3f93c..272a969e4b4 100644 --- a/oneflow/user/kernels/distributions/uniform_int_kernel.h +++ b/oneflow/user/kernels/distributions/uniform_int_kernel.h @@ -97,7 +97,7 @@ class UniformIntKernel final : public user_op::OpKernel { " casted to dtype"; } check_from_to_in_range(from, to - 1); - int64_t elem_cnt = out->shape().elem_cnt(); + int64_t elem_cnt = out->shape_view().elem_cnt(); T* out_dptr = out->mut_dptr(); auto* distribution_state = dynamic_cast(state); CHECK_NOTNULL(distribution_state); diff --git 
a/oneflow/user/kernels/distributions/uniform_kernel.h b/oneflow/user/kernels/distributions/uniform_kernel.h index 2e542cecc9a..4ee30407695 100644 --- a/oneflow/user/kernels/distributions/uniform_kernel.h +++ b/oneflow/user/kernels/distributions/uniform_kernel.h @@ -47,7 +47,7 @@ class UniformKernel final : public user_op::OpKernel { const double from = ctx->Attr("from"); const double to = ctx->Attr("to"); check_from_to_in_range(from, to); - int64_t elem_cnt = out->shape().elem_cnt(); + int64_t elem_cnt = out->shape_view().elem_cnt(); T* out_dptr = out->mut_dptr(); auto* distribution_state = dynamic_cast(state); CHECK_NOTNULL(distribution_state); diff --git a/oneflow/user/kernels/dot_kernel.cpp b/oneflow/user/kernels/dot_kernel.cpp index 4e055ceefeb..562993e3d26 100644 --- a/oneflow/user/kernels/dot_kernel.cpp +++ b/oneflow/user/kernels/dot_kernel.cpp @@ -47,7 +47,7 @@ class DotKernel final : public user_op::OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t n = x->shape().elem_cnt(); + int64_t n = x->shape_view().elem_cnt(); auto primitive = NewMatmulPrimitive(ctx); primitive->Launch(ctx->stream(), 1, 1, n, 1, x->dptr(), y->dptr(), 0, out->mut_dptr()); diff --git a/oneflow/user/kernels/dropout_kernel.cpp b/oneflow/user/kernels/dropout_kernel.cpp index 088e878f8a5..77d557c8154 100644 --- a/oneflow/user/kernels/dropout_kernel.cpp +++ b/oneflow/user/kernels/dropout_kernel.cpp @@ -74,19 +74,19 @@ class DropoutKernelCPU final : public user_op::OpKernel { std::shared_ptr cpu_generator = CHECK_JUST(generator->Get()); - FusedDropoutKernel(ctx->stream(), in->shape().elem_cnt(), cpu_generator, rate, scale, + FusedDropoutKernel(ctx->stream(), in->shape_view().elem_cnt(), cpu_generator, rate, scale, in->dptr(), mask->mut_dptr(), out->mut_dptr()); if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), out->data_type()); - CHECK_EQ(add_to_output->shape(), out->shape()); + CHECK_EQ(add_to_output->shape_view(), out->shape_view()); std::unique_ptr primitive = ep::primitive::NewPrimitive(DeviceType::kCPU, add_to_output->data_type()); CHECK(primitive); primitive->Launch(ctx->stream(), out->dptr(), add_to_output->dptr(), out->mut_dptr(), - add_to_output->shape().elem_cnt()); + add_to_output->shape_view().elem_cnt()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -119,8 +119,8 @@ class DropoutGradKernelCPU final : public user_op::OpKernel { const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const float scale = ctx->Attr("scale"); - MaskAndScale(ctx->stream(), dy->shape().elem_cnt(), scale, dy->dptr(), mask->dptr(), - dx->mut_dptr()); + MaskAndScale(ctx->stream(), dy->shape_view().elem_cnt(), scale, dy->dptr(), + mask->dptr(), dx->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/dropout_kernel.cu b/oneflow/user/kernels/dropout_kernel.cu index 6f05ec435bd..23c6cfc5c1a 100644 --- a/oneflow/user/kernels/dropout_kernel.cu +++ b/oneflow/user/kernels/dropout_kernel.cu @@ -435,11 +435,11 @@ class DropoutKernelGPU final : public user_op::OpKernel, public user_op::CudaGra if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* addend = 
ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); DispatchTail( - stream, seed, cuda_gen_state, in->shape().elem_cnt(), rate, scale, + stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, reinterpret_cast(in->dptr()), reinterpret_cast(mask->mut_dptr()), reinterpret_cast(addend->dptr()), reinterpret_cast(out->mut_dptr())); } else { - DispatchTail(stream, seed, cuda_gen_state, in->shape().elem_cnt(), rate, scale, + DispatchTail(stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, reinterpret_cast(in->dptr()), reinterpret_cast(mask->mut_dptr()), nullptr, reinterpret_cast(out->mut_dptr())); @@ -474,7 +474,7 @@ class DropoutGradKernelGPU final : public user_op::OpKernel, public user_op::Cud const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const float scale = ctx->Attr("scale"); - const int64_t elem_cnt = dy->shape().elem_cnt(); + const int64_t elem_cnt = dy->shape_view().elem_cnt(); OF_CUDA_CHECK((cuda::elementwise::Binary( MaskAndScaleFunctor(scale), elem_cnt, reinterpret_cast(dx->mut_dptr()), reinterpret_cast(dy->dptr()), reinterpret_cast(mask->dptr()), diff --git a/oneflow/user/kernels/eager_nccl_kernels.cpp b/oneflow/user/kernels/eager_nccl_kernels.cpp index f8272aac9a6..01a934bacc3 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.cpp +++ b/oneflow/user/kernels/eager_nccl_kernels.cpp @@ -96,13 +96,13 @@ class EagerCclBroadcastKernel final : public user_op::OpKernel { int64_t root = ctx->Attr("root"); const void* in_ptr = nullptr; if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); in_ptr = in->dptr(); } - CHECK_JUST(ccl::Broadcast(in_ptr, out->mut_dptr(), out->shape().elem_cnt(), - out->data_type(), root, - kernel_cache->parallel_desc(), ctx->stream())); + CHECK_JUST(ccl::Broadcast( + in_ptr, out->mut_dptr(), out->shape_view().elem_cnt(), out->data_type(), root, + kernel_cache->parallel_desc(), ctx->stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -150,11 +150,11 @@ class EagerCclReduceKernel final : public user_op::OpKernel { int64_t root = ctx->Attr("root"); void* out_ptr = nullptr; if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); out_ptr = out->mut_dptr(); } - CHECK_JUST(ccl::Reduce(in->dptr(), out_ptr, in->shape().elem_cnt(), + CHECK_JUST(ccl::Reduce(in->dptr(), out_ptr, in->shape_view().elem_cnt(), in->data_type(), ccl::kSum, root, kernel_cache->parallel_desc(), ctx->stream())); }; @@ -183,11 +183,11 @@ class EagerCclAllReduceKernel final : public user_op::OpKernel { CHECK(kernel_cache != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); CHECK_JUST(ccl::AllReduce( - in->dptr(), out->mut_dptr(), out->shape().elem_cnt(), out->data_type(), ccl::kSum, + in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), out->data_type(), ccl::kSum, kernel_cache->parallel_desc(), ctx->stream())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -220,7 +220,7 @@ class EagerCclReduceScatterKernel final : public user_op::OpKernel { const auto& op_type 
= ctx->Attr("op_type"); CHECK_EQ(op_type, "sum"); CHECK_JUST(ccl::ReduceScatter( - in->dptr(), out->mut_dptr(), out->shape().elem_cnt(), out->data_type(), ccl::kSum, + in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), out->data_type(), ccl::kSum, kernel_cache->parallel_desc(), ctx->stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -250,9 +250,9 @@ class EagerCclAllGatherKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); - CHECK_JUST(ccl::AllGather(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), - out->data_type(), kernel_cache->parallel_desc(), - ctx->stream())); + CHECK_JUST(ccl::AllGather(in->dptr(), out->mut_dptr(), + in->shape_view().elem_cnt(), out->data_type(), + kernel_cache->parallel_desc(), ctx->stream())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -284,23 +284,23 @@ class EagerCclS2SKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = in->shape().elem_cnt() * dtype_size; + int64_t data_size = in->shape_view().elem_cnt() * dtype_size; // NOTE: in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out const char* pack_to_ptr = in->dptr(); char* unpack_from_ptr = out->mut_dptr(); - int64_t tmp_size = tmp_buffer->shape().elem_cnt(); + int64_t tmp_size = tmp_buffer->shape_view().elem_cnt(); CHECK_EQ(tmp_size, data_size * 2); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = kernel_cache->parallel_desc()->parallel_num(); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()) - << in->shape().ToString() << " vs " << out->shape().ToString(); - const int64_t elem_cnt = in->shape().elem_cnt(); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()) + << in->shape_view().ToString() << " vs " << out->shape_view().ToString(); + const int64_t elem_cnt = in->shape_view().elem_cnt(); const int64_t in_split_axis = ctx->Attr("in_split_axis"); const int64_t out_split_axis = ctx->Attr("out_split_axis"); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; if (out_split_axis != 0) { diff --git a/oneflow/user/kernels/eager_nccl_kernels.cu b/oneflow/user/kernels/eager_nccl_kernels.cu index 37c208c84bd..3b26cdef04f 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.cu +++ b/oneflow/user/kernels/eager_nccl_kernels.cu @@ -90,11 +90,11 @@ class EagerNcclAllReduceKernel final : public user_op::OpKernel { CHECK(kernel_cache != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); ncclRedOp_t reduce_type = ncclSum; if (in->data_type() == kBool) { reduce_type = ncclMax; } - OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), reduce_type, 
kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); }; @@ -129,11 +129,11 @@ class EagerNcclBroadcastKernel final : public user_op::OpKernel { CHECK_JUST(kernel_cache->parallel_desc()->ParallelId4MachineDeviceId(root, dev_id)); const void* in_ptr = nullptr; if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); in_ptr = in->dptr(); } - OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape().elem_cnt(), + OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(out->data_type()), nccl_root, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); }; @@ -182,15 +182,16 @@ class EagerNcclReduceKernel final : public user_op::OpKernel { int64_t root = ctx->Attr("root"); void* out_ptr = nullptr; if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); out_ptr = out->mut_dptr(); } ncclRedOp_t reduce_type = ncclSum; if (in->data_type() == kBool) { reduce_type = ncclMax; } - OF_NCCL_CHECK(ncclReduce( - in->dptr(), out_ptr, in->shape().elem_cnt(), GetNcclDataType(in->data_type()), reduce_type, - root, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); + OF_NCCL_CHECK(ncclReduce(in->dptr(), out_ptr, in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), reduce_type, root, + kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -226,7 +227,7 @@ class EagerNcclReduceScatterKernel final : public user_op::OpKernel { reduce_type = CHECK_JUST(MapAt(op_type2ncclRedOp_t, op_type)); } OF_NCCL_CHECK(ncclReduceScatter( - in->dptr(), out->mut_dptr(), out->shape().elem_cnt(), GetNcclDataType(in->data_type()), + in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), reduce_type, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -260,7 +261,7 @@ class EagerNcclAllGatherKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); - OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); }; @@ -294,23 +295,23 @@ class EagerNcclS2SKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int64_t tmp_size = 0; const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(in->shape().elem_cnt() * dtype_size); + int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out const char* pack_to_ptr = in->dptr(); char* unpack_from_ptr = out->mut_dptr(); - if (tmp_buffer) { tmp_size = tmp_buffer->shape().elem_cnt(); } + if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = 
kernel_cache->parallel_desc()->parallel_num(); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()) - << in->shape().ToString() << " vs " << out->shape().ToString(); - const int64_t elem_cnt = in->shape().elem_cnt(); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()) + << in->shape_view().ToString() << " vs " << out->shape_view().ToString(); + const int64_t elem_cnt = in->shape_view().elem_cnt(); const int64_t in_split_axis = ctx->Attr("in_split_axis"); const int64_t out_split_axis = ctx->Attr("out_split_axis"); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; if (out_split_axis != 0) { diff --git a/oneflow/user/kernels/eager_symmetric_s_to_p_kernel.cpp b/oneflow/user/kernels/eager_symmetric_s_to_p_kernel.cpp index f9b69157557..a17ecdc9f29 100644 --- a/oneflow/user/kernels/eager_symmetric_s_to_p_kernel.cpp +++ b/oneflow/user/kernels/eager_symmetric_s_to_p_kernel.cpp @@ -108,7 +108,7 @@ class EagerSymmetricSToPKernel final : public user_op::OpKernel { CHECK(kernel_cache != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const auto& out_shape_view = out->shape(); + const auto& out_shape_view = out->shape_view(); const void* in_ptr = in->dptr(); void* out_ptr = out->mut_dptr(); diff --git a/oneflow/user/kernels/elementwise_maximum_minimum_kernel.h b/oneflow/user/kernels/elementwise_maximum_minimum_kernel.h index d04677eb801..37f63320b2d 100644 --- a/oneflow/user/kernels/elementwise_maximum_minimum_kernel.h +++ b/oneflow/user/kernels/elementwise_maximum_minimum_kernel.h @@ -91,7 +91,7 @@ class ElemwiseXimumKernel final : public user_op::OpKernel { const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); ElemwiseXimumFunctor()(ctx->stream(), n, tensor_z->mut_dptr(), tensor_x->dptr(), tensor_y->dptr()); @@ -121,8 +121,9 @@ class ElemwiseXimumBackwardKernel final : public user_op::OpKernel { T* dptr_dx = tensor_dx ? tensor_dx->mut_dptr() : nullptr; T* dptr_dy = tensor_dy ? 
tensor_dy->mut_dptr() : nullptr; - ElemwiseXimumGradFunctor()(ctx->stream(), tensor_dz->shape().elem_cnt(), - dptr_dz, dptr_x, dptr_y, dptr_dx, dptr_dy); + ElemwiseXimumGradFunctor()(ctx->stream(), + tensor_dz->shape_view().elem_cnt(), dptr_dz, + dptr_x, dptr_y, dptr_dx, dptr_dy); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/elementwise_xpu_kernel.h b/oneflow/user/kernels/elementwise_xpu_kernel.h index 383cd89c4e6..dce15338a06 100644 --- a/oneflow/user/kernels/elementwise_xpu_kernel.h +++ b/oneflow/user/kernels/elementwise_xpu_kernel.h @@ -72,8 +72,8 @@ class UnaryElemwiseXpuKernel final : public user_op::OpKernel, public user_op::C const user_op::Tensor* input_a_tensor = ctx->Tensor4ArgNameAndIndex(input_a_name, 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex(output_name, 0); - const ShapeView input_a_shape = input_a_tensor->shape(); - const ShapeView out_shape = out_tensor->shape(); + const ShapeView input_a_shape = input_a_tensor->shape_view(); + const ShapeView out_shape = out_tensor->shape_view(); CHECK_EQ(input_a_shape, out_shape); const InputA* input_a_ptr = input_a_tensor->dptr(); @@ -113,8 +113,8 @@ class UnaryPrimitiveKernel final : public user_op::OpKernel, public user_op::Cud const user_op::Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex(input_name_, 0); user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex(output_name_, 0); - const ShapeView& input_shape = input_tensor->shape(); - const ShapeView& output_shape = output_tensor->shape(); + const ShapeView& input_shape = input_tensor->shape_view(); + const ShapeView& output_shape = output_tensor->shape_view(); CHECK_EQ(input_shape, output_shape) << "Input shape should be equal to Output shape."; const int64_t elem_cnt = input_shape.elem_cnt(); @@ -155,9 +155,9 @@ class BinaryElemwiseXpuKernel final : public user_op::OpKernel, public user_op:: const user_op::Tensor* input_b_tensor = ctx->Tensor4ArgNameAndIndex(input_b_name, 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex(output_name, 0); - const ShapeView input_a_shape = input_a_tensor->shape(); - const ShapeView input_b_shape = input_b_tensor->shape(); - const ShapeView out_shape = out_tensor->shape(); + const ShapeView input_a_shape = input_a_tensor->shape_view(); + const ShapeView input_b_shape = input_b_tensor->shape_view(); + const ShapeView out_shape = out_tensor->shape_view(); CHECK_EQ(input_a_shape, out_shape); CHECK_EQ(input_b_shape, out_shape); diff --git a/oneflow/user/kernels/embedding_kernel.cpp b/oneflow/user/kernels/embedding_kernel.cpp index 445c6c282d2..9855352d377 100644 --- a/oneflow/user/kernels/embedding_kernel.cpp +++ b/oneflow/user/kernels/embedding_kernel.cpp @@ -34,13 +34,13 @@ class CpuEmbeddingRenormKernel final : public user_op::OpKernel { const double max_norm = ctx->Attr("max_norm"); const double norm_type = ctx->Attr("norm_type"); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); const int64_t emb_size = in_shape.At(0); const int64_t emb_dim = in_shape.At(1); const T* in_buf = in->dptr(); const IndexType* indices_buf = indices->dptr(); T* out_buf = out->mut_dptr(); - const int64_t num_indices = indices->shape().elem_cnt(); + const int64_t num_indices = indices->shape_view().elem_cnt(); EmbeddingReNormFunctor()( ctx->stream(), in_buf, indices_buf, out_buf, max_norm, norm_type, num_indices, emb_size, emb_dim, nullptr); @@ -62,9 +62,9 @@ class CpuEmbeddingKernel final : public user_op::OpKernel { const int64_t 
padding_idx = ctx->Attr("padding_idx"); const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - const ShapeView& out_shape = out->shape(); + const ShapeView& out_shape = out->shape_view(); const int64_t num_indices = out_shape.Count(0, out_shape.NumAxes() - 1); - const int64_t emb_size = weight->shape().At(0); + const int64_t emb_size = weight->shape_view().At(0); const int64_t emb_dim = out_shape.At(out_shape.NumAxes() - 1); const T* weight_buf = weight->dptr(); const IndexType* indices_buf = indices->dptr(); @@ -92,9 +92,9 @@ class CpuEmbeddingGradKernel final : public user_op::OpKernel { const int64_t padding_idx = ctx->Attr("padding_idx"); const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - const ShapeView& dy_shape = dy->shape(); + const ShapeView& dy_shape = dy->shape_view(); const int64_t num_indices = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t emb_size = weight->shape().At(0); + const int64_t emb_size = weight->shape_view().At(0); const int64_t emb_dim = dy_shape.At(dy_shape.NumAxes() - 1); const T* dy_buf = dy->dptr(); @@ -104,7 +104,7 @@ class CpuEmbeddingGradKernel final : public user_op::OpKernel { std::unique_ptr memset_primitive = ep::primitive::NewPrimitive(ctx->device_type()); CHECK(memset_primitive); - memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape().Count(0) * sizeof(T)); + memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape_view().Count(0) * sizeof(T)); EmbeddingGradFunctor()(ctx->stream(), dy_buf, indices_buf, dx_buf, padding_idx, scale_grad_by_freq, num_indices, emb_size, emb_dim, nullptr); diff --git a/oneflow/user/kernels/embedding_kernel.cu b/oneflow/user/kernels/embedding_kernel.cu index 261f908b044..c8d9899e825 100644 --- a/oneflow/user/kernels/embedding_kernel.cu +++ b/oneflow/user/kernels/embedding_kernel.cu @@ -36,13 +36,13 @@ class GpuEmbeddingRenormKernel final : public user_op::OpKernel { const double max_norm = ctx->Attr("max_norm"); const double norm_type = ctx->Attr("norm_type"); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); const int64_t emb_size = in_shape.At(0); const int64_t emb_dim = in_shape.At(1); const T* in_buf = in->dptr(); const IndexType* indices_buf = indices->dptr(); T* out_buf = out->mut_dptr(); - const int64_t num_indices = indices->shape().elem_cnt(); + const int64_t num_indices = indices->shape_view().elem_cnt(); int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); std::unique_ptr memset_primitive = ep::primitive::NewPrimitive(ctx->device_type()); @@ -71,9 +71,9 @@ class GpuEmbeddingKernel final : public user_op::OpKernel { const int64_t padding_idx = ctx->Attr("padding_idx"); const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - const int64_t num_indices = indices->shape().elem_cnt(); - const int64_t emb_size = weight->shape().At(0); - const int64_t emb_dim = weight->shape().At(1); + const int64_t num_indices = indices->shape_view().elem_cnt(); + const int64_t emb_size = weight->shape_view().At(0); + const int64_t emb_dim = weight->shape_view().At(1); const T* weight_buf = weight->dptr(); const IndexType* indices_buf = indices->dptr(); T* out_buf = out->mut_dptr(); @@ -101,9 +101,9 @@ class GpuEmbeddingGradKernel final : public user_op::OpKernel { const int64_t padding_idx = ctx->Attr("padding_idx"); const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - const int64_t num_indices = indices->shape().elem_cnt(); - const int64_t emb_size = weight->shape().At(0); - const int64_t 
emb_dim = weight->shape().At(1); + const int64_t num_indices = indices->shape_view().elem_cnt(); + const int64_t emb_size = weight->shape_view().At(0); + const int64_t emb_dim = weight->shape_view().At(1); const T* dy_buf = dy->dptr(); const IndexType* indices_buf = indices->dptr(); @@ -112,7 +112,7 @@ class GpuEmbeddingGradKernel final : public user_op::OpKernel { std::unique_ptr memset_primitive = ep::primitive::NewPrimitive(ctx->device_type()); CHECK(memset_primitive); - memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape().elem_cnt() * sizeof(T)); + memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape_view().elem_cnt() * sizeof(T)); memset_primitive->Launch(ctx->stream(), tmp_buf, 0, GetCudaAlignedSize(sizeof(int32_t) * emb_size)); EmbeddingGradFunctor()( diff --git a/oneflow/user/kernels/empty_kernel.cpp b/oneflow/user/kernels/empty_kernel.cpp index 71c8c4c2d54..9efe2266e13 100644 --- a/oneflow/user/kernels/empty_kernel.cpp +++ b/oneflow/user/kernels/empty_kernel.cpp @@ -33,7 +33,7 @@ class EmptyKernel final : public OpKernel { // None POD type need check if (!IsPODAndHalfDataType(dtype)) { - CHECK(out->shape().NumAxes() > 0 && out->shape().elem_cnt() == 0) + CHECK(out->shape_view().NumAxes() > 0 && out->shape_view().elem_cnt() == 0) << "None POD Tensor created by empty op must be 0-Size tensor."; } } diff --git a/oneflow/user/kernels/erfinv_kernel.cpp b/oneflow/user/kernels/erfinv_kernel.cpp index dffa9372146..3612eb7a703 100644 --- a/oneflow/user/kernels/erfinv_kernel.cpp +++ b/oneflow/user/kernels/erfinv_kernel.cpp @@ -27,7 +27,7 @@ class CpuErfinvKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); const T* x_ptr = x->dptr(); T* y_ptr = y->mut_dptr(); constexpr float central_range = 0.7; diff --git a/oneflow/user/kernels/erfinv_kernel.cu b/oneflow/user/kernels/erfinv_kernel.cu index afdad2117cd..cdaaf717b84 100644 --- a/oneflow/user/kernels/erfinv_kernel.cu +++ b/oneflow/user/kernels/erfinv_kernel.cu @@ -36,7 +36,7 @@ class GpuErfinvKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); OF_CUDA_CHECK(cuda::elementwise::Unary(ErfInvFunctor(), elem_cnt, y->mut_dptr(), x->dptr(), ctx->stream()->As()->cuda_stream())); diff --git a/oneflow/user/kernels/example_generated.h b/oneflow/user/kernels/example_generated.h index 00b1aba3d54..acb3a1cfa98 100644 --- a/oneflow/user/kernels/example_generated.h +++ b/oneflow/user/kernels/example_generated.h @@ -561,7 +561,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_DATA_TYPE = 6, VT_DATA = 8 }; - const flatbuffers::Vector* shape() const { + const flatbuffers::Vector* shape_view() const { return GetPointer*>(VT_SHAPE); } onerec::example::TensorData data_type() const { @@ -612,7 +612,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier& verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) - && verifier.VerifyVector(shape()) && VerifyField(verifier, 
VT_DATA_TYPE) + && verifier.VerifyVector(shape_view()) && VerifyField(verifier, VT_DATA_TYPE) && VerifyOffset(verifier, VT_DATA) && VerifyTensorData(verifier, data(), data_type()) && verifier.EndTable(); } diff --git a/oneflow/user/kernels/expand_kernel.cpp b/oneflow/user/kernels/expand_kernel.cpp index 02e80dd4d0c..742f105019a 100644 --- a/oneflow/user/kernels/expand_kernel.cpp +++ b/oneflow/user/kernels/expand_kernel.cpp @@ -37,8 +37,8 @@ class CpuExpandKernel final : public user_op::OpKernel { return; } std::vector in_shape; - in_shape.resize(in->shape().NumAxes()); - for (int i = 0; i < in->shape().NumAxes(); ++i) { in_shape[i] = in->shape().At(i); } + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } std::vector out_shape; std::vector expand_stride; @@ -46,8 +46,8 @@ class CpuExpandKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int32_t out_dims = out->shape().NumAxes(); - const int32_t out_size = out->shape().elem_cnt(); + const int32_t out_dims = out->shape_view().NumAxes(); + const int32_t out_size = out->shape_view().elem_cnt(); int32_t out_stride[out_dims]; InitStride(out_stride, out_shape.data(), out_dims); for (int32_t i = 0; i < out_size; ++i) { @@ -88,8 +88,8 @@ class CpuExpandGradKernel final : public user_op::OpKernel { ctx->Attr>("logical_expand_shape"); std::vector in_shape; - in_shape.resize(in->shape().NumAxes()); - for (int i = 0; i < in->shape().NumAxes(); ++i) { in_shape[i] = in->shape().At(i); } + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } std::vector out_shape; std::vector expand_stride; CHECK_JUST(getOutShapeAndStrideForBp(logical_out_shape, logical_expand_shape, in_shape, @@ -98,12 +98,12 @@ class CpuExpandGradKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int32_t in_dims = in->shape().NumAxes(); - const int32_t in_size = in->shape().elem_cnt(); + const int32_t in_dims = in->shape_view().NumAxes(); + const int32_t in_size = in->shape_view().elem_cnt(); int32_t in_stride[in_dims]; InitStride(in_stride, in_shape.data(), in_dims); - std::fill(out_ptr, out_ptr + out->shape().elem_cnt(), static_cast(0)); + std::fill(out_ptr, out_ptr + out->shape_view().elem_cnt(), static_cast(0)); for (int i = 0; i < in_size; ++i) { int offset = OffsetToNdIndexToOffset(i, in_stride, expand_stride.data(), in_dims); out_ptr[offset] += in_ptr[i]; diff --git a/oneflow/user/kernels/expand_kernel.cu b/oneflow/user/kernels/expand_kernel.cu index 104e2f6d4fa..fcfbb5b7dab 100644 --- a/oneflow/user/kernels/expand_kernel.cu +++ b/oneflow/user/kernels/expand_kernel.cu @@ -124,8 +124,8 @@ class GpuExpandKernel final : public user_op::OpKernel { return; } std::vector in_shape; - in_shape.resize(in->shape().NumAxes()); - for (int i = 0; i < in->shape().NumAxes(); ++i) { in_shape[i] = in->shape().At(i); } + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } std::vector out_shape; std::vector stride; @@ -133,8 +133,8 @@ class GpuExpandKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int32_t out_dims = out->shape().NumAxes(); - const int32_t out_size = out->shape().elem_cnt(); + const int32_t out_dims = out->shape_view().NumAxes(); + const 
int32_t out_size = out->shape_view().elem_cnt(); STRIDES expand_stride; for (int i = 0; i < out_dims; ++i) { expand_stride.val[i] = stride[i]; } @@ -178,8 +178,8 @@ class GpuExpandGradKernel final : public user_op::OpKernel { ctx->Attr>("logical_expand_shape"); std::vector in_shape; - in_shape.resize(in->shape().NumAxes()); - for (int i = 0; i < in->shape().NumAxes(); ++i) { in_shape[i] = in->shape().At(i); } + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } std::vector out_shape; std::vector stride; CHECK_JUST(getOutShapeAndStrideForBp(logical_out_shape, logical_expand_shape, in_shape, @@ -188,9 +188,9 @@ class GpuExpandGradKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int32_t in_dims = in->shape().NumAxes(); - const int32_t in_size = in->shape().elem_cnt(); - const int32_t out_size = out->shape().elem_cnt(); + const int32_t in_dims = in->shape_view().NumAxes(); + const int32_t in_size = in->shape_view().elem_cnt(); + const int32_t out_size = out->shape_view().elem_cnt(); STRIDES expand_stride; for (int i = 0; i < in_dims; ++i) { expand_stride.val[i] = stride[i]; } diff --git a/oneflow/user/kernels/eye_kernel.cpp b/oneflow/user/kernels/eye_kernel.cpp index 1e7c102c320..0d99a303c43 100644 --- a/oneflow/user/kernels/eye_kernel.cpp +++ b/oneflow/user/kernels/eye_kernel.cpp @@ -34,7 +34,7 @@ class EyeKernel final : public OpKernel { T* out = out_tensor->mut_dptr(); Memset( ctx->stream(), out_tensor->mut_dptr(), 0, - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type())); + out_tensor->shape_view().elem_cnt() * GetSizeOfDataType(out_tensor->data_type())); EyeFunctor()(ctx->stream(), cols, std::min(cols, rows), out); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/fake_quantization_kernel.cpp b/oneflow/user/kernels/fake_quantization_kernel.cpp index 9b9ee9072b8..1cf2042f0fb 100644 --- a/oneflow/user/kernels/fake_quantization_kernel.cpp +++ b/oneflow/user/kernels/fake_quantization_kernel.cpp @@ -90,10 +90,10 @@ class CpuFakeQuantizationKernel final : public user_op::OpKernel { if (quantization_formula == "google") { int64_t outer_num = 1; - int64_t inner_num = in->shape().elem_cnt(); - if (scale->shape().elem_cnt() > 1) { // per-channel quantization - outer_num = in->shape().At(0); - inner_num = in->shape().Count(1); + int64_t inner_num = in->shape_view().elem_cnt(); + if (scale->shape_view().elem_cnt() > 1) { // per-channel quantization + outer_num = in->shape_view().At(0); + inner_num = in->shape_view().Count(1); } if (quantization_scheme == "symmetric") { @@ -114,7 +114,7 @@ class CpuFakeQuantizationKernel final : public user_op::OpKernel { } } else if (quantization_formula == "cambricon") { FakeQuantizationPerLayerCambricon(in_ptr, scale_ptr[0], quantization_bit, - in->shape().elem_cnt(), out_ptr); + in->shape_view().elem_cnt(), out_ptr); } else { UNIMPLEMENTED(); } diff --git a/oneflow/user/kernels/fake_quantization_kernel.cu b/oneflow/user/kernels/fake_quantization_kernel.cu index 4bc066f3980..6cda702806a 100644 --- a/oneflow/user/kernels/fake_quantization_kernel.cu +++ b/oneflow/user/kernels/fake_quantization_kernel.cu @@ -115,9 +115,9 @@ class GpuFakeQuantizationKernel final : public user_op::OpKernel { const int32_t quantization_bit = ctx->Attr("quantization_bit"); const std::string quantization_formula = ctx->Attr("quantization_formula"); - const 
int64_t elements = in->shape().elem_cnt(); - const int64_t panel_size = in->shape().Count(1); - const int64_t scale_size = scale->shape().elem_cnt(); + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t panel_size = in->shape_view().Count(1); + const int64_t scale_size = scale->shape_view().elem_cnt(); // round to even auto origin_round_mode = std::fegetround(); diff --git a/oneflow/user/kernels/flip_kernel.cpp b/oneflow/user/kernels/flip_kernel.cpp index cdf4d97a77e..1627b5a134c 100644 --- a/oneflow/user/kernels/flip_kernel.cpp +++ b/oneflow/user/kernels/flip_kernel.cpp @@ -62,22 +62,22 @@ class FlipCpuKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = y_tensor->shape().elem_cnt(); + const int32_t elem_cnt = y_tensor->shape_view().elem_cnt(); if (elem_cnt == 0) { return; } - const int32_t total_dims = y_tensor->shape().NumAxes(); + const int32_t total_dims = y_tensor->shape_view().NumAxes(); std::vector dims = ctx->Attr>("dims"); VIS vis; for (auto x : dims) { vis.val[x] = true; } SIZE_V sizes_v; - for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape().At(i); } + for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape_view().At(i); } // TODO(bbuf) delete strides caluculate, after tensor strides supported SIZE_V strides_v; strides_v.val[total_dims - 1] = 1; for (int32_t i = total_dims - 2; i >= 0; i--) { - strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape().At(i + 1); + strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape_view().At(i + 1); } FlipCpuForward(elem_cnt, total_dims, sizes_v, vis, strides_v, x_tensor->dptr(), diff --git a/oneflow/user/kernels/flip_kernel.cu b/oneflow/user/kernels/flip_kernel.cu index 812c3301e25..b415d469391 100644 --- a/oneflow/user/kernels/flip_kernel.cu +++ b/oneflow/user/kernels/flip_kernel.cu @@ -63,22 +63,22 @@ class FlipGpuKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = y_tensor->shape().elem_cnt(); + const int32_t elem_cnt = y_tensor->shape_view().elem_cnt(); if (elem_cnt == 0) { return; } - const int32_t total_dims = y_tensor->shape().NumAxes(); + const int32_t total_dims = y_tensor->shape_view().NumAxes(); std::vector dims = ctx->Attr>("dims"); VIS vis; for (auto x : dims) { vis.val[x] = true; } SIZE_V sizes_v; - for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape().At(i); } + for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape_view().At(i); } // TODO(bbuf) delete strides caluculate, after tensor strides supported SIZE_V strides_v; strides_v.val[total_dims - 1] = 1; for (int32_t i = total_dims - 2; i >= 0; i--) { - strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape().At(i + 1); + strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape_view().At(i + 1); } RUN_CUDA_KERNEL((FlipGpuForward), ctx->stream(), elem_cnt, elem_cnt, total_dims, sizes_v, vis, strides_v, x_tensor->dptr(), y_tensor->mut_dptr()); diff --git a/oneflow/user/kernels/fold_kernel.cpp b/oneflow/user/kernels/fold_kernel.cpp index 1a48f75cec9..f8a8a8c3221 100644 --- a/oneflow/user/kernels/fold_kernel.cpp +++ 
b/oneflow/user/kernels/fold_kernel.cpp @@ -71,9 +71,10 @@ class FoldKernel final : public OpKernel { const std::vector stride = ctx->Attr>("strides"); const auto& state_ptr = CreateFoldOpKernelState( - input->shape(), output_size, kernel_size, padding, stride, dilation); + input->shape_view(), output_size, kernel_size, padding, stride, dilation); const FoldParams params = state_ptr->params(); - size_t out_bytes_size = output->shape().elem_cnt() * GetSizeOfDataType(output->data_type()); + size_t out_bytes_size = + output->shape_view().elem_cnt() * GetSizeOfDataType(output->data_type()); Memset(ctx->stream(), output->mut_dptr(), 0, out_bytes_size); FoldKernelUtil::Forward( ctx->stream(), &params, input->dptr(), output->mut_dptr()); diff --git a/oneflow/user/kernels/fused_bias_add_kernel.cu b/oneflow/user/kernels/fused_bias_add_kernel.cu index 9d2da281259..8acf3601c50 100644 --- a/oneflow/user/kernels/fused_bias_add_kernel.cu +++ b/oneflow/user/kernels/fused_bias_add_kernel.cu @@ -339,10 +339,10 @@ class FusedFusedBiasAddKernel final : public user_op::OpKernel { const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape().Count(bias_add_axis + 1); - const auto n = a_tensor->shape().elem_cnt(); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); GeluFunctor gelu_functor{}; DispatchFusedBiasAddForwardImpl( ctx->stream(), gelu_functor, n, outer_size, bias_size, inner_size, a_tensor->dptr(), @@ -377,10 +377,10 @@ class FusedBiasAddMaskScaleKernel final : public user_op::OpKernel { auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); const int32_t bias_add_axis = ctx->Attr("axis"); const float scale = ctx->Attr("scale"); - const int64_t outer_size = a_tensor->shape().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape().Count(bias_add_axis + 1); - const auto n = a_tensor->shape().elem_cnt(); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); MaskAndScaleAddFunctor mask_and_scale_add_functor(mask_tensor->dptr(), @@ -423,10 +423,10 @@ class FusedFusedBiasAddGradKernel final : public user_op::OpKernel { const auto* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); auto* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape().Count(bias_add_axis + 1); - const auto n = a_tensor->shape().elem_cnt(); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); +
const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); GeluGradFunctor gelu_grad_functor; if (IsKernelSafeInt32(n)) { FusedBiasAddGradImpl( diff --git a/oneflow/user/kernels/fused_cast_scale_kernel.cpp b/oneflow/user/kernels/fused_cast_scale_kernel.cpp index 16cb168d3da..09e5da82251 100644 --- a/oneflow/user/kernels/fused_cast_scale_kernel.cpp +++ b/oneflow/user/kernels/fused_cast_scale_kernel.cpp @@ -29,7 +29,7 @@ class FusedCastScaleCpuKernel final : public user_op::OpKernel { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); const double scale_val = ctx->Attr("scale"); - const int64_t n = x->shape().elem_cnt(); + const int64_t n = x->shape_view().elem_cnt(); const T scale = *(scale_by_tensor->dptr()) * scale_val; const U* x_ptr = x->dptr(); T* y_ptr = y->mut_dptr(); diff --git a/oneflow/user/kernels/fused_cast_scale_kernel.cu b/oneflow/user/kernels/fused_cast_scale_kernel.cu index dbdd819c9f4..77502a78af8 100644 --- a/oneflow/user/kernels/fused_cast_scale_kernel.cu +++ b/oneflow/user/kernels/fused_cast_scale_kernel.cu @@ -78,7 +78,7 @@ class FusedCastScaleGpuKernel final : public user_op::OpKernel, public user_op:: const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t n = x->shape().elem_cnt(); + const int64_t n = x->shape_view().elem_cnt(); const double scale = ctx->Attr("scale"); const int64_t launch_n = ((std::is_same::value && std::is_same::value) || (std::is_same::value && std::is_same::value)) diff --git a/oneflow/user/kernels/fused_cross_feature_interaction.cu b/oneflow/user/kernels/fused_cross_feature_interaction.cu index 687724cb89e..d111ef69483 100644 --- a/oneflow/user/kernels/fused_cross_feature_interaction.cu +++ b/oneflow/user/kernels/fused_cross_feature_interaction.cu @@ -219,17 +219,18 @@ class FusedCrossFeatureInteractionKernel final : public user_op::OpKernel, user_op::Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); const std::string interaction_mode = ctx->Attr("interaction_mode"); - CHECK_EQ(out->shape().NumAxes(), 2); + CHECK_EQ(out->shape_view().NumAxes(), 2); size_t m = 0, n = 0, k = 0; - InferMatmulMNK(x->shape(), weight->shape(), /*trans_a=*/false, /*trans_b=*/true, &m, &n, &k); + InferMatmulMNK(x->shape_view(), weight->shape_view(), /*trans_a=*/false, /*trans_b=*/true, &m, + &n, &k); const double alpha = 1.0; double beta = 0.0; auto matmul = NewMatmulPrimitive(ctx); CHECK(matmul); matmul->Launch(ctx->stream(), m, n, k, alpha, x->dptr(), weight->dptr(), beta, matmul_result->mut_dptr()); - const int64_t elem_cnt = out->shape().elem_cnt(); - const int64_t cols = out->shape().At(1); + const int64_t elem_cnt = out->shape_view().elem_cnt(); + const int64_t cols = out->shape_view().At(1); if (interaction_mode == "vector") { DispatchFusedBiasAddMulAddResidualIndexType( ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), diff --git a/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu b/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu index 92ccdc3da01..db07942bfd5 100644 --- a/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu +++ b/oneflow/user/kernels/fused_cross_feature_interaction_grad.cu @@ -247,10 +247,10 @@ class 
FusedCrossFeatureInteractionGradKernel final : public OpKernel, public Cud const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - const int64_t batch_size = dy->shape().At(0); - const int64_t hidden_size = dy->shape().At(1); - const int64_t out_size = weight->shape().At(0); - const int64_t dy_elem_cnt = dy->shape().elem_cnt(); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t hidden_size = dy->shape_view().At(1); + const int64_t out_size = weight->shape_view().At(0); + const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); @@ -266,7 +266,7 @@ class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public Cud } size_t m = 0, n = 0, k = 0; DimVector dy_shape(2); - dy->shape().ToDimVector(&dy_shape); + dy->shape_view().ToDimVector(&dy_shape); DimVector ones_buf_shape(2); ones_buf_shape.at(0) = 1; ones_buf_shape.at(1) = batch_size; @@ -285,7 +285,7 @@ class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public Cud ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), hidden_size)); DimVector dy_mul_x0_shape(2); - dy->shape().ToDimVector(&dy_mul_x0_shape); + dy->shape_view().ToDimVector(&dy_mul_x0_shape); ones_buf_shape.at(0) = hidden_size; ones_buf_shape.at(1) = 1; InferMatmulMNK(dy_mul_x0_shape, ones_buf_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, @@ -300,7 +300,7 @@ class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public Cud dmatmul_result_shape.at(0) = batch_size; dmatmul_result_shape.at(1) = 1; // todo change to hidden size DimVector weight_shape(2); - weight->shape().ToDimVector(&weight_shape); + weight->shape_view().ToDimVector(&weight_shape); InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, @@ -311,7 +311,7 @@ class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public Cud // step4: Get dw. 
DimVector x_shape(2); - x->shape().ToDimVector(&x_shape); + x->shape_view().ToDimVector(&x_shape); InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); @@ -363,10 +363,10 @@ class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public C const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - const int64_t batch_size = dy->shape().At(0); - const int64_t in_size = weight->shape().At(1); - const int64_t hidden_size = weight->shape().At(0); - const int64_t dy_elem_cnt = dy->shape().elem_cnt(); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t in_size = weight->shape_view().At(1); + const int64_t hidden_size = weight->shape_view().At(0); + const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); @@ -391,7 +391,7 @@ class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public C dmatmul_result_shape.at(0) = batch_size; dmatmul_result_shape.at(1) = hidden_size; DimVector weight_shape(2); - weight->shape().ToDimVector(&weight_shape); + weight->shape_view().ToDimVector(&weight_shape); size_t m = 0, n = 0, k = 0; InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); @@ -405,7 +405,7 @@ class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public C // step4: Get dw. DimVector x_shape(2); - x->shape().ToDimVector(&x_shape); + x->shape_view().ToDimVector(&x_shape); InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); @@ -420,7 +420,7 @@ class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public C ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); } DimVector dy_shape(2); - dy->shape().ToDimVector(&dy_shape); + dy->shape_view().ToDimVector(&dy_shape); DimVector ones_buf_shape(2); ones_buf_shape.at(0) = 1; ones_buf_shape.at(1) = batch_size; diff --git a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu index 2a3ae5007eb..250e7588780 100644 --- a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu +++ b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu @@ -109,8 +109,8 @@ void ConcatFeatures(user_op::KernelComputeContext* ctx, int64_t dst_rows, int64_ int64_t out_col_offset = 0; for (int64_t i = 0; i < feature_input_size; ++i) { const user_op::Tensor* feature = ctx->Tensor4ArgNameAndIndex("features", i); - const int64_t feature_rows = feature->shape().At(0); - const int64_t feature_cols = feature->shape().Count(1); + const int64_t feature_rows = feature->shape_view().At(0); + const int64_t feature_cols = feature->shape_view().Count(1); DimVector dst_pos_vec = {0, out_col_offset}; DimVector src_shape = {feature_rows, feature_cols}; DimVector src_pos_vec = {0, 0}; @@ -171,8 +171,8 @@ void ConcatFeaturesGrad(user_op::KernelComputeContext* ctx, const int64_t batch_ int64_t in_col_offset = 0; for (int64_t i = 0; i < ctx->output_size("features_grad"); ++i) { user_op::Tensor* feature_grad = ctx->Tensor4ArgNameAndIndex("features_grad", i); - const int64_t feature_grad_rows = feature_grad->shape().At(0); - const int64_t feature_grad_cols = feature_grad->shape().Count(1); + 
const int64_t feature_grad_rows = feature_grad->shape_view().At(0); + const int64_t feature_grad_cols = feature_grad->shape_view().Count(1); DimVector dst_shape = {feature_grad_rows, feature_grad_cols}; DimVector dst_pos_vec = {0, 0}; DimVector src_pos_vec = {0, in_col_offset}; @@ -643,8 +643,8 @@ bool DispatchFeatureInteractionDotPackSize(user_op::KernelComputeContext* ctx, const int32_t input_size) { CHECK_LE(input_size, max_in) << input_size; user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t batch_size = out->shape().At(0); - const int64_t out_num_cols = out->shape().At(1); + const int64_t batch_size = out->shape_view().At(0); + const int64_t out_num_cols = out->shape_view().At(1); const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); DotFwdParam param; param.num_in = input_size; @@ -661,7 +661,7 @@ bool DispatchFeatureInteractionDotPackSize(user_op::KernelComputeContext* ctx, if (ctx->has_input("output_concat", 0)) { const user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); param.output_concat = output_concat->dptr(); - param.output_concat_size = output_concat->shape().At(1); + param.output_concat_size = output_concat->shape_view().At(1); } else { param.output_concat = nullptr; param.output_concat_size = 0; @@ -688,8 +688,8 @@ bool DispatchFeatureInteractionDotBackwardPackSize(user_op::KernelComputeContext const int32_t input_size) { CHECK_LE(input_size, max_in) << input_size; user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t batch_size = dy->shape().At(0); - const int64_t out_num_cols = dy->shape().At(1); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t out_num_cols = dy->shape_view().At(1); const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); DotBwdParam param; param.num_in = input_size; @@ -707,7 +707,7 @@ bool DispatchFeatureInteractionDotBackwardPackSize(user_op::KernelComputeContext if (ctx->has_output("output_concat_grad", 0)) { user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); param.output_concat_grad = output_concat_grad->mut_dptr(); - param.output_concat_size = output_concat_grad->shape().At(1); + param.output_concat_size = output_concat_grad->shape_view().At(1); } else { param.output_concat_grad = nullptr; param.output_concat_size = 0; @@ -862,8 +862,8 @@ void DispatchFeatureInteractionSumInputSize(user_op::KernelComputeContext* ctx, const int32_t input_size) { CHECK_LE(input_size, max_in) << input_size; user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t batch_size = out->shape().At(0); - const int64_t vector_size = out->shape().At(1); + const int64_t batch_size = out->shape_view().At(0); + const int64_t vector_size = out->shape_view().At(1); Param param; param.num_in = input_size; param.out = out->mut_dptr(); @@ -879,8 +879,8 @@ void DispatchFeatureInteractionSumGradInputSize(user_op::KernelComputeContext* c const int32_t input_size) { CHECK_LE(input_size, max_in) << input_size; const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t batch_size = dy->shape().At(0); - const int64_t vector_size = dy->shape().At(1); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t vector_size = dy->shape_view().At(1); int block_dim_x; int block_dim_y; GetBlockDims(vector_size, &block_dim_x, &block_dim_y); @@ -977,7 +977,7 @@ class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, void 
Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); const DataType data_type = out->data_type(); - CHECK_LT(out->shape().elem_cnt(), GetMaxVal()); + CHECK_LT(out->shape_view().elem_cnt(), GetMaxVal()); auto* cuda_stream = ctx->stream()->As(); if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { @@ -985,14 +985,14 @@ class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, if (success == true) { return; } } user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t batch_size = out->shape().At(0); + const int64_t batch_size = out->shape_view().At(0); int64_t features_concated_dim = 0; for (int64_t i = 0; i < ctx->input_size("features"); ++i) { features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); } const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - const int64_t out_dim = out->shape().At(1); + const int64_t out_dim = out->shape_view().At(1); const int32_t output_padding = ctx->Attr("output_padding"); const int64_t valid_out_dim = out_dim - output_padding; const bool self_interaction = ctx->Attr("self_interaction"); @@ -1010,7 +1010,7 @@ class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size + gather_indices_size); size_t padded_concated_features_size = GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - CHECK_GE(tmp_buffer->shape().elem_cnt(), + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), matmul_out_size + gather_indices_size + padded_concated_features_size); ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, padded_concated_features_ptr); @@ -1025,11 +1025,11 @@ class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, const T* output_concat_ptr = nullptr; if (ctx->has_input("output_concat", 0)) { user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); - output_concat_end_dim = output_concat->shape().At(1); + output_concat_end_dim = output_concat->shape_view().At(1); output_concat_ptr = output_concat->dptr(); } CHECK_EQ(valid_out_dim, output_concat_end_dim + interaction_dim); - GatherConcatKernel(ctx->stream(), out->shape().elem_cnt(), out_dim, valid_out_dim, + GatherConcatKernel(ctx->stream(), out->shape_view().elem_cnt(), out_dim, valid_out_dim, features_concated_dim, concated_padded_dim, output_concat_end_dim, self_interaction, matmul_out, output_concat_ptr, gather_indices_ptr, out->mut_dptr()); @@ -1091,14 +1091,14 @@ class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, bool success = TryLaunchTensorCoreDotBackwardKernel(ctx); if (success == true) { return; } } - const int64_t batch_size = dy->shape().At(0); + const int64_t batch_size = dy->shape_view().At(0); int64_t features_concated_dim = 0; for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); } const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features_grad", 0)->shape().At(2); - const int64_t out_dim = dy->shape().At(1); + const int64_t out_dim = dy->shape_view().At(1); const bool 
self_interaction = ctx->Attr("self_interaction"); T* matmul_out_grad_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); size_t matmul_out_grad_size = @@ -1112,7 +1112,7 @@ class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, size_t padded_concated_features_size = padded_concated_features_grad_size; CHECK_LE( matmul_out_grad_size + padded_concated_features_grad_size + padded_concated_features_size, - tmp_buffer->shape().elem_cnt()); + tmp_buffer->shape_view().elem_cnt()); ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, padded_concated_features_ptr); @@ -1121,7 +1121,7 @@ class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, if (ctx->has_output("output_concat_grad", 0)) { user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); output_concat_grad_ptr = output_concat_grad->mut_dptr(); - output_concat_end_dim = output_concat_grad->shape().At(1); + output_concat_end_dim = output_concat_grad->shape_view().At(1); } ScatterSplitAddTranspose(ctx->stream(), batch_size, out_dim, concated_padded_dim, features_concated_dim, output_concat_end_dim, self_interaction, diff --git a/oneflow/user/kernels/fused_gru_cell_kernel.cu b/oneflow/user/kernels/fused_gru_cell_kernel.cu index 752dd912f49..3e91268e939 100644 --- a/oneflow/user/kernels/fused_gru_cell_kernel.cu +++ b/oneflow/user/kernels/fused_gru_cell_kernel.cu @@ -269,9 +269,9 @@ class GpuFusedGruCellKernel final : public user_op::OpKernel { T* hy_ptr = hy->mut_dptr(); T* workspace_ptr = workspace->mut_dptr(); - const int64_t hx_numel = hx->shape().elem_cnt(); - const int64_t workspace_numel = workspace->shape().elem_cnt(); - const int64_t hidden_size = hx->shape().At(hx->shape().NumAxes() - 1); + const int64_t hx_numel = hx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = hx->shape_view().At(hx->shape_view().NumAxes() - 1); FusedGruCellFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, hy_ptr, workspace_ptr); @@ -316,9 +316,9 @@ class GpuFusedGruCellGradFloatKernel final : public user_op::OpKernel { grad_hx_ptr = grad_hx->mut_dptr(); } - const int64_t hx_numel = grad_hy->shape().elem_cnt(); - const int64_t workspace_numel = workspace->shape().elem_cnt(); - const int64_t hidden_size = grad_hy->shape().At(grad_hy->shape().NumAxes() - 1); + const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, grad_hidden_gates_ptr, grad_hx_ptr); @@ -329,19 +329,21 @@ class GpuFusedGruCellGradFloatKernel final : public user_op::OpKernel { std::vector axis; axis.push_back(0); const Shape& reduced_shape = - CreateReducedShape(grad_input_gates->shape(), {axis.begin(), axis.end()}); + CreateReducedShape(grad_input_gates->shape_view(), {axis.begin(), axis.end()}); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); NdarrayReduce::Reduce( ctx->stream(), XpuVarNdarray(reduced_shape, grad_input_bias_ptr), - XpuVarNdarray(grad_input_gates->shape(), grad_input_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape(), tmp_buffer->mut_dptr())); + XpuVarNdarray(grad_input_gates->shape_view(), + 
grad_input_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); float* grad_hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0)->mut_dptr(); NdarrayReduce::Reduce( ctx->stream(), XpuVarNdarray(reduced_shape, grad_hidden_bias_ptr), - XpuVarNdarray(grad_hidden_gates->shape(), grad_hidden_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape(), tmp_buffer->mut_dptr())); + XpuVarNdarray(grad_hidden_gates->shape_view(), + grad_hidden_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); } } @@ -389,9 +391,9 @@ class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { grad_hx_ptr = grad_hx->mut_dptr(); } - const int64_t hx_numel = grad_hy->shape().elem_cnt(); - const int64_t workspace_numel = workspace->shape().elem_cnt(); - const int64_t hidden_size = grad_hy->shape().At(grad_hy->shape().NumAxes() - 1); + const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, grad_hidden_gates_ptr, grad_hx_ptr); @@ -400,7 +402,7 @@ class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { std::vector axis; axis.push_back(0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = grad_input_gates->shape(); + const ShapeView& in_shape = grad_input_gates->shape_view(); const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); float* in_tmp_buffer = tmp_buffer->mut_dptr(); const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); @@ -413,7 +415,7 @@ class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { const size_t reduce_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape().elem_cnt()); + tmp_buffer->shape_view().elem_cnt()); auto h2f = ep::primitive::NewPrimitive( ctx->device_type(), DataType::kFloat16, DataType::kFloat); CHECK(h2f); @@ -430,7 +432,7 @@ class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0); f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape().elem_cnt()); + output_tensor->shape_view().elem_cnt()); h2f->Launch(ctx->stream(), grad_hidden_gates->dptr(), in_tmp_buffer, in_shape.elem_cnt()); @@ -441,7 +443,7 @@ class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { output_tensor = ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0); f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape().elem_cnt()); + output_tensor->shape_view().elem_cnt()); } } diff --git a/oneflow/user/kernels/fused_lstm_cell_kernel.cu b/oneflow/user/kernels/fused_lstm_cell_kernel.cu index 9f42fc41710..568ab44d482 100644 --- a/oneflow/user/kernels/fused_lstm_cell_kernel.cu +++ b/oneflow/user/kernels/fused_lstm_cell_kernel.cu @@ -314,9 +314,9 @@ class GpuFusedLstmCellKernel final : public user_op::OpKernel { T* hy_ptr = hy->mut_dptr(); T* cy_ptr = cy->mut_dptr(); T* workspace_ptr = workspace->mut_dptr(); - const int64_t cx_numel = cx->shape().elem_cnt(); - const int64_t workspace_numel = 
workspace->shape().elem_cnt(); - const int64_t hidden_size = cx->shape().At(cx->shape().NumAxes() - 1); + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); FusedLstmCellFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); @@ -363,9 +363,9 @@ class GpuFusedLstmCellGradFloatKernel final : public user_op::OpKernel { if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } - const int64_t cx_numel = cx->shape().elem_cnt(); - const int64_t workspace_numel = workspace->shape().elem_cnt(); - const int64_t hidden_size = cx->shape().At(cx->shape().NumAxes() - 1); + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, grad_gates_ptr, grad_cx_ptr); @@ -375,12 +375,12 @@ class GpuFusedLstmCellGradFloatKernel final : public user_op::OpKernel { std::vector axis; axis.push_back(0); const Shape& reduced_shape = - CreateReducedShape(workspace->shape(), {axis.begin(), axis.end()}); + CreateReducedShape(workspace->shape_view(), {axis.begin(), axis.end()}); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); NdarrayReduce::Reduce( ctx->stream(), XpuVarNdarray(reduced_shape, grad_bias_ptr), - XpuVarNdarray(grad_gates->shape(), grad_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape(), tmp_buffer->mut_dptr())); + XpuVarNdarray(grad_gates->shape_view(), grad_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); } } @@ -433,9 +433,9 @@ class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } - const int64_t cx_numel = cx->shape().elem_cnt(); - const int64_t workspace_numel = workspace->shape().elem_cnt(); - const int64_t hidden_size = cx->shape().At(cx->shape().NumAxes() - 1); + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, grad_gates_ptr, grad_cx_ptr); @@ -444,7 +444,7 @@ class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { std::vector axis; axis.push_back(0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = grad_gates->shape(); + const ShapeView& in_shape = grad_gates->shape_view(); const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); float* in_tmp_buffer = tmp_buffer->mut_dptr(); const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); @@ -457,7 +457,7 @@ class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { const size_t reduce_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape().elem_cnt()); + 
tmp_buffer->shape_view().elem_cnt()); auto h2f = ep::primitive::NewPrimitive( ctx->device_type(), DataType::kFloat16, DataType::kFloat); CHECK(h2f); @@ -473,7 +473,7 @@ class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_bias", 0); f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape().elem_cnt()); + output_tensor->shape_view().elem_cnt()); } } diff --git a/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu b/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu index 9b6f6fc431c..3d6785dbc2f 100644 --- a/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu +++ b/oneflow/user/kernels/fused_matmul_bias_add_relu_dropout.cu @@ -380,7 +380,7 @@ class FusedMatmulBiasAddReluDropoutKernel final : public user_op::OpKernel, // Currently only support 2D matmul. DimVector in_shape(2); - x->shape().ToDimVector(&in_shape); + x->shape_view().ToDimVector(&in_shape); DimVector weight_shape(2); const void* in_buf_ptr = x->dptr(); @@ -391,8 +391,8 @@ class FusedMatmulBiasAddReluDropoutKernel final : public user_op::OpKernel, user_op::Tensor* cublas_aux = ctx->Tensor4ArgNameAndIndex("cublas_aux", idx); const int64_t batchsize = in_shape.at(0); - const int64_t out_feature = weight->shape().At(0); - weight->shape().ToDimVector(&weight_shape); + const int64_t out_feature = weight->shape_view().At(0); + weight->shape_view().ToDimVector(&weight_shape); size_t matmul_out_elem_cnt = batchsize * out_feature; InferMatmulCublasMNK(in_shape, weight_shape, @@ -428,7 +428,7 @@ class FusedMatmulBiasAddReluDropoutKernel final : public user_op::OpKernel, if (idx != weight_size - 1 || !skip_final_activation || rate != 0.0f) { OF_CUDA_CHECK(cudaMemsetAsync(cublas_aux->mut_dptr(), 0, - cublas_aux->shape().elem_cnt() * sizeof(int32_t), + cublas_aux->shape_view().elem_cnt() * sizeof(int32_t), cuda_stream->cuda_stream())); } diff --git a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu index 85dc3d492df..3d91a5240e0 100644 --- a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu +++ b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.cu @@ -120,9 +120,9 @@ class FusedReluDropoutGradKernel final : public user_op::OpKernel, user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const float scale = ctx->Attr("scale"); - const int64_t cols = dy->shape().At(1); - const int64_t aux_ld = mask->shape().At(1) * 32; - const int64_t elem_cnt = dy->shape().elem_cnt(); + const int64_t cols = dy->shape_view().At(1); + const int64_t aux_ld = mask->shape_view().At(1) * 32; + const int64_t elem_cnt = dy->shape_view().elem_cnt(); LaunchVectorizedReluDropoutBackwardKernel( ctx->stream(), elem_cnt, cols, aux_ld, scale, reinterpret_cast(dy->dptr()), mask->dptr(), reinterpret_cast(dx->mut_dptr())); diff --git a/oneflow/user/kernels/fused_scale_mask_softmax.cu b/oneflow/user/kernels/fused_scale_mask_softmax.cu index 9c9713c7a8c..f977e6cf20f 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax.cu +++ b/oneflow/user/kernels/fused_scale_mask_softmax.cu @@ -16,64 +16,88 @@ limitations under the License. 
#include "oneflow/core/framework/framework.h" #include "oneflow/core/cuda/softmax.cuh" #include "oneflow/core/ep/cuda/cuda_stream.h" - +#include "oneflow/user/kernels/fused_scale_mask_softmax.cuh" namespace oneflow { -template -struct ScaleMaskLoad { - ScaleMaskLoad(const SRC* src, const bool* mask, int64_t row_size, SRC fill, SRC scale) - : src(src), mask(mask), row_size(row_size), fill(fill), scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - dst[i] = static_cast(fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(scale); - } - } - } - const SRC* src; - const bool* mask; - int64_t row_size; - SRC fill; - SRC scale; -}; +namespace { -template -struct ScaleMaskStore { - ScaleMaskStore(DST* dst, const bool* mask, int64_t row_size, DST fill, DST scale) - : dst(dst), mask(mask), row_size(row_size), fill(fill), scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - pack.elem[i] = fill; - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(scale); - } - } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - const bool* mask; - int64_t row_size; - DST fill; - DST scale; -}; +template +void LaunchBroadcastForwardKernel(cudaStream_t stream, const T* x, T* y, const MASK* mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const int64_t* input_dims, + const int64_t* mask_dims) { + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::BroadcastScaleMaskLoad load( + x, mask, params); + cuda::softmax::DirectStore store(y, cols); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} -template +template +void LaunchElementwiseForwardKernel(cudaStream_t stream, const T* x, T* y, const MASK* mask, + const int64_t rows, const int64_t cols, const float fill, + const float scale) { + oneflow::fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); + cuda::softmax::DirectStore store(y, cols); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchBroadcastBackwardKernel(cudaStream_t stream, const T* y, const T* dy, T* dx, + const MASK* mask, const int64_t elem_cnt, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const int64_t* input_dims, const int64_t* mask_dims) { + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper 
mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + fused_scale_mask_softmax::BroadcastScaleMaskStore store( + dx, mask, params); + OF_CUDA_CHECK(( + cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); +} + +template +void LaunchElementwiseBackwardKernel(cudaStream_t stream, const T* y, const T* dy, T* dx, + const MASK* mask, const int64_t rows, const int64_t cols, + const float fill, const float scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); + OF_CUDA_CHECK(( + cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); +} + +constexpr int32_t kMaxNumDims = 5; + +template class FusedScaleMaskSoftmaxKernel final : public user_op::OpKernel { public: FusedScaleMaskSoftmaxKernel() = default; @@ -85,33 +109,50 @@ class FusedScaleMaskSoftmaxKernel final : public user_op::OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const ShapeView& x_shape = x->shape(); + const float mask_fill_value = ctx->Attr("mask_fill_value"); + const float scale_value = ctx->Attr("scale_value"); + const ShapeView& x_shape = x->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); CHECK_GE(x_shape.NumAxes(), 2); + const int64_t elem_cnt = x_shape.elem_cnt(); const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const size_t num_input_dims = x_shape.NumAxes(); + const int64_t* input_dims = x_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); using ComputeType = typename cuda::softmax::DefaultComputeType::type; - ScaleMaskLoad load(x->dptr(), mask->dptr(), cols, - ctx->Attr("mask_fill_value"), - ctx->Attr("scale_value")); - cuda::softmax::DirectStore store(y->mut_dptr(), cols); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - ctx->stream()->As()->cuda_stream(), load, store, rows, cols))); + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseForwardKernel( + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), + mask->dptr(), rows, cols, mask_fill_value, scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastForwardKernel( \ + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ + mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ + simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); 
+ } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_FUCED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_FUCED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(half) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(float) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(double) -#undef REGISTER_FUCED_SCALE_MASK_SOFTMAX_CUDA_KERNEL - -template +template class FusedScaleMaskSoftmaxGradKernel final : public user_op::OpKernel { public: FusedScaleMaskSoftmaxGradKernel() = default; @@ -124,31 +165,72 @@ class FusedScaleMaskSoftmaxGradKernel final : public user_op::OpKernel { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& dy_shape = dy->shape(); + const float scale_value = ctx->Attr("scale_value"); + const float mask_fill_value = static_cast(0.0); + const ShapeView& dy_shape = dy->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t elem_cnt = dy_shape.elem_cnt(); const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t* input_dims = dy_shape.ptr(); + const size_t num_input_dims = dy_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_y(y->dptr(), cols); - cuda::softmax::DirectLoad load_dy(dy->dptr(), cols); - ScaleMaskStore store(dx->mut_dptr(), mask->dptr(), cols, - static_cast(0.0), ctx->Attr("scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - ctx->stream()->As()->cuda_stream(), load_y, load_dy, store, rows, cols))); + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseBackwardKernel( + ctx->stream()->As()->cuda_stream(), y->dptr(), dy->dptr(), + dx->mut_dptr(), mask->dptr(), rows, cols, mask_fill_value, scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastBackwardKernel( \ + ctx->stream()->As()->cuda_stream(), y->dptr(), dy->dptr(), \ + dx->mut_dptr(), mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ + simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_FUCED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); +} // namespace + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax") \ + .SetCreateFn>() \ + 
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); -REGISTER_FUCED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(half) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(float) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(double) -#undef REGISTER_FUCED_SCALE_MASK_SOFTMAX_GRAD_KERNEL +REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL } // namespace oneflow diff --git a/oneflow/user/kernels/fused_scale_mask_softmax.cuh b/oneflow/user/kernels/fused_scale_mask_softmax.cuh new file mode 100644 index 00000000000..1d36daadca1 --- /dev/null +++ b/oneflow/user/kernels/fused_scale_mask_softmax.cuh @@ -0,0 +1,216 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/nd_index_offset_helper.h" + +namespace oneflow { + +namespace fused_scale_mask_softmax { + +namespace { + +void SimplifyBroadcastDims(size_t num_a_dims, const int64_t* a_dims, size_t num_b_dims, + const int64_t* b_dims, size_t* simplified_num_dims, + int64_t* simplified_a_dims, int64_t* simplified_b_dims) { + const size_t num_max_dims = std::max(num_a_dims, num_b_dims); + auto MakeGetDim = [num_max_dims](size_t num_dims, const int64_t* dims) { + const int64_t num_padding_dims = num_max_dims - num_dims; + return [num_padding_dims, dims](size_t index) { + return index < num_padding_dims ? 
1 : dims[index - num_padding_dims]; + }; + }; + auto GetADim = MakeGetDim(num_a_dims, a_dims); + auto GetBDim = MakeGetDim(num_b_dims, b_dims); + *simplified_num_dims = 0; + bool prev_broadcast_a = false; + bool prev_broadcast_b = false; + for (int64_t i = 0; i < num_max_dims; ++i) { + const int64_t a_dim = GetADim(i); + const int64_t b_dim = GetBDim(i); + const int64_t broadcast_dim = std::max(a_dim, b_dim); + CHECK_GT(broadcast_dim, 0); + const bool broadcast_a = (a_dim == 1); + const bool broadcast_b = (b_dim == 1); + CHECK((a_dim == broadcast_dim) || broadcast_a); + CHECK((b_dim == broadcast_dim) || broadcast_b); + if (broadcast_dim == 1) { + continue; + } else if (*simplified_num_dims != 0 + && (prev_broadcast_a == broadcast_a && prev_broadcast_b == broadcast_b)) { + simplified_a_dims[*simplified_num_dims - 1] *= a_dim; + simplified_b_dims[*simplified_num_dims - 1] *= b_dim; + } else { + simplified_a_dims[*simplified_num_dims] = a_dim; + simplified_b_dims[*simplified_num_dims] = b_dim; + *simplified_num_dims += 1; + prev_broadcast_a = broadcast_a; + prev_broadcast_b = broadcast_b; + } + } +} + +template +struct BroadcastMaskSoftmaxParams { + NdIndexOffsetHelper src_index_helper; + NdIndexOffsetHelper mask_index_helper; + const int64_t* mask_dims{}; + int64_t row_size; + float fill; + float scale; +}; + +struct ElementwiseMaskSoftmaxParams { + int64_t row_size; + float fill; + float scale; +}; + +template +struct BroadcastScaleMaskLoad { + BroadcastScaleMaskLoad(const SRC* src, const MASK* mask, + BroadcastMaskSoftmaxParams params) + : src(src), mask(mask), params(params) { + for (int i = 0; i < num_dims; i++) { mask_dims[i] = params.mask_dims[i]; } + } + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + cuda::softmax::Pack mask_pack; + const IndexType offset = row * params.row_size + col; + IndexType input_index[num_dims]; + IndexType mask_index[num_dims]; + params.src_index_helper.OffsetToNdIndex(offset, input_index); + for (int dim = 0; dim < num_dims; ++dim) { + if (mask_dims[dim] == 1) { + mask_index[dim] = 0; + } else { + mask_index[dim] = input_index[dim]; + } + } + const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); + pack.storage = *(reinterpret_cast*>(src) + offset / N); + mask_pack.storage = + *(reinterpret_cast*>(mask) + mask_offset / N); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + dst[i] = static_cast(params.fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(params.scale); + } + } + } + const SRC* src; + const MASK* mask; + int64_t mask_dims[num_dims]; + BroadcastMaskSoftmaxParams params; +}; + +template +struct ElementwiseScaleMaskLoad { + ElementwiseScaleMaskLoad(const SRC* src, const MASK* mask, ElementwiseMaskSoftmaxParams param) + : src(src), mask(mask), param(param) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * param.row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + dst[i] = static_cast(param.fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(param.scale); + } + } + } + const SRC* src; + const MASK* mask; + ElementwiseMaskSoftmaxParams param; +}; + +template +struct BroadcastScaleMaskStore { + BroadcastScaleMaskStore(DST* dst, 
const MASK* mask, + BroadcastMaskSoftmaxParams params) + : dst(dst), mask(mask), params(params) { + for (int i = 0; i < num_dims; ++i) { mask_dims[i] = params.mask_dims[i]; } + } + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + cuda::softmax::Pack mask_pack; + const IndexType offset = row * params.row_size + col; + IndexType input_index[num_dims]; + IndexType mask_index[num_dims]; + params.src_index_helper.OffsetToNdIndex(offset, input_index); + for (int dim = 0; dim < num_dims; ++dim) { + if (mask_dims[dim] == 1) { + mask_index[dim] = 0; + } else { + mask_index[dim] = input_index[dim]; + } + } + const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); + mask_pack.storage = + *(reinterpret_cast*>(mask) + mask_offset / N); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + pack.elem[i] = static_cast(params.fill); + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); + } + } + *(reinterpret_cast*>(dst) + offset / N) = pack.storage; + } + DST* dst; + const MASK* mask; + int64_t mask_dims[num_dims]; + BroadcastMaskSoftmaxParams params; +}; + +template +struct ElementwiseScaleMaskStore { + ElementwiseScaleMaskStore(DST* dst, const MASK* mask, ElementwiseMaskSoftmaxParams params) + : dst(dst), mask(mask), params(params) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * params.row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + pack.elem[i] = params.fill; + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); + } + } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + const MASK* mask; + ElementwiseMaskSoftmaxParams params; +}; + +} // namespace + +} // namespace fused_scale_mask_softmax + +} // namespace oneflow diff --git a/oneflow/user/kernels/fused_scale_mask_softmax_dropout.cu b/oneflow/user/kernels/fused_scale_mask_softmax_dropout.cu index 4c21d12e373..a0bec673a4a 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax_dropout.cu +++ b/oneflow/user/kernels/fused_scale_mask_softmax_dropout.cu @@ -16,62 +16,11 @@ limitations under the License. 
#include "oneflow/core/framework/framework.h" #include "oneflow/core/cuda/softmax.cuh" #include "oneflow/core/ep/cuda/cuda_stream.h" +#include "oneflow/user/kernels/fused_scale_mask_softmax.cuh" namespace oneflow { -template -struct ScaleMaskLoad { - ScaleMaskLoad(const SRC* src, const bool* mask, int64_t row_size, SRC fill, SRC scale) - : src(src), mask(mask), row_size(row_size), fill(fill), scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - dst[i] = static_cast(fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(scale); - } - } - } - const SRC* src; - const bool* mask; - int64_t row_size; - SRC fill; - SRC scale; -}; - -template -struct ScaleMaskStore { - ScaleMaskStore(DST* dst, const bool* mask, int64_t row_size, DST fill, DST scale) - : dst(dst), mask(mask), row_size(row_size), fill(fill), scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - pack.elem[i] = fill; - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(scale); - } - } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - const bool* mask; - int64_t row_size; - DST fill; - DST scale; -}; +namespace { template struct DropoutLoad { @@ -124,7 +73,87 @@ struct DropoutStore { DST scale; }; -template +template +void LaunchBroadcastForwardKernel(cudaStream_t stream, const T* x, T* y, T* softmax_y, + const MASK* mask, const bool* dropout_mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const float dropout_scale, + const int64_t* input_dims, const int64_t* mask_dims) { + DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::BroadcastScaleMaskLoad load( + x, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchElementwiseForwardKernel(cudaStream_t stream, const T* x, T* y, T* softmax_y, + const MASK* mask, const bool* dropout_mask, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const float dropout_scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); + DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchBroadcastBackwardKernel(cudaStream_t stream, const T* softmax_y, const T* dy, T* dx, + 
const MASK* mask, const bool* dropout_mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const float dropout_scale, + const int64_t* input_dims, const int64_t* mask_dims) { + DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); + NdIndexOffsetHelper input_index_helper(input_dims, num_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims, num_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); + fused_scale_mask_softmax::BroadcastScaleMaskStore store( + dx, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + stream, load_softmax_y, load_dy, store, rows, cols))); +} + +template +void LaunchElementwiseBackwardKernel(cudaStream_t stream, const T* softmax_y, const T* dy, T* dx, + const MASK* mask, const bool* dropout_mask, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const float dropout_scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); + DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); + fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + stream, load_softmax_y, load_dy, store, rows, cols))); +} + +constexpr int32_t kMaxNumDims = 5; + +template class FusedScaleMaskSoftmaxDropoutKernel final : public user_op::OpKernel { public: FusedScaleMaskSoftmaxDropoutKernel() = default; @@ -137,36 +166,55 @@ class FusedScaleMaskSoftmaxDropoutKernel final : public user_op::OpKernel { const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const float mask_fill_value = ctx->Attr("mask_fill_value"); + const float scale_value = ctx->Attr("scale_value"); + const float dropout_scale_value = ctx->Attr("dropout_scale_value"); user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const ShapeView& x_shape = x->shape(); + const ShapeView& x_shape = x->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); CHECK_GE(x_shape.NumAxes(), 2); + const int64_t elem_cnt = x_shape.elem_cnt(); const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const size_t num_input_dims = x_shape.NumAxes(); + const int64_t* input_dims = x_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); using ComputeType = typename cuda::softmax::DefaultComputeType::type; - ScaleMaskLoad load(x->dptr(), mask->dptr(), cols, - ctx->Attr("mask_fill_value"), - ctx->Attr("scale_value")); - DropoutStore store(y->mut_dptr(), softmax_y->mut_dptr(), - dropout_mask->dptr(), cols, - ctx->Attr("dropout_scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - ctx->stream()->As()->cuda_stream(), load, store, rows, cols))); + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + 
fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseForwardKernel( + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), + softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, + mask_fill_value, scale_value, dropout_scale_value); + } + +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastForwardKernel( \ + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ + softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, \ + cols, mask_fill_value, scale_value, dropout_scale_value, simplified_input_dims, \ + simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(half) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(float) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(double) -#undef REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL - -template +template class FusedScaleMaskSoftmaxDropoutGradKernel final : public user_op::OpKernel { public: FusedScaleMaskSoftmaxDropoutGradKernel() = default; @@ -180,33 +228,76 @@ class FusedScaleMaskSoftmaxDropoutGradKernel final : public user_op::OpKernel { const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& dy_shape = dy->shape(); + const float mask_fill_value = static_cast(0.0); + const float scale_value = ctx->Attr("scale_value"); + const float dropout_scale_value = ctx->Attr("dropout_scale_value"); + const ShapeView& dy_shape = dy->shape_view(); + const int64_t elem_cnt = dy_shape.elem_cnt(); + const ShapeView& mask_shape = mask->shape_view(); CHECK_GE(dy_shape.NumAxes(), 2); const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t* input_dims = dy_shape.ptr(); + const size_t num_input_dims = dy_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); - DropoutLoad load_dy(dy->dptr(), dropout_mask->dptr(), cols, - ctx->Attr("dropout_scale_value")); - ScaleMaskStore store(dx->mut_dptr(), mask->dptr(), cols, - static_cast(0.0), ctx->Attr("scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - ctx->stream()->As()->cuda_stream(), load_softmax_y, load_dy, store, rows, - cols))); + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 
1) { + LaunchElementwiseBackwardKernel( + ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), + dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, + mask_fill_value, scale_value, dropout_scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastBackwardKernel( \ + ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), \ + dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, cols, \ + static_cast(0.0), ctx->Attr("scale_value"), \ + ctx->Attr("dropout_scale_value"), simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); +} // namespace + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); -REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(half) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(float) -REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(double) -#undef REGISTER_FUCED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL } // namespace oneflow diff --git a/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.cu b/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.cu index 88ef6690cef..382bb2acf12 100644 --- a/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.cu +++ b/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.cu @@ -185,9 +185,9 @@ class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpK using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); - int64_t seq_len = h_tensor->shape().At(0); - int64_t batch_size = h_tensor->shape().At(1); - int64_t hidden_size = h_tensor->shape().At(2); + int64_t seq_len = h_tensor->shape_view().At(0); + int64_t batch_size = h_tensor->shape_view().At(1); + int64_t hidden_size = h_tensor->shape_view().At(2); int64_t head_size = ctx->Attr("head_size"); int64_t num_heads = hidden_size / (3 
* head_size); int64_t ld = batch_size * hidden_size; @@ -212,7 +212,7 @@ class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpK tmp_v_tensor->mut_dptr()); // v from (s, b, n, h) transpose to (b, n, s, h) Shape value_shape({seq_len, batch_size, num_heads, head_size}); - TransposeGpu(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape(), + TransposeGpu(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape_view(), {1, 2, 0, 3}, tmp_v_tensor->dptr(), v_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -234,19 +234,20 @@ class FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel final : public user_op: user_op::Tensor* h_grad_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states_grad", 0); float alpha = ctx->Attr("alpha"); - int64_t seq_len = h_grad_tensor->shape().At(0); - int64_t batch_size = h_grad_tensor->shape().At(1); - int64_t hidden_size = h_grad_tensor->shape().At(2); - int64_t num_heads = v_grad_tensor->shape().At(1); - int64_t head_size = v_grad_tensor->shape().At(3); + int64_t seq_len = h_grad_tensor->shape_view().At(0); + int64_t batch_size = h_grad_tensor->shape_view().At(1); + int64_t hidden_size = h_grad_tensor->shape_view().At(2); + int64_t num_heads = v_grad_tensor->shape_view().At(1); + int64_t head_size = v_grad_tensor->shape_view().At(3); int64_t ld = batch_size * hidden_size; int64_t stride = 3 * head_size; CHECK_EQ(hidden_size, num_heads * stride); // transpose from (b, n, s, h) to (s, b, n, h) Shape value_shape({seq_len, batch_size, num_heads, head_size}); - TransposeGpu(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape(), value_shape, - {2, 0, 1, 3}, v_grad_tensor->dptr(), tmp_v_tensor->mut_dptr()); + TransposeGpu(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape_view(), + value_shape, {2, 0, 1, 3}, v_grad_tensor->dptr(), + tmp_v_tensor->mut_dptr()); // slice v grad SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); SliceKernelUtil::Backward(ctx->stream(), params, tmp_v_tensor->dptr(), diff --git a/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.cu b/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.cu index 43678154627..3c26ea4be04 100644 --- a/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.cu +++ b/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.cu @@ -153,7 +153,7 @@ class FusedTrilScaleSoftmaxMaskScaleKernel final : public user_op::OpKernel { const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const ShapeView& x_shape = x->shape(); + const ShapeView& x_shape = x->shape_view(); CHECK_GE(x_shape.NumAxes(), 2); const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); @@ -195,7 +195,7 @@ class FusedTrilScaleSoftmaxMaskScaleGradKernel final : public user_op::OpKernel const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& dy_shape = dy->shape(); + const ShapeView& dy_shape = dy->shape_view(); CHECK_GE(dy_shape.NumAxes(), 2); const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); diff --git 
a/oneflow/user/kernels/gather_kernel.cpp b/oneflow/user/kernels/gather_kernel.cpp index 42a0a6dc976..c4150557a8f 100644 --- a/oneflow/user/kernels/gather_kernel.cpp +++ b/oneflow/user/kernels/gather_kernel.cpp @@ -24,7 +24,7 @@ namespace user_op { namespace { -Shape GetFlatShape(const ShapeView& shape, int64_t axis) { +Shape GetFlatShape(ShapeView shape, int64_t axis) { return Shape({shape.Count(0, axis), shape.At(axis), shape.Count(axis + 1)}); } @@ -72,9 +72,10 @@ class GatherKernel final : public user_op::OpKernel, public user_op::CudaGraphSu const Shape& hierarchy = *ctx->parallel_desc().hierarchy(); CheckNdSbp(hierarchy, axis, in_nd_sbp, ctx->NdSbp4ArgNameAndIndex("indices", 0), ctx->NdSbp4ArgNameAndIndex("out", 0)); - const TensorDesc* in_logical_desc = ctx->LogicalTensorDesc4ArgNameAndIndex("in", 0); - TensorSliceView view = GetTensorSliceView4ParallelId( - hierarchy, in_nd_sbp, in_logical_desc->shape(), ctx->parallel_ctx().parallel_id()); + const Shape in_logical_shape = + ExpandDimIf0D(ctx->LogicalTensorDesc4ArgNameAndIndex("in", 0)->shape()); + TensorSliceView view = GetTensorSliceView4ParallelId(hierarchy, in_nd_sbp, in_logical_shape, + ctx->parallel_ctx().parallel_id()); return std::make_shared(view.At(axis).begin(), view.At(axis).end()); } else { return nullptr; @@ -87,20 +88,22 @@ class GatherKernel final : public user_op::OpKernel, public user_op::CudaGraphSu const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); const int64_t axis = ctx->Attr("axis"); - const int64_t num_indices = indices->shape().elem_cnt(); + const int64_t num_indices = indices->shape_view().elem_cnt(); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - if (out->shape().elem_cnt() == 0) { return; } + if (out->shape_view().elem_cnt() == 0) { return; } + + const Shape in_shape = ExpandDimIf0D(in->shape_view()); int64_t offset = 0; if (cache != nullptr) { auto* gather_cache = dynamic_cast(cache); CHECK_NOTNULL(gather_cache); - CHECK_EQ(in->shape().At(axis), gather_cache->upper() - gather_cache->lower()); + CHECK_EQ(in_shape.At(axis), gather_cache->upper() - gather_cache->lower()); offset = gather_cache->lower(); } GatherKernelUtilImpl::Forward(ctx->stream(), indices->dptr(), num_indices, - in->dptr(), GetFlatShape(in->shape(), axis), + in->dptr(), GetFlatShape(in_shape, axis), out->mut_dptr(), offset); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/gather_kernel_util.cpp b/oneflow/user/kernels/gather_kernel_util.cpp index 88705ca4bff..9482e6d5547 100644 --- a/oneflow/user/kernels/gather_kernel_util.cpp +++ b/oneflow/user/kernels/gather_kernel_util.cpp @@ -29,9 +29,9 @@ Shape GetFlatShape(const ShapeView& shape, int64_t axis) { template void GatherForward(ep::Stream* stream, const Blob* indices, const Blob* in, int64_t axis, Blob* out, const int64_t offset) { - const Shape& flat_in_shape = GetFlatShape(in->shape(), axis); + const Shape& flat_in_shape = GetFlatShape(in->shape_view(), axis); GatherKernelUtilImpl::Forward(stream, indices->dptr(), - indices->shape().elem_cnt(), in->dptr(), + indices->shape_view().elem_cnt(), in->dptr(), flat_in_shape, out->mut_dptr(), offset); } diff --git a/oneflow/user/kernels/gelu_kernel.cpp b/oneflow/user/kernels/gelu_kernel.cpp index 61b1682f569..03a05db6fde 100644 --- a/oneflow/user/kernels/gelu_kernel.cpp +++ b/oneflow/user/kernels/gelu_kernel.cpp @@ -30,7 +30,7 @@ class CpuGeluGradKernel final : public 
user_op::OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); const T* x_ptr = x->dptr(); const T* dy_ptr = dy->dptr(); T* dx_ptr = dx->mut_dptr(); diff --git a/oneflow/user/kernels/gelu_kernel.cu b/oneflow/user/kernels/gelu_kernel.cu index e9cb7ff387c..0eb22198e2f 100644 --- a/oneflow/user/kernels/gelu_kernel.cu +++ b/oneflow/user/kernels/gelu_kernel.cu @@ -56,7 +56,7 @@ class GpuGeluGradKernel final : public user_op::OpKernel, public user_op::CudaGr const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t elem_cnt = x->shape().elem_cnt(); + const int64_t elem_cnt = x->shape_view().elem_cnt(); OF_CUDA_CHECK((cuda::elementwise::Binary(GeluGradFunctor(), elem_cnt, dx->mut_dptr(), x->dptr(), dy->dptr(), ctx->stream()->As()->cuda_stream()))); diff --git a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp index a804d9076dd..548916b266a 100644 --- a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp +++ b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cpp @@ -36,8 +36,8 @@ class GenerateRandomBatchPermutationIndicesCPUKernel final : public user_op::OpK const user_op::OpKernelCache*) const override { auto* random_generator = dynamic_cast*>(state); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - std::iota(y->mut_dptr(), y->mut_dptr() + y->shape().elem_cnt(), 0); - std::shuffle(y->mut_dptr(), y->mut_dptr() + y->shape().elem_cnt(), + std::iota(y->mut_dptr(), y->mut_dptr() + y->shape_view().elem_cnt(), 0); + std::shuffle(y->mut_dptr(), y->mut_dptr() + y->shape_view().elem_cnt(), *random_generator->Mutable()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu index baa2ae9586f..97ec84abf6d 100644 --- a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu +++ b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.cu @@ -96,9 +96,10 @@ class GenerateRandomBatchPermutationIndicesGPUKernel final : public user_op::OpK auto* random_generator = dynamic_cast>*>(state); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t batch_size = y->shape().At(0); + const int32_t batch_size = y->shape_view().At(0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(batch_size, static_cast(tmp_buffer->shape().elem_cnt()), + TmpBufferManager buf_manager(batch_size, + static_cast(tmp_buffer->shape_view().elem_cnt()), tmp_buffer->mut_dptr()); random_generator->Mutable()->Uniform(batch_size, buf_manager.RandomValuePtr()); InitializeIndices<< void GetBatch(size_t iter, user_op::Tensor* tokens) const { const size_t sample_len = seq_len_ + label_len_; - CHECK_EQ(tokens->shape().NumAxes(), 2); - CHECK_EQ(tokens->shape().At(0), batch_size_); - CHECK_EQ(tokens->shape().At(1), sample_len); + CHECK_EQ(tokens->shape_view().NumAxes(), 2); + CHECK_EQ(tokens->shape_view().At(0), batch_size_); + 
CHECK_EQ(tokens->shape_view().At(1), sample_len); T* dptr = tokens->mut_dptr(); for (size_t i = 0; i < batch_size_; ++i) { size_t sample_iter = iter * batch_size_ * num_shards_ + shard_index_ * batch_size_ + i; @@ -120,7 +120,7 @@ class GPTDataLoaderKernel final : public OpKernel { user_op::Tensor* iteration_tensor = ctx->Tensor4ArgNameAndIndex("iteration", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); if (iteration_tensor) { - CHECK_EQ(iteration_tensor->shape().elem_cnt(), 1); + CHECK_EQ(iteration_tensor->shape_view().elem_cnt(), 1); CHECK_EQ(iteration_tensor->data_type(), DataType::kInt64); int64_t* iter_ptr = iteration_tensor->mut_dptr(); loader->GetBatch(*iter_ptr, out_tensor); diff --git a/oneflow/user/kernels/grid_sample_kernel.cpp b/oneflow/user/kernels/grid_sample_kernel.cpp index cead14525ed..01a0a741844 100644 --- a/oneflow/user/kernels/grid_sample_kernel.cpp +++ b/oneflow/user/kernels/grid_sample_kernel.cpp @@ -39,9 +39,9 @@ class GridSampleKernel final : public user_op::OpKernel { GridSamplerPadding padding = StringToGridGridSamplerPadding(padding_mode); const bool align_corners = ctx->Attr("align_corners"); - const ShapeView& input_shape = input->shape(); - const ShapeView& grid_shape = grid->shape(); - const ShapeView& output_shape = output->shape(); + const ShapeView& input_shape = input->shape_view(); + const ShapeView& grid_shape = grid->shape_view(); + const ShapeView& output_shape = output->shape_view(); int64_t count = output_shape.elem_cnt() / input_shape.At(1); if (input_shape.NumAxes() == 4) { @@ -101,9 +101,9 @@ class GridSampleGradKernel final : public user_op::OpKernel { GridSamplerPadding padding = StringToGridGridSamplerPadding(padding_mode); const bool align_corners = ctx->Attr("align_corners"); - const ShapeView& input_shape = input->shape(); - const ShapeView& grid_shape = grid->shape(); - const ShapeView& output_shape = doutput->shape(); + const ShapeView& input_shape = input->shape_view(); + const ShapeView& grid_shape = grid->shape_view(); + const ShapeView& output_shape = doutput->shape_view(); int64_t count = output_shape.elem_cnt() / input_shape.At(1); Memset(ctx->stream(), dinput->mut_dptr(), 0, diff --git a/oneflow/user/kernels/grid_sample_kernel_util.cu b/oneflow/user/kernels/grid_sample_kernel_util.cu index 9d9a033e571..d6df7b1a6c1 100644 --- a/oneflow/user/kernels/grid_sample_kernel_util.cu +++ b/oneflow/user/kernels/grid_sample_kernel_util.cu @@ -47,7 +47,7 @@ struct CudnnGridSampleKernelUtil { || ctx->Attr("padding_mode") != "zeros" || !ctx->Attr("align_corners")) { return false; } - const ShapeView& input_shape = ctx->Tensor4ArgNameAndIndex("input", 0)->shape(); + const ShapeView& input_shape = ctx->Tensor4ArgNameAndIndex("input", 0)->shape_view(); if (input_shape.NumAxes() != 4 || input_shape.At(1) > 1024) { return false; } return true; @@ -57,8 +57,8 @@ struct CudnnGridSampleKernelUtil { const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0); const user_op::Tensor* grid = ctx->Tensor4ArgNameAndIndex("grid", 0); user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0); - const ShapeView& input_shape = input->shape(); - const ShapeView& output_shape = output->shape(); + const ShapeView& input_shape = input->shape_view(); + const ShapeView& output_shape = output->shape_view(); const DataType dtype = input->data_type(); CudnnTensorDesc input_desc(dtype, input_shape, "channels_first"); @@ -77,9 +77,9 @@ struct CudnnGridSampleKernelUtil { const user_op::Tensor* grid = 
ctx->Tensor4ArgNameAndIndex("grid", 0); user_op::Tensor* dinput = ctx->Tensor4ArgNameAndIndex("dinput", 0); user_op::Tensor* dgrid = ctx->Tensor4ArgNameAndIndex("dgrid", 0); - const ShapeView& input_shape = input->shape(); - const ShapeView& output_shape = doutput->shape(); - const ShapeView& dinput_shape = dinput->shape(); + const ShapeView& input_shape = input->shape_view(); + const ShapeView& output_shape = doutput->shape_view(); + const ShapeView& dinput_shape = dinput->shape_view(); const DataType dtype = input->data_type(); CudnnTensorDesc input_desc(dtype, input_shape, "channels_first"); diff --git a/oneflow/user/kernels/group_conv_kernel.cpp b/oneflow/user/kernels/group_conv_kernel.cpp index f697f7c3c74..aba8502168e 100644 --- a/oneflow/user/kernels/group_conv_kernel.cpp +++ b/oneflow/user/kernels/group_conv_kernel.cpp @@ -58,12 +58,12 @@ void Gemm4ChannelLast(enum CBLAS_TRANSPOSE trans_a, enum CBLAS_TRANSPOSE trans_b template T* GetImgMutDptr(user_op::Tensor* tensor, int64_t idx) { - return tensor->mut_dptr() + tensor->shape().Count(1) * idx; + return tensor->mut_dptr() + tensor->shape_view().Count(1) * idx; } template const T* GetImgDptr(const user_op::Tensor* tensor, int64_t idx) { - return tensor->dptr() + tensor->shape().Count(1) * idx; + return tensor->dptr() + tensor->shape_view().Count(1) * idx; } size_t CalcElemNumOfColBuf(const ShapeView& out_shape, const ShapeView& weight_shape, @@ -412,18 +412,18 @@ class ConvCpuKernel final : public user_op::OpKernel { T* col_buf_dptr = tmp_buffer->mut_dptr(); int32_t idx_offset = conv_cache->idx_offset_; - const int32_t input_group_interval = in->shape().At(1) / conv_cache->groups; - const int32_t weight_group_interval = weight->shape().At(0) / conv_cache->groups; - const int32_t output_group_interval = out->shape().At(1) / conv_cache->groups; - const int32_t input_step = input_group_interval * in->shape().Count(2); - const int32_t weight_step = weight_group_interval * weight->shape().Count(1); - const int32_t output_step = output_group_interval * out->shape().Count(2); + const int32_t input_group_interval = in->shape_view().At(1) / conv_cache->groups; + const int32_t weight_group_interval = weight->shape_view().At(0) / conv_cache->groups; + const int32_t output_group_interval = out->shape_view().At(1) / conv_cache->groups; + const int32_t input_step = input_group_interval * in->shape_view().Count(2); + const int32_t weight_step = weight_group_interval * weight->shape_view().Count(1); + const int32_t output_step = output_group_interval * out->shape_view().Count(2); const int32_t m = conv_cache->weight_5d_shape_.At(0) / conv_cache->groups; const int32_t n = conv_cache->out_5d_shape_.Count(idx_offset, idx_offset + 3); const int32_t k = conv_cache->weight_5d_shape_.Count(1); bool is_bias_mul_inited = false; - for (int64_t i = 0; i < in->shape().At(0); ++i) { + for (int64_t i = 0; i < in->shape_view().At(0); ++i) { const T* input_ptr = GetImgDptr(in, i); const T* weight_ptr = weight->dptr(); T* output_ptr = GetImgMutDptr(out, i); @@ -449,9 +449,10 @@ class ConvCpuKernel final : public user_op::OpKernel { const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); if (bias != nullptr) { - int64_t num_of_col_buf = CalcElemNumOfColBuf(out->shape(), weight->shape(), idx_offset); + int64_t num_of_col_buf = + CalcElemNumOfColBuf(out->shape_view(), weight->shape_view(), idx_offset); int64_t num_of_bias_mul = - (tmp_buffer->shape().elem_cnt() - num_of_col_buf * sizeof(T)) / sizeof(T); + (tmp_buffer->shape_view().elem_cnt() - 
num_of_col_buf * sizeof(T)) / sizeof(T); CHECK_GT(num_of_bias_mul, 0); T* bias_mul_dptr = col_buf_dptr + num_of_col_buf; if (!is_bias_mul_inited) { @@ -529,20 +530,20 @@ class ConvDataGradCpuKernel final : public user_op::OpKernel { user_op::Tensor* col_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int32_t idx_offset = conv_cache->idx_offset_; - const int32_t dy_group_interval = dy->shape().At(1) / conv_cache->groups; - const int32_t filter_group_interval = filter->shape().At(0) / conv_cache->groups; - const int32_t dx_group_interval = dx->shape().At(1) / conv_cache->groups; - const int32_t dx_step = dx_group_interval * dx->shape().Count(2); - const int32_t filter_step = filter_group_interval * filter->shape().Count(1); - const int32_t dy_step = dy_group_interval * dy->shape().Count(2); + const int32_t dy_group_interval = dy->shape_view().At(1) / conv_cache->groups; + const int32_t filter_group_interval = filter->shape_view().At(0) / conv_cache->groups; + const int32_t dx_group_interval = dx->shape_view().At(1) / conv_cache->groups; + const int32_t dx_step = dx_group_interval * dx->shape_view().Count(2); + const int32_t filter_step = filter_group_interval * filter->shape_view().Count(1); + const int32_t dy_step = dy_group_interval * dy->shape_view().Count(2); const int32_t m = conv_cache->weight_5d_shape_.Count(1); const int32_t n = conv_cache->out_5d_shape_.Count(idx_offset, idx_offset + 3); const int32_t k = conv_cache->weight_5d_shape_.At(0) / conv_cache->groups; Memset(ctx->stream(), dx->mut_dptr(), 0, - dx->shape().elem_cnt() * sizeof(T)); + dx->shape_view().elem_cnt() * sizeof(T)); - FOR_RANGE(int64_t, i, 0, dy->shape().At(0)) { + FOR_RANGE(int64_t, i, 0, dy->shape_view().At(0)) { const T* filter_ptr = filter->dptr(); const T* dy_ptr = GetImgDptr(dy, i); T* dx_ptr = GetImgMutDptr(dx, i); @@ -570,13 +571,13 @@ class ConvDataGradCpuKernel final : public user_op::OpKernel { if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), dx->data_type()); - CHECK_EQ(add_to_output->shape(), dx->shape()); + CHECK_EQ(add_to_output->shape_view(), dx->shape_view()); std::unique_ptr primitive = ep::primitive::NewPrimitive(DeviceType::kCPU, add_to_output->data_type()); CHECK(primitive); primitive->Launch(ctx->stream(), dx->dptr(), add_to_output->dptr(), dx->mut_dptr(), - add_to_output->shape().elem_cnt()); + add_to_output->shape_view().elem_cnt()); } } }; @@ -626,19 +627,20 @@ class ConvFilterGradCpuKernel final : public user_op::OpKernel { user_op::Tensor* filter_diff = ctx->Tensor4ArgNameAndIndex("filter_diff", 0); user_op::Tensor* col_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int32_t idx_offset = conv_cache->idx_offset_; - const int32_t dy_group_interval = dy->shape().At(1) / conv_cache->groups; - const int32_t filter_diff_group_interval = filter_diff->shape().At(0) / conv_cache->groups; - const int32_t x_group_interval = x->shape().At(1) / conv_cache->groups; - const int32_t x_step = x_group_interval * x->shape().Count(2); - const int32_t dy_step = dy_group_interval * dy->shape().Count(2); - const int32_t filter_diff_step = filter_diff_group_interval * filter_diff->shape().Count(1); + const int32_t dy_group_interval = dy->shape_view().At(1) / conv_cache->groups; + const int32_t filter_diff_group_interval = filter_diff->shape_view().At(0) / conv_cache->groups; + const int32_t x_group_interval = x->shape_view().At(1) / conv_cache->groups; + const int32_t x_step = 
x_group_interval * x->shape_view().Count(2); + const int32_t dy_step = dy_group_interval * dy->shape_view().Count(2); + const int32_t filter_diff_step = + filter_diff_group_interval * filter_diff->shape_view().Count(1); const int32_t m = conv_cache->weight_5d_shape_.At(0) / conv_cache->groups; const int32_t n = conv_cache->weight_5d_shape_.Count(1); const int32_t k = conv_cache->out_5d_shape_.Count(idx_offset, idx_offset + 3); Memset(ctx->stream(), filter_diff->mut_dptr(), 0, - filter_diff->shape().elem_cnt() * sizeof(T)); - FOR_RANGE(int64_t, i, 0, dy->shape().At(0)) { + filter_diff->shape_view().elem_cnt() * sizeof(T)); + FOR_RANGE(int64_t, i, 0, dy->shape_view().At(0)) { const T* x_ptr = GetImgDptr(x, i); const T* dy_ptr = GetImgDptr(dy, i); T* filter_diff_ptr = filter_diff->mut_dptr(); diff --git a/oneflow/user/kernels/group_deconv_kernel.cpp b/oneflow/user/kernels/group_deconv_kernel.cpp index 483b9026688..c5467e0e070 100644 --- a/oneflow/user/kernels/group_deconv_kernel.cpp +++ b/oneflow/user/kernels/group_deconv_kernel.cpp @@ -47,12 +47,12 @@ void Gemm4ChannelLast(enum CBLAS_TRANSPOSE trans_a, enum CBLAS_TRANSPOSE trans_b template T* GetImgMutDptr(user_op::Tensor* tensor, int64_t idx) { - return tensor->mut_dptr() + tensor->shape().Count(1) * idx; + return tensor->mut_dptr() + tensor->shape_view().Count(1) * idx; } template const T* GetImgDptr(const user_op::Tensor* tensor, int64_t idx) { - return tensor->dptr() + tensor->shape().Count(1) * idx; + return tensor->dptr() + tensor->shape_view().Count(1) * idx; } size_t CalcElemNumOfColBuf(const ShapeView& out_shape, const ShapeView& weight_shape, @@ -361,19 +361,19 @@ class DeconvCpuKernel final : public user_op::OpKernel { user_op::Tensor* col_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int32_t idx_offset = deconv_cache->idx_offset_; - const int32_t input_group_interval = in->shape().At(1) / deconv_cache->groups; - const int32_t weight_group_interval = weight->shape().At(0) / deconv_cache->groups; - const int32_t output_group_interval = out->shape().At(1) / deconv_cache->groups; - const int32_t input_step = input_group_interval * in->shape().Count(2); - const int32_t weight_step = weight_group_interval * weight->shape().Count(1); - const int32_t output_step = output_group_interval * out->shape().Count(2); + const int32_t input_group_interval = in->shape_view().At(1) / deconv_cache->groups; + const int32_t weight_group_interval = weight->shape_view().At(0) / deconv_cache->groups; + const int32_t output_group_interval = out->shape_view().At(1) / deconv_cache->groups; + const int32_t input_step = input_group_interval * in->shape_view().Count(2); + const int32_t weight_step = weight_group_interval * weight->shape_view().Count(1); + const int32_t output_step = output_group_interval * out->shape_view().Count(2); const int32_t m = deconv_cache->weight_5d_shape_.Count(1); const int32_t n = deconv_cache->out_5d_shape_.Count(idx_offset, idx_offset + 3); const int32_t k = deconv_cache->weight_5d_shape_.At(0) / deconv_cache->groups; Memset(ctx->stream(), out->mut_dptr(), 0, - out->shape().elem_cnt() * sizeof(T)); - FOR_RANGE(int64_t, i, 0, in->shape().At(0)) { + out->shape_view().elem_cnt() * sizeof(T)); + FOR_RANGE(int64_t, i, 0, in->shape_view().At(0)) { const T* input_ptr = GetImgDptr(in, i); const T* weight_ptr = weight->dptr(); T* output_ptr = GetImgMutDptr(out, i); diff --git a/oneflow/user/kernels/heap_selection_top_k_kernel.cu b/oneflow/user/kernels/heap_selection_top_k_kernel.cu index aa4c32c7829..712c5950b96 100644 --- 
a/oneflow/user/kernels/heap_selection_top_k_kernel.cu +++ b/oneflow/user/kernels/heap_selection_top_k_kernel.cu @@ -193,11 +193,11 @@ class GpuHeapSelectionTopKKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape().elem_cnt() == 0) { return; } + if (in->shape_view().elem_cnt() == 0) { return; } user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t instance_size = in->shape().At(in->shape().NumAxes() - 1); - const int64_t instance_num = in->shape().elem_cnt() / instance_size; + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); // Use as many heaps as possible (# of heaps == # of threads used in thread block). diff --git a/oneflow/user/kernels/identity_kernel.cpp b/oneflow/user/kernels/identity_kernel.cpp index 3d432cfae63..8bf4492357d 100644 --- a/oneflow/user/kernels/identity_kernel.cpp +++ b/oneflow/user/kernels/identity_kernel.cpp @@ -31,8 +31,8 @@ class IdentityKernel final : public user_op::OpKernel, public user_op::CudaGraph void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); - CHECK_EQ(out->shape(), in_shape); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); const DataType in_data_type = in->data_type(); CHECK_EQ(out->data_type(), in_data_type); Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), diff --git a/oneflow/user/kernels/image_batch_align_kernel.cpp b/oneflow/user/kernels/image_batch_align_kernel.cpp index a2ee0ee41d1..880bd2ee9fe 100644 --- a/oneflow/user/kernels/image_batch_align_kernel.cpp +++ b/oneflow/user/kernels/image_batch_align_kernel.cpp @@ -25,10 +25,10 @@ namespace { template void CopyFromTensorBuffer(T* image_ptr, const TensorBuffer& image_buffer, const int batch_height, const int batch_width, const int channels) { - CHECK_EQ(image_buffer.shape().NumAxes(), 3); - const int h = image_buffer.shape().At(0); - const int w = image_buffer.shape().At(1); - const int c = image_buffer.shape().At(2); + CHECK_EQ(image_buffer.shape_view().NumAxes(), 3); + const int h = image_buffer.shape_view().At(0); + const int w = image_buffer.shape_view().At(1); + const int c = image_buffer.shape_view().At(2); CHECK_LE(h, batch_height); CHECK_LE(w, batch_width); CHECK_EQ(c, channels); @@ -59,33 +59,33 @@ class ImageBatchAlignKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in_tensor->shape().NumAxes(), 1); - CHECK_EQ(out_tensor->shape().NumAxes(), 4); - const int64_t num_images = in_tensor->shape().elem_cnt(); + CHECK_EQ(in_tensor->shape_view().NumAxes(), 1); + CHECK_EQ(out_tensor->shape_view().NumAxes(), 4); + const int64_t num_images = in_tensor->shape_view().elem_cnt(); const bool dynamic_out = ctx->Attr("dynamic_out"); CHECK_GT(num_images, 0); int64_t max_height = 0; int64_t max_width = 0; - const int64_t channels = out_tensor->shape().At(3); + const int64_t channels = 
out_tensor->shape_view().At(3); FOR_RANGE(int, i, 0, num_images) { const TensorBuffer& image_buffer = in_tensor->dptr()[i]; - max_height = std::max(max_height, image_buffer.shape().At(0)); - max_width = std::max(max_width, image_buffer.shape().At(1)); - CHECK_EQ(image_buffer.shape().At(2), channels); + max_height = std::max(max_height, image_buffer.shape_view().At(0)); + max_width = std::max(max_width, image_buffer.shape_view().At(1)); + CHECK_EQ(image_buffer.shape_view().At(2), channels); } int32_t alignment = ctx->Attr("alignment"); max_height = RoundUp(max_height, alignment); max_width = RoundUp(max_width, alignment); if (dynamic_out) { - auto mut_shape_view = out_tensor->mut_shape(); + auto mut_shape_view = out_tensor->mut_shape_view(); mut_shape_view.Set(0, num_images); mut_shape_view.Set(1, max_height); mut_shape_view.Set(2, max_width); } memset(out_tensor->mut_dptr(), 0, - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type())); + out_tensor->shape_view().elem_cnt() * GetSizeOfDataType(out_tensor->data_type())); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& image_buffer = in_tensor->dptr()[i]; T* out_ptr = out_tensor->mut_dptr() + i * max_height * max_width * channels; diff --git a/oneflow/user/kernels/image_decode_kernel.cpp b/oneflow/user/kernels/image_decode_kernel.cpp index 1a51d04b7f2..54b87cc9a5e 100644 --- a/oneflow/user/kernels/image_decode_kernel.cpp +++ b/oneflow/user/kernels/image_decode_kernel.cpp @@ -69,15 +69,15 @@ class ImageDecodeKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in_tensor->shape().elem_cnt(), out_tensor->shape().elem_cnt()); - CHECK_GT(in_tensor->shape().elem_cnt(), 0); + CHECK_EQ(in_tensor->shape_view().elem_cnt(), out_tensor->shape_view().elem_cnt()); + CHECK_GT(in_tensor->shape_view().elem_cnt(), 0); const TensorBuffer* in_img_buf = in_tensor->dptr(); TensorBuffer* out_img_buf = out_tensor->mut_dptr(); const std::string& color_space = ctx->Attr("color_space"); const DataType data_type = ctx->Attr("data_type"); - MultiThreadLoop(in_tensor->shape().elem_cnt(), [&](size_t i) { + MultiThreadLoop(in_tensor->shape_view().elem_cnt(), [&](size_t i) { DecodeImage(in_img_buf[i], out_img_buf + i, color_space, data_type); }); } diff --git a/oneflow/user/kernels/image_object_preprocess_kernels.cpp b/oneflow/user/kernels/image_object_preprocess_kernels.cpp index 7ee3f504e0a..0e2b98fa1ef 100644 --- a/oneflow/user/kernels/image_object_preprocess_kernels.cpp +++ b/oneflow/user/kernels/image_object_preprocess_kernels.cpp @@ -55,7 +55,7 @@ void FlipImage(TensorBuffer* image_buffer, FlipCode flip_code) { template void FlipBoxes(TensorBuffer* boxes_buffer, int32_t image_width, int32_t image_height, FlipCode flip_code) { - int num_boxes = boxes_buffer->shape().At(0); + int num_boxes = boxes_buffer->shape_view().At(0); FOR_RANGE(int, i, 0, num_boxes) { T* cur_box_ptr = boxes_buffer->mut_data() + i * 4; if (flip_code & FlipCode::kHorizontalFlip) { @@ -81,7 +81,7 @@ DEFINE_STATIC_SWITCH_FUNC(void, FlipBoxes, MAKE_FLIP_BOXES_SWITCH_ENTRY, template void ScaleBoxes(TensorBuffer* boxes_buffer, T scale_w, T scale_h) { - int num_boxes = boxes_buffer->shape().At(0); + int num_boxes = boxes_buffer->shape_view().At(0); FOR_RANGE(int, i, 0, num_boxes) { T* cur_box_ptr = boxes_buffer->mut_data() + i * 4; cur_box_ptr[0] *= scale_w; @@ 
-100,7 +100,7 @@ DEFINE_STATIC_SWITCH_FUNC(void, ScaleBoxes, MAKE_SCALE_BOXES_SWITCH_ENTRY, template void FlipPolygons(TensorBuffer* polygons_buffer, int32_t image_width, int32_t image_height, FlipCode flip_code) { - int num_points = polygons_buffer->shape().At(0); + int num_points = polygons_buffer->shape_view().At(0); FOR_RANGE(int, i, 0, num_points) { T* cur_poly_ptr = polygons_buffer->mut_data() + i * 2; if (flip_code & FlipCode::kHorizontalFlip) { cur_poly_ptr[0] = image_width - cur_poly_ptr[0]; } @@ -116,7 +116,7 @@ DEFINE_STATIC_SWITCH_FUNC(void, FlipPolygons, MAKE_FLIP_POLYGONS_SWITCH_ENTRY, template void ScalePolygons(TensorBuffer* poly_buffer, T scale_w, T scale_h) { - int num_pts = poly_buffer->shape().At(0); + int num_pts = poly_buffer->shape_view().At(0); FOR_RANGE(int, i, 0, num_pts) { T* cur_pt = poly_buffer->mut_data() + i * 2; cur_pt[0] *= scale_w; @@ -133,10 +133,10 @@ DEFINE_STATIC_SWITCH_FUNC(void, ScalePolygons, MAKE_SCALE_POLYGONS_SWITCH_ENTRY, template void ImageNormalizeByChannel(TensorBuffer* image_buffer, const std::vector& std_vec, const std::vector& mean_vec) { - CHECK_EQ(image_buffer->shape().NumAxes(), 3); - int h = image_buffer->shape().At(0); - int w = image_buffer->shape().At(1); - int c = image_buffer->shape().At(2); + CHECK_EQ(image_buffer->shape_view().NumAxes(), 3); + int h = image_buffer->shape_view().At(0); + int w = image_buffer->shape_view().At(1); + int c = image_buffer->shape_view().At(2); CHECK_EQ(std_vec.size(), c); CHECK_EQ(mean_vec.size(), c); FOR_RANGE(int, i, 0, (h * w)) { @@ -154,12 +154,12 @@ DEFINE_STATIC_SWITCH_FUNC(void, ImageNormalizeByChannel, MAKE_IMAGE_NORMALIZE_SW template void PolygonsToMask(const TensorBuffer& polys, const TensorBuffer& polys_nd_index, TensorBuffer* masks, int32_t im_w, int32_t im_h) { - CHECK_EQ(polys.shape().NumAxes(), 2); - CHECK_EQ(polys.shape().At(1), 2); - CHECK_EQ(polys_nd_index.shape().NumAxes(), 2); - CHECK_EQ(polys_nd_index.shape().At(1), 3); - int num_points = polys.shape().At(0); - CHECK_EQ(polys_nd_index.shape().At(0), num_points); + CHECK_EQ(polys.shape_view().NumAxes(), 2); + CHECK_EQ(polys.shape_view().At(1), 2); + CHECK_EQ(polys_nd_index.shape_view().NumAxes(), 2); + CHECK_EQ(polys_nd_index.shape_view().At(1), 3); + int num_points = polys.shape_view().At(0); + CHECK_EQ(polys_nd_index.shape_view().At(0), num_points); std::vector> poly_point_vec; std::vector mask_mat_vec; @@ -225,12 +225,12 @@ class ImageFlipKernel final : public user_op::OpKernel { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); const user_op::Tensor* flip_code_tensor = ctx->Tensor4ArgNameAndIndex("flip_code", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = in_tensor->shape().elem_cnt(); - CHECK_EQ(out_tensor->shape().elem_cnt(), num_images); + int num_images = in_tensor->shape_view().elem_cnt(); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), num_images); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& in_buffer = in_tensor->dptr()[i]; - CHECK_EQ(in_buffer.shape().NumAxes(), 3); + CHECK_EQ(in_buffer.shape_view().NumAxes(), 3); TensorBuffer* out_buffer = out_tensor->mut_dptr() + i; out_buffer->CopyFrom(in_buffer); FlipCode flip_code = static_cast(flip_code_tensor->dptr()[i]); @@ -252,16 +252,16 @@ class ObjectBboxFlipKernel final : public user_op::OpKernel { const user_op::Tensor* flip_code_tensor = ctx->Tensor4ArgNameAndIndex("flip_code", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = 
bbox_tensor->shape().elem_cnt(); + int num_images = bbox_tensor->shape_view().elem_cnt(); CHECK_GT(num_images, 0); - CHECK_EQ(out_tensor->shape().elem_cnt(), num_images); - CHECK_EQ(image_size_tensor->shape().At(0), num_images); - CHECK_EQ(flip_code_tensor->shape().elem_cnt(), num_images); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), num_images); + CHECK_EQ(image_size_tensor->shape_view().At(0), num_images); + CHECK_EQ(flip_code_tensor->shape_view().elem_cnt(), num_images); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& bbox_buffer = bbox_tensor->dptr()[i]; - CHECK_EQ(bbox_buffer.shape().NumAxes(), 2); - CHECK_EQ(bbox_buffer.shape().At(1), 4); + CHECK_EQ(bbox_buffer.shape_view().NumAxes(), 2); + CHECK_EQ(bbox_buffer.shape_view().At(1), 4); TensorBuffer* out_bbox_buffer = out_tensor->mut_dptr() + i; out_bbox_buffer->CopyFrom(bbox_buffer); int32_t image_width = image_size_tensor->dptr()[i * 2 + 0]; @@ -285,15 +285,15 @@ class ObjectBboxScaleKernel final : public user_op::OpKernel { const user_op::Tensor* scale_tensor = ctx->Tensor4ArgNameAndIndex("scale", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = bbox_tensor->shape().elem_cnt(); + int num_images = bbox_tensor->shape_view().elem_cnt(); CHECK_GT(num_images, 0); - CHECK_EQ(scale_tensor->shape().At(0), num_images); - CHECK_EQ(out_tensor->shape().elem_cnt(), num_images); + CHECK_EQ(scale_tensor->shape_view().At(0), num_images); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), num_images); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& bbox_buffer = bbox_tensor->dptr()[i]; - CHECK_EQ(bbox_buffer.shape().NumAxes(), 2); - CHECK_EQ(bbox_buffer.shape().At(1), 4); + CHECK_EQ(bbox_buffer.shape_view().NumAxes(), 2); + CHECK_EQ(bbox_buffer.shape_view().At(1), 4); TensorBuffer* out_bbox_buffer = out_tensor->mut_dptr() + i; out_bbox_buffer->CopyFrom(bbox_buffer); float scale_w = scale_tensor->dptr()[i * 2 + 0]; @@ -316,16 +316,16 @@ class ObjectSegmentationPolygonFlipKernel final : public user_op::OpKernel { const user_op::Tensor* flip_code_tensor = ctx->Tensor4ArgNameAndIndex("flip_code", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = polygon_tensor->shape().elem_cnt(); + int num_images = polygon_tensor->shape_view().elem_cnt(); CHECK_GT(num_images, 0); - CHECK_EQ(out_tensor->shape().elem_cnt(), num_images); - CHECK_EQ(image_size_tensor->shape().At(0), num_images); - CHECK_EQ(flip_code_tensor->shape().elem_cnt(), num_images); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), num_images); + CHECK_EQ(image_size_tensor->shape_view().At(0), num_images); + CHECK_EQ(flip_code_tensor->shape_view().elem_cnt(), num_images); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& polygons_buffer = polygon_tensor->dptr()[i]; - CHECK_EQ(polygons_buffer.shape().NumAxes(), 2); - CHECK_EQ(polygons_buffer.shape().At(1), 2); + CHECK_EQ(polygons_buffer.shape_view().NumAxes(), 2); + CHECK_EQ(polygons_buffer.shape_view().At(1), 2); TensorBuffer* out_polygons_buffer = out_tensor->mut_dptr() + i; out_polygons_buffer->CopyFrom(polygons_buffer); int32_t image_width = image_size_tensor->dptr()[i * 2 + 0]; @@ -349,15 +349,15 @@ class ObjectSegmentationPolygonScaleKernel final : public user_op::OpKernel { const user_op::Tensor* scale_tensor = ctx->Tensor4ArgNameAndIndex("scale", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = poly_tensor->shape().elem_cnt(); + int num_images = 
poly_tensor->shape_view().elem_cnt(); CHECK_GT(num_images, 0); - CHECK_EQ(scale_tensor->shape().At(0), num_images); - CHECK_EQ(out_tensor->shape().elem_cnt(), num_images); + CHECK_EQ(scale_tensor->shape_view().At(0), num_images); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), num_images); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& poly_buffer = poly_tensor->dptr()[i]; - CHECK_EQ(poly_buffer.shape().NumAxes(), 2); - CHECK_EQ(poly_buffer.shape().At(1), 2); + CHECK_EQ(poly_buffer.shape_view().NumAxes(), 2); + CHECK_EQ(poly_buffer.shape_view().At(1), 2); TensorBuffer* out_poly_buffer = out_tensor->mut_dptr() + i; out_poly_buffer->CopyFrom(poly_buffer); float scale_w = scale_tensor->dptr()[i * 2 + 0]; @@ -378,14 +378,14 @@ class ImageNormalize final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = in_tensor->shape().elem_cnt(); - CHECK_EQ(out_tensor->shape().elem_cnt(), num_images); + int num_images = in_tensor->shape_view().elem_cnt(); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), num_images); const auto& std_vec = ctx->Attr>("std"); const auto& mean_vec = ctx->Attr>("mean"); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& in_buffer = in_tensor->dptr()[i]; - CHECK_EQ(in_buffer.shape().NumAxes(), 3); + CHECK_EQ(in_buffer.shape_view().NumAxes(), 3); TensorBuffer* out_buffer = out_tensor->mut_dptr() + i; out_buffer->CopyFrom(in_buffer); SwitchImageNormalizeByChannel(SwitchCase(out_buffer->data_type()), out_buffer, std_vec, @@ -407,11 +407,11 @@ class ObjectSegmentationPolygonToMask final : public user_op::OpKernel { const user_op::Tensor* image_size_tensor = ctx->Tensor4ArgNameAndIndex("image_size", 0); user_op::Tensor* mask_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - int num_images = poly_tensor->shape().elem_cnt(); + int num_images = poly_tensor->shape_view().elem_cnt(); CHECK_GT(num_images, 0); - CHECK_EQ(poly_index_tensor->shape().elem_cnt(), num_images); - CHECK_EQ(image_size_tensor->shape().At(0), num_images); - CHECK_EQ(mask_tensor->shape().elem_cnt(), num_images); + CHECK_EQ(poly_index_tensor->shape_view().elem_cnt(), num_images); + CHECK_EQ(image_size_tensor->shape_view().At(0), num_images); + CHECK_EQ(mask_tensor->shape_view().elem_cnt(), num_images); MultiThreadLoop(num_images, [&](size_t i) { const TensorBuffer& poly_buffer = poly_tensor->dptr()[i]; diff --git a/oneflow/user/kernels/image_preprocess_kernels.cpp b/oneflow/user/kernels/image_preprocess_kernels.cpp index 0e08ea29456..b544dde3b3a 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.cpp +++ b/oneflow/user/kernels/image_preprocess_kernels.cpp @@ -85,9 +85,9 @@ std::vector GetMirrorVec(user_op::KernelComputeContext* ctx) { std::vector mirror; user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* mirror_blob = ctx->Tensor4ArgNameAndIndex("mirror", 0); - int64_t record_num = in_blob->shape().At(0); + int64_t record_num = in_blob->shape_view().At(0); if (mirror_blob) { - CHECK_EQ(record_num, mirror_blob->shape().elem_cnt()); + CHECK_EQ(record_num, mirror_blob->shape_view().elem_cnt()); mirror.insert(mirror.end(), mirror_blob->dptr(), mirror_blob->dptr() + record_num); } else { @@ -140,7 +140,7 @@ class CropMirrorNormalizeFromStaticShapeToFloatKernel final : public user_op::Op user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); 
user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); std::vector mirror = GetMirrorVec(ctx); - int64_t record_num = in_blob->shape().At(0); + int64_t record_num = in_blob->shape_view().At(0); const std::string& color_space = ctx->Attr("color_space"); int64_t C = ImageUtil::IsColor(color_space) ? 3 : 1; float crop_pos_y = ctx->Attr("crop_pos_y"); @@ -149,13 +149,13 @@ class CropMirrorNormalizeFromStaticShapeToFloatKernel final : public user_op::Op float* out_dptr = out_blob->mut_dptr(); const uint8_t* in_dptr = in_blob->dptr(); - const ShapeView& in_shape = in_blob->shape(); + const ShapeView& in_shape = in_blob->shape_view(); int64_t N = in_shape.At(0); int64_t in_H = in_shape.At(1); int64_t in_W = in_shape.At(2); CHECK_EQ(C, in_shape.At(3)); int64_t in_image_elem_cnt = in_H * in_W * C; - const ShapeView& out_shape = out_blob->shape(); + const ShapeView& out_shape = out_blob->shape_view(); CHECK_EQ(out_shape.NumAxes(), 4); CHECK_EQ(out_shape.At(0), N); if (output_layout == "NCHW") { @@ -222,7 +222,7 @@ class CropMirrorNormalizeFromTensorBufferToFloatKernel final : public user_op::O user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); std::vector mirror = GetMirrorVec(ctx); - int64_t record_num = in_blob->shape().At(0); + int64_t record_num = in_blob->shape_view().At(0); const std::string& color_space = ctx->Attr("color_space"); int64_t C = ImageUtil::IsColor(color_space) ? 3 : 1; float crop_pos_y = ctx->Attr("crop_pos_y"); @@ -231,10 +231,10 @@ class CropMirrorNormalizeFromTensorBufferToFloatKernel final : public user_op::O float* out_dptr = out_blob->mut_dptr(); const TensorBuffer* in_buffers = in_blob->dptr(); - const ShapeView& in_shape = in_blob->shape(); + const ShapeView& in_shape = in_blob->shape_view(); int64_t N = in_shape.At(0); CHECK_EQ(in_shape.NumAxes(), 1); - const ShapeView& out_shape = out_blob->shape(); + const ShapeView& out_shape = out_blob->shape_view(); CHECK_EQ(out_shape.NumAxes(), 4); CHECK_EQ(out_shape.At(0), N); if (output_layout == "NCHW") { @@ -329,7 +329,7 @@ class CoinFlipKernel final : public user_op::OpKernel { auto* rand_bool_gen = dynamic_cast(state); user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); int8_t* dptr = out_blob->mut_dptr(); - for (int32_t i = 0; i < out_blob->shape().elem_cnt(); ++i) { + for (int32_t i = 0; i < out_blob->shape_view().elem_cnt(); ++i) { *(dptr + i) = rand_bool_gen->GetNextBool() ? 
1 : 0; } } @@ -364,7 +364,7 @@ void ImageRandomCropImpl(const TensorBuffer* in_buffer, TensorBuffer* out_buffer H = image.rows; CHECK(image.isContinuous()); - const int c = in_buffer->shape().At(2); + const int c = in_buffer->shape_view().At(2); CHECK_EQ(c, image.channels()); Shape image_shape({H, W, c}); out_buffer->Resize(image_shape, in_buffer->data_type()); @@ -389,10 +389,10 @@ class ImageRandomCropKernel final : public user_op::OpKernel { auto* crop_window_generators = dynamic_cast(state); CHECK_NOTNULL(crop_window_generators); user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t record_num = out_blob->shape().elem_cnt(); + int64_t record_num = out_blob->shape_view().elem_cnt(); CHECK(record_num > 0); user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - CHECK_EQ(out_blob->shape(), in_blob->shape()); + CHECK_EQ(out_blob->shape_view(), in_blob->shape_view()); const TensorBuffer* in_buffers = in_blob->dptr(); TensorBuffer* out_buffers = out_blob->mut_dptr(); MultiThreadLoop(record_num, [&](size_t i) { diff --git a/oneflow/user/kernels/image_preprocess_kernels.cu b/oneflow/user/kernels/image_preprocess_kernels.cu index 2b2e287e69c..3242967fa66 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.cu +++ b/oneflow/user/kernels/image_preprocess_kernels.cu @@ -151,8 +151,8 @@ class CropMirrorNormalizeGpuKernel final : public user_op::OpKernel { const std::string& output_layout = ctx->Attr("output_layout"); float* out_dptr = out_blob->mut_dptr(); const uint8_t* in_dptr = in_blob->dptr(); - const ShapeView& in_shape = in_blob->shape(); - const ShapeView& out_shape = out_blob->shape(); + const ShapeView& in_shape = in_blob->shape_view(); + const ShapeView& out_shape = out_blob->shape_view(); CHECK_EQ(in_shape.NumAxes(), 4); CHECK_EQ(out_shape.NumAxes(), 4); int32_t elem_cnt = out_shape.elem_cnt(); diff --git a/oneflow/user/kernels/image_resize_kernels.cpp b/oneflow/user/kernels/image_resize_kernels.cpp index ea4ff93fbac..f79eb065b45 100644 --- a/oneflow/user/kernels/image_resize_kernels.cpp +++ b/oneflow/user/kernels/image_resize_kernels.cpp @@ -30,9 +30,9 @@ std::pair GetTargetResizedSize4ImageBuffer(const TensorBuffer& image_buffe CHECK_GT(target_size, 0); if (min_size > 0) { CHECK_GE(target_size, min_size); } if (max_size > 0) { CHECK_LE(target_size, max_size); } - CHECK_EQ(image_buffer.shape().NumAxes(), 3); - const T origin_height = image_buffer.shape().At(0); - const T origin_width = image_buffer.shape().At(1); + CHECK_EQ(image_buffer.shape_view().NumAxes(), 3); + const T origin_height = image_buffer.shape_view().At(0); + const T origin_width = image_buffer.shape_view().At(1); // set round to banker's rounding int origin_round_way = std::fegetround(); @@ -122,28 +122,28 @@ class ImageResizeToFixedSizeKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); CHECK_NOTNULL(in_tensor); - const int64_t batch_size = in_tensor->shape().elem_cnt(); + const int64_t batch_size = in_tensor->shape_view().elem_cnt(); CHECK_GT(batch_size, 0); user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(out_tensor->shape().NumAxes(), 4); - CHECK_EQ(out_tensor->shape().At(0), batch_size); - int64_t res_h = out_tensor->shape().At(1); - int64_t res_w = out_tensor->shape().At(2); - int64_t channels = out_tensor->shape().At(3); + CHECK_EQ(out_tensor->shape_view().NumAxes(), 4); + CHECK_EQ(out_tensor->shape_view().At(0), 
batch_size); + int64_t res_h = out_tensor->shape_view().At(1); + int64_t res_w = out_tensor->shape_view().At(2); + int64_t channels = out_tensor->shape_view().At(3); int64_t elem_cnt_per_img = res_h * res_w * channels; user_op::Tensor* scale_tensor = ctx->Tensor4ArgNameAndIndex("scale", 0); - CHECK_EQ(scale_tensor->shape().NumAxes(), 2); - CHECK_EQ(scale_tensor->shape().At(0), batch_size); - CHECK_EQ(scale_tensor->shape().At(1), 2); + CHECK_EQ(scale_tensor->shape_view().NumAxes(), 2); + CHECK_EQ(scale_tensor->shape_view().At(0), batch_size); + CHECK_EQ(scale_tensor->shape_view().At(1), 2); MultiThreadLoop(batch_size, [&](size_t i) { const TensorBuffer& in_buffer = in_tensor->dptr()[i]; - CHECK_EQ(in_buffer.shape().NumAxes(), 3); - const int64_t origin_height = in_buffer.shape().At(0); - const int64_t origin_width = in_buffer.shape().At(1); - CHECK_EQ(in_buffer.shape().At(2), channels); + CHECK_EQ(in_buffer.shape_view().NumAxes(), 3); + const int64_t origin_height = in_buffer.shape_view().At(0); + const int64_t origin_width = in_buffer.shape_view().At(1); + CHECK_EQ(in_buffer.shape_view().At(2), channels); DataType dtype = ctx->Attr("data_type"); int interp_flag = GetCvInterpolationFlag(ctx->Attr("interpolation_type"), origin_width, origin_height, res_w, res_h); @@ -195,7 +195,7 @@ class ImageResizeKeepAspectRatioKernel final : public user_op::OpKernel { TensorBuffer* scale_buf = scale_tensor->mut_dptr(); TensorBuffer* size_buf = size_tensor->mut_dptr(); - const int64_t num_images = in_tensor->shape().elem_cnt(); + const int64_t num_images = in_tensor->shape_view().elem_cnt(); const bool resize_longer = ctx->Attr("resize_longer"); const int32_t target_size = ctx->Attr("target_size"); const int32_t min_size = ctx->Attr("min_size"); @@ -205,10 +205,10 @@ class ImageResizeKeepAspectRatioKernel final : public user_op::OpKernel { MultiThreadLoop(num_images, [&](size_t i) { ImageTargetResize(in_img_buf[i], out_img_buf + i, resize_longer, target_size, min_size, max_size, interp_type); - const int64_t org_h = in_img_buf[i].shape().At(0); - const int64_t org_w = in_img_buf[i].shape().At(1); - const int64_t res_h = out_img_buf[i].shape().At(0); - const int64_t res_w = out_img_buf[i].shape().At(1); + const int64_t org_h = in_img_buf[i].shape_view().At(0); + const int64_t org_w = in_img_buf[i].shape_view().At(1); + const int64_t res_h = out_img_buf[i].shape_view().At(0); + const int64_t res_w = out_img_buf[i].shape_view().At(1); scale_buf[i].Resize(Shape({2}), DataType::kFloat); scale_buf[i].mut_data()[0] = static_cast(res_w) / static_cast(org_w); diff --git a/oneflow/user/kernels/image_target_resize_kernel.cpp b/oneflow/user/kernels/image_target_resize_kernel.cpp index 927fff8f5f7..7b032318565 100644 --- a/oneflow/user/kernels/image_target_resize_kernel.cpp +++ b/oneflow/user/kernels/image_target_resize_kernel.cpp @@ -26,9 +26,9 @@ namespace { template std::pair GetTargetResizedSize4ImageBuffer(const TensorBuffer& image_buffer, const T target_size, const T max_size) { - CHECK_EQ(image_buffer.shape().NumAxes(), 3); - const T origin_height = image_buffer.shape().At(0); - const T origin_width = image_buffer.shape().At(1); + CHECK_EQ(image_buffer.shape_view().NumAxes(), 3); + const T origin_height = image_buffer.shape_view().At(0); + const T origin_width = image_buffer.shape_view().At(1); // set round to banker's rounding int origin_round_way = std::fegetround(); @@ -57,7 +57,7 @@ std::pair GetTargetResizedSize4ImageBuffer(const TensorBuffer& image_buffe void ImageTargetResize(const TensorBuffer& 
image_buffer, TensorBuffer* resized_image_buffer, const int32_t target_size, const int32_t max_size) { - CHECK_EQ(image_buffer.shape().NumAxes(), 3); + CHECK_EQ(image_buffer.shape_view().NumAxes(), 3); CHECK_GT(target_size, 0); CHECK_GE(max_size, target_size); @@ -90,10 +90,10 @@ class ImageTargetResizeKernel final : public user_op::OpKernel { user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* size_tensor = ctx->Tensor4ArgNameAndIndex("size", 0); user_op::Tensor* scale_tensor = ctx->Tensor4ArgNameAndIndex("scale", 0); - CHECK_GT(in_tensor->shape().elem_cnt(), 0); - CHECK_EQ(in_tensor->shape().elem_cnt(), out_tensor->shape().elem_cnt()); - CHECK_EQ(in_tensor->shape().elem_cnt(), size_tensor->shape().At(0)); - CHECK_EQ(in_tensor->shape().elem_cnt(), scale_tensor->shape().At(0)); + CHECK_GT(in_tensor->shape_view().elem_cnt(), 0); + CHECK_EQ(in_tensor->shape_view().elem_cnt(), out_tensor->shape_view().elem_cnt()); + CHECK_EQ(in_tensor->shape_view().elem_cnt(), size_tensor->shape_view().At(0)); + CHECK_EQ(in_tensor->shape_view().elem_cnt(), scale_tensor->shape_view().At(0)); const TensorBuffer* in_img_buf = in_tensor->dptr(); TensorBuffer* out_img_buf = out_tensor->mut_dptr(); @@ -102,17 +102,17 @@ class ImageTargetResizeKernel final : public user_op::OpKernel { const int32_t target_size = ctx->Attr("target_size"); const int32_t max_size = ctx->Attr("max_size"); - MultiThreadLoop(in_tensor->shape().elem_cnt(), [&](size_t i) { + MultiThreadLoop(in_tensor->shape_view().elem_cnt(), [&](size_t i) { ImageTargetResize(in_img_buf[i], out_img_buf + i, target_size, max_size); if (size_ptr != nullptr) { - size_ptr[i * 2 + 0] = out_img_buf[i].shape().At(0); - size_ptr[i * 2 + 1] = out_img_buf[i].shape().At(1); + size_ptr[i * 2 + 0] = out_img_buf[i].shape_view().At(0); + size_ptr[i * 2 + 1] = out_img_buf[i].shape_view().At(1); } if (scale_ptr != nullptr) { - scale_ptr[i * 2 + 0] = static_cast(out_img_buf[i].shape().At(0)) - / static_cast(in_img_buf[i].shape().At(0)); - scale_ptr[i * 2 + 1] = static_cast(out_img_buf[i].shape().At(1)) - / static_cast(in_img_buf[i].shape().At(1)); + scale_ptr[i * 2 + 0] = static_cast(out_img_buf[i].shape_view().At(0)) + / static_cast(in_img_buf[i].shape_view().At(0)); + scale_ptr[i * 2 + 1] = static_cast(out_img_buf[i].shape_view().At(1)) + / static_cast(in_img_buf[i].shape_view().At(1)); } }); } diff --git a/oneflow/user/kernels/in_top_k_kernel.cpp b/oneflow/user/kernels/in_top_k_kernel.cpp index df5a4943043..562c17f8b04 100644 --- a/oneflow/user/kernels/in_top_k_kernel.cpp +++ b/oneflow/user/kernels/in_top_k_kernel.cpp @@ -30,11 +30,11 @@ class InTopkKernel final : public user_op::OpKernel { const user_op::Tensor* predictions = ctx->Tensor4ArgNameAndIndex("predictions", 0); const int32_t k = ctx->Attr("k"); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(targets->shape().At(0), predictions->shape().At(0)); - CHECK_EQ(targets->shape().NumAxes(), 1); - CHECK_EQ(predictions->shape().NumAxes(), 2); - const int32_t instance_num = predictions->shape().At(0); - const int32_t classes_num = predictions->shape().At(1); + CHECK_EQ(targets->shape_view().At(0), predictions->shape_view().At(0)); + CHECK_EQ(targets->shape_view().NumAxes(), 1); + CHECK_EQ(predictions->shape_view().NumAxes(), 2); + const int32_t instance_num = predictions->shape_view().At(0); + const int32_t classes_num = predictions->shape_view().At(1); InTopkKernelUtil::InTopk(ctx->stream(), instance_num, classes_num, targets->dptr(), predictions->dptr(), k, 
out->mut_dptr()); diff --git a/oneflow/user/kernels/indexed_slices_reduce_sum_kernel.cpp b/oneflow/user/kernels/indexed_slices_reduce_sum_kernel.cpp index b9658b92df2..92e554c4007 100644 --- a/oneflow/user/kernels/indexed_slices_reduce_sum_kernel.cpp +++ b/oneflow/user/kernels/indexed_slices_reduce_sum_kernel.cpp @@ -35,9 +35,9 @@ class IndexedSlicesReduceSumKernel final : public user_op::OpKernel { user_op::Tensor* num_unique = ctx->Tensor4ArgNameAndIndex("num_unique", 0); user_op::Tensor* tmp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); void* tmp_ptr = tmp ? tmp->mut_dptr() : nullptr; - int64_t tmp_size = tmp ? tmp->shape().elem_cnt() * GetSizeOfDataType(tmp->data_type()) : 0; - const int64_t n = x_indices->shape().elem_cnt(); - const int64_t m = x_values->shape().elem_cnt() / n; + int64_t tmp_size = tmp ? tmp->shape_view().elem_cnt() * GetSizeOfDataType(tmp->data_type()) : 0; + const int64_t n = x_indices->shape_view().elem_cnt(); + const int64_t m = x_values->shape_view().elem_cnt() / n; IndexedSlicesReduceSumKernelUtil::ReduceSum( ctx->stream(), n, m, x_indices->dptr(), x_values->dptr(), num_unique->mut_dptr(), y_indices->mut_dptr(), y_values->mut_dptr(), tmp_ptr, diff --git a/oneflow/user/kernels/l1_l2_regularize_gradient_kernel.cpp b/oneflow/user/kernels/l1_l2_regularize_gradient_kernel.cpp index adf026e8eb6..203a0e6c98c 100644 --- a/oneflow/user/kernels/l1_l2_regularize_gradient_kernel.cpp +++ b/oneflow/user/kernels/l1_l2_regularize_gradient_kernel.cpp @@ -34,7 +34,7 @@ class L1L2RegularizeGradientKernel final : public user_op::OpKernel { const auto l1 = ctx->Attr("l1"); const auto l2 = ctx->Attr("l2"); L1L2RegularizeGradientKernelUtil::RegularizeGradient( - ctx->stream(), out->shape().elem_cnt(), model->dptr(), model_diff->dptr(), + ctx->stream(), out->shape_view().elem_cnt(), model->dptr(), model_diff->dptr(), out->mut_dptr(), l1, l2); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/l2_normalize_kernel.cpp b/oneflow/user/kernels/l2_normalize_kernel.cpp index 413572012c1..07779768d0c 100644 --- a/oneflow/user/kernels/l2_normalize_kernel.cpp +++ b/oneflow/user/kernels/l2_normalize_kernel.cpp @@ -78,11 +78,11 @@ class CpuL2NormalizeKernel final : public user_op::OpKernel { user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); const float epsilon = ctx->Attr("epsilon"); int32_t axis = ctx->Attr("axis"); - int32_t c = x->shape().At(axis); - int32_t n = x->shape().elem_cnt() / c; - int32_t d = x->shape().Count(axis + 1); + int32_t c = x->shape_view().At(axis); + int32_t n = x->shape_view().elem_cnt() / c; + int32_t d = x->shape_view().Count(axis + 1); - size_t square_x_sum_byte_size = square_x_sum->shape().elem_cnt() * sizeof(T); + size_t square_x_sum_byte_size = square_x_sum->shape_view().elem_cnt() * sizeof(T); Memset(ctx->stream(), square_x_sum->mut_dptr(), 0, square_x_sum_byte_size); L2NormalizeForward(n, c, d, static_cast(epsilon), x->dptr(), square_x_sum->mut_dptr(), y->mut_dptr()); @@ -112,9 +112,9 @@ class CpuL2NormalizeGradKernel final : public user_op::OpKernel { user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const float epsilon = ctx->Attr("epsilon"); int32_t axis = ctx->Attr("axis"); - int32_t c = dy->shape().At(axis); - int32_t n = dy->shape().elem_cnt() / c; - int32_t d = dy->shape().Count(axis + 1); + int32_t c = dy->shape_view().At(axis); + int32_t n = dy->shape_view().elem_cnt() / c; + int32_t d = dy->shape_view().Count(axis + 1); L2NormalizeBackward(n, c, d, 
static_cast(epsilon), y->dptr(), dy->dptr(), square_x_sum->dptr(), dx->mut_dptr()); } diff --git a/oneflow/user/kernels/l2_normalize_kernel.cu b/oneflow/user/kernels/l2_normalize_kernel.cu index 141a70e9899..33c0786faa8 100644 --- a/oneflow/user/kernels/l2_normalize_kernel.cu +++ b/oneflow/user/kernels/l2_normalize_kernel.cu @@ -97,9 +97,9 @@ class GpuL2NormalizeKernel final : public user_op::OpKernel { user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); const float epsilon = ctx->Attr("epsilon"); int32_t axis = ctx->Attr("axis"); - int32_t c = x->shape().At(axis); - int32_t n = x->shape().elem_cnt() / c; - int32_t d = x->shape().Count(axis + 1); + int32_t c = x->shape_view().At(axis); + int32_t n = x->shape_view().elem_cnt() / c; + int32_t d = x->shape_view().Count(axis + 1); RUN_CUDA_KERNEL((L2NormalizeForward), ctx->stream(), n, n, c, d, static_cast(epsilon), x->dptr(), square_x_sum->mut_dptr(), y->mut_dptr()); } @@ -129,9 +129,9 @@ class GpuL2NormalizeGradKernel final : public user_op::OpKernel { user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const float epsilon = ctx->Attr("epsilon"); int32_t axis = ctx->Attr("axis"); - int32_t c = dy->shape().At(axis); - int32_t n = dy->shape().elem_cnt() / c; - int32_t d = dy->shape().Count(axis + 1); + int32_t c = dy->shape_view().At(axis); + int32_t n = dy->shape_view().elem_cnt() / c; + int32_t d = dy->shape_view().Count(axis + 1); RUN_CUDA_KERNEL((L2NormalizeBackward), ctx->stream(), n, n, c, d, static_cast(epsilon), y->dptr(), dy->dptr(), square_x_sum->dptr(), dx->mut_dptr()); } diff --git a/oneflow/user/kernels/layer_norm_gpu_kernel.cu b/oneflow/user/kernels/layer_norm_gpu_kernel.cu index 208057c4e21..c2736f448a6 100644 --- a/oneflow/user/kernels/layer_norm_gpu_kernel.cu +++ b/oneflow/user/kernels/layer_norm_gpu_kernel.cu @@ -307,14 +307,14 @@ class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaG user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); const double epsilon = ctx->Attr("epsilon"); CHECK_GE(epsilon, CUDNN_BN_MIN_EPSILON); - const int64_t num_instances = mean->shape().elem_cnt(); - const int64_t norm_size = x->shape().elem_cnt() / num_instances; + const int64_t num_instances = mean->shape_view().elem_cnt(); + const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; const T* gamma_ptr = nullptr; const T* beta_ptr = nullptr; if (ctx->has_input("gamma", 0)) { const user_op::Tensor* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); gamma_ptr = gamma->dptr(); - CHECK_EQ(gamma->shape().elem_cnt(), norm_size); + CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size); } if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr(); } DispatchLayerNormForwardGpu(ctx->stream(), num_instances, norm_size, epsilon, x->dptr(), @@ -347,8 +347,8 @@ class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::C const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_instances = mean->shape().elem_cnt(); - const int64_t norm_size = x->shape().elem_cnt() / num_instances; + const int64_t num_instances = mean->shape_view().elem_cnt(); + const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; const T* gamma_ptr = nullptr; if (ctx->has_input("gamma", 0)) { gamma_ptr = ctx->Tensor4ArgNameAndIndex("gamma", 0)->dptr(); 
@@ -357,7 +357,7 @@ class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::C if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), dx->data_type()); - CHECK_EQ(add_to_output->shape(), dx->shape()); + CHECK_EQ(add_to_output->shape_view(), dx->shape_view()); add_to_output_ptr = add_to_output->dptr(); } LaunchLayerNormBackward(ctx->stream(), num_instances, norm_size, dy->dptr(), x->dptr(), @@ -398,8 +398,8 @@ class LayerNormParamGradGpuKernel final : public user_op::OpKernel, const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - const int64_t num_instances = mean->shape().elem_cnt(); - const int64_t norm_size = x->shape().elem_cnt() / num_instances; + const int64_t num_instances = mean->shape_view().elem_cnt(); + const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const DataType data_type = dy->data_type(); const int grid_dim_x = (norm_size + tile_size - 1) / tile_size; diff --git a/oneflow/user/kernels/log_softmax_kernel.cpp b/oneflow/user/kernels/log_softmax_kernel.cpp index 70b92ee6d82..5df0bc9443c 100644 --- a/oneflow/user/kernels/log_softmax_kernel.cpp +++ b/oneflow/user/kernels/log_softmax_kernel.cpp @@ -60,8 +60,8 @@ class LogSoftmaxKernel final : public user_op::OpKernel, public user_op::CudaGra void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); - const int64_t num_classes = in->shape().At(in->shape().NumAxes() - 1); - const int64_t num_instances = in->shape().Count(0, in->shape().NumAxes() - 1); + const int64_t num_classes = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t num_instances = in->shape_view().Count(0, in->shape_view().NumAxes() - 1); std::unique_ptr primitive = NewLogSoftmaxPrimitive(ctx); CHECK(primitive); primitive->Launch(ctx->stream(), num_instances, num_classes, in->dptr(), prob->mut_dptr()); @@ -82,8 +82,8 @@ class LogSoftmaxGradKernel final : public user_op::OpKernel, public user_op::Cud const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_classes = prob->shape().At(prob->shape().NumAxes() - 1); - const int64_t num_instances = prob->shape().elem_cnt() / num_classes; + const int64_t num_classes = prob->shape_view().At(prob->shape_view().NumAxes() - 1); + const int64_t num_instances = prob->shape_view().elem_cnt() / num_classes; std::unique_ptr primitive = NewLogSoftmaxBackwardPrimitive(ctx); diff --git a/oneflow/user/kernels/logical_not_kernel.cpp b/oneflow/user/kernels/logical_not_kernel.cpp index eb4a4384265..c73ee165784 100644 --- a/oneflow/user/kernels/logical_not_kernel.cpp +++ b/oneflow/user/kernels/logical_not_kernel.cpp @@ -39,7 +39,7 @@ class CpuLogicalNotKernel final : public user_op::OpKernel { user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const T* x = tensor_x->dptr(); K* y = tensor_y->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); if (n != 0) { LogicalNotFunctor()(ctx->stream(), n, x, y); } } diff --git 
a/oneflow/user/kernels/logical_not_kernel.cu b/oneflow/user/kernels/logical_not_kernel.cu index 1dfb210cb66..944074c18c6 100644 --- a/oneflow/user/kernels/logical_not_kernel.cu +++ b/oneflow/user/kernels/logical_not_kernel.cu @@ -41,7 +41,7 @@ class GpuLogicalNotKernel final : public user_op::OpKernel, public user_op::Cuda void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t elem_cnt = x->shape().elem_cnt(); + const int64_t elem_cnt = x->shape_view().elem_cnt(); OF_CUDA_CHECK( (cuda::elementwise::Unary(LogicalNotFunctor(), elem_cnt, y->mut_dptr(), x->dptr(), ctx->stream()->As()->cuda_stream()))); diff --git a/oneflow/user/kernels/loss_kernel_util.h b/oneflow/user/kernels/loss_kernel_util.h index 417e17dbeb8..144a17be810 100644 --- a/oneflow/user/kernels/loss_kernel_util.h +++ b/oneflow/user/kernels/loss_kernel_util.h @@ -38,7 +38,7 @@ class SimpleLossKernel : public user_op::OpKernel { const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* input = input_blob->dptr(); const T* target = target_blob->dptr(); @@ -64,7 +64,7 @@ class SimpleLossGradKernel : public user_op::OpKernel { const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t elem_cnt = input_blob->shape().elem_cnt(); + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); const T* dy = dy_blob->dptr(); const T* input = input_blob->dptr(); diff --git a/oneflow/user/kernels/masked_fill_kernel.cpp b/oneflow/user/kernels/masked_fill_kernel.cpp index 174ca7bee3b..fe01b9535ec 100644 --- a/oneflow/user/kernels/masked_fill_kernel.cpp +++ b/oneflow/user/kernels/masked_fill_kernel.cpp @@ -40,9 +40,9 @@ class MaskedFillKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED() << "The scalar in MaskedFill should be float or int."; } - WhereKernelUtil::WhereXScalar(ctx->stream(), out->shape().elem_cnt(), - mask->dptr(), scalar_operand, - x->dptr(), out->mut_dptr()); + WhereKernelUtil::WhereXScalar( + ctx->stream(), out->shape_view().elem_cnt(), mask->dptr(), scalar_operand, + x->dptr(), out->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/math_binary_broadcast_kernels.cpp b/oneflow/user/kernels/math_binary_broadcast_kernels.cpp index 5d71785320b..58e90079671 100644 --- a/oneflow/user/kernels/math_binary_broadcast_kernels.cpp +++ b/oneflow/user/kernels/math_binary_broadcast_kernels.cpp @@ -50,14 +50,14 @@ class MathBinaryBroadcastEpKernel final : public user_op::OpKernel, NewBroadcastElementwiseBinaryPrimitive(ctx); CHECK(primitive.get() != nullptr) << "Exceeds maximum supported dimensions"; - const int64_t x_elem_cnt = x->shape().elem_cnt(); - const int64_t y_elem_cnt = y->shape().elem_cnt(); - size_t num_src0_dims = x->shape().NumAxes(); - size_t num_src1_dims = y->shape().NumAxes(); + const int64_t x_elem_cnt = x->shape_view().elem_cnt(); + const int64_t y_elem_cnt = y->shape_view().elem_cnt(); + size_t num_src0_dims = x->shape_view().NumAxes(); + size_t num_src1_dims = y->shape_view().NumAxes(); int64_t zero_dim = 1; - int64_t* src0_dims = const_cast(x->shape().ptr()); - int64_t* src1_dims = const_cast(y->shape().ptr()); + int64_t* 
src0_dims = const_cast(x->shape_view().ptr()); + int64_t* src1_dims = const_cast(y->shape_view().ptr()); if (x_elem_cnt != 0 && y_elem_cnt != 0) { if (num_src0_dims == 0) { @@ -127,10 +127,10 @@ class MathBinaryBroadcastKernel final : public user_op::OpKernel, public user_op const T* dptr_x = tensor_x->dptr(); const T* dptr_y = tensor_y->dptr(); K* dptr_z = tensor_z->mut_dptr(); - size_t num_axes = tensor_z->shape().NumAxes(); - binary_func(ctx->stream(), XpuVarNdarray(tensor_z->shape(), dptr_z, num_axes), - XpuVarNdarray(tensor_x->shape(), dptr_x, num_axes), - XpuVarNdarray(tensor_y->shape(), dptr_y, num_axes)); + size_t num_axes = tensor_z->shape_view().NumAxes(); + binary_func(ctx->stream(), XpuVarNdarray(tensor_z->shape_view(), dptr_z, num_axes), + XpuVarNdarray(tensor_x->shape_view(), dptr_x, num_axes), + XpuVarNdarray(tensor_y->shape_view(), dptr_y, num_axes)); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/math_binary_elementwise_kernel.cpp b/oneflow/user/kernels/math_binary_elementwise_kernel.cpp index c4d8e0c51ea..c5927b73fcc 100644 --- a/oneflow/user/kernels/math_binary_elementwise_kernel.cpp +++ b/oneflow/user/kernels/math_binary_elementwise_kernel.cpp @@ -34,7 +34,7 @@ class MathBinaryElementwiseCpuKernel final : public user_op::OpKernel { const T* x = tensor_x->dptr(); const T* y = tensor_y->dptr(); T* z = tensor_z->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); ep::CpuStream* cpu_stream = ctx->stream()->As(); @@ -62,7 +62,7 @@ class MathBinaryElementwiseXGradCpuKernel final : public user_op::OpKernel { const T* y = tensor_y->dptr(); const T* dz = tensor_dz->dptr(); T* dx = tensor_dx->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); for (int32_t i = 0; i < n; ++i) { dx[i] = BinaryFunctor::BackwardXGrad(x[i], y[i], dz[i]); } } @@ -86,7 +86,7 @@ class MathBinaryElementwiseYGradCpuKernel final : public user_op::OpKernel { const T* y = tensor_y->dptr(); const T* dz = tensor_dz->dptr(); T* dy = tensor_dy->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); for (int32_t i = 0; i < n; ++i) { dy[i] = BinaryFunctor::BackwardYGrad(x[i], y[i], dz[i]); } } diff --git a/oneflow/user/kernels/math_binary_elementwise_kernel.cu b/oneflow/user/kernels/math_binary_elementwise_kernel.cu index d689efd42e6..1fe6ac262bf 100644 --- a/oneflow/user/kernels/math_binary_elementwise_kernel.cu +++ b/oneflow/user/kernels/math_binary_elementwise_kernel.cu @@ -52,7 +52,7 @@ class MathBinaryElementwiseGpuKernel final : public user_op::OpKernel { const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathBinaryElementwiseForwardGpu @@ -76,7 +76,7 @@ class MathBinaryElementwiseXGradGpuKernel final : public user_op::OpKernel { const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = 
tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathBinaryElementwiseBackwardXGradGpu @@ -101,7 +101,7 @@ class MathBinaryElementwiseYGradGpuKernel final : public user_op::OpKernel { const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathBinaryElementwiseBackwardYGradGpu @@ -155,7 +155,7 @@ class MathBinaryElementwiseGpuHalfKernel final : public user_op::OpKernel { const half* x = reinterpret_cast(tensor_x->dptr()); const half* y = reinterpret_cast(tensor_y->dptr()); half* z = reinterpret_cast(tensor_z->mut_dptr()); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathBinaryElementwiseForwardGpu @@ -183,7 +183,7 @@ class MathBinaryElementwiseXGradGpuHalfKernel final : public user_op::OpKernel { const half* y = reinterpret_cast(tensor_y->dptr()); const half* dz = reinterpret_cast(tensor_dz->dptr()); half* dx = reinterpret_cast(tensor_dx->mut_dptr()); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathBinaryElementwiseBackwardXGradGpu @@ -211,7 +211,7 @@ class MathBinaryElementwiseYGradGpuHalfKernel final : public user_op::OpKernel { const half* y = reinterpret_cast(tensor_y->dptr()); const half* dz = reinterpret_cast(tensor_dz->dptr()); half* dy = reinterpret_cast(tensor_dy->mut_dptr()); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathBinaryElementwiseBackwardYGradGpu diff --git a/oneflow/user/kernels/math_unary_elementwise_kernel.cpp b/oneflow/user/kernels/math_unary_elementwise_kernel.cpp index 32efa356760..40e5f6a004a 100644 --- a/oneflow/user/kernels/math_unary_elementwise_kernel.cpp +++ b/oneflow/user/kernels/math_unary_elementwise_kernel.cpp @@ -30,7 +30,7 @@ class MathUnaryElementwiseCpuKernel final : public user_op::OpKernel { user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const T* x = tensor_x->dptr(); T* y = tensor_y->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); for (int32_t i = 0; i < n; ++i) { y[i] = UnaryFunctor::Forward(x[i]); } } @@ -52,7 +52,7 @@ class MathUnaryElementwiseGradCpuKernel final : public user_op::OpKernel { const T* x = tensor_x->dptr(); const T* dy = tensor_dy->dptr(); T* dx = tensor_dx->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); for (int32_t i = 0; i < n; ++i) { dx[i] = UnaryFunctor::Backward(x[i], dy[i]); } } diff --git a/oneflow/user/kernels/math_unary_elementwise_kernel.cu b/oneflow/user/kernels/math_unary_elementwise_kernel.cu index 9a3ac4833b5..3f1b9251fdc 100644 --- a/oneflow/user/kernels/math_unary_elementwise_kernel.cu +++ b/oneflow/user/kernels/math_unary_elementwise_kernel.cu @@ -49,7 +49,7 @@ class MathUnaryElementwiseGpuKernel final : public user_op::OpKernel, user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const T* x = tensor_x->dptr(); T* y = tensor_y->mut_dptr(); - int64_t n = 
tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathUnaryElementwiseForwardGpu @@ -76,7 +76,7 @@ class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel, const T* x = tensor_x->dptr(); const T* dy = tensor_dy->dptr(); T* dx = tensor_dx->mut_dptr(); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathUnaryElementwiseBackwardGpu @@ -125,7 +125,7 @@ class MathUnaryElementwiseGpuHalfKernel final : public user_op::OpKernel, user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const half* x = reinterpret_cast(tensor_x->dptr()); half* y = reinterpret_cast(tensor_y->mut_dptr()); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathUnaryElementwiseForwardGpu @@ -152,7 +152,7 @@ class MathUnaryElementwiseGradGpuHalfKernel final : public user_op::OpKernel, const half* x = reinterpret_cast(tensor_x->dptr()); const half* dy = reinterpret_cast(tensor_dy->dptr()); half* dx = reinterpret_cast(tensor_dx->mut_dptr()); - int64_t n = tensor_x->shape().elem_cnt(); + int64_t n = tensor_x->shape_view().elem_cnt(); CHECK_LE(n, GetMaxVal() / 2); if (n == 0) { return; } MathUnaryElementwiseBackwardGpu diff --git a/oneflow/user/kernels/matmul_kernels.cpp b/oneflow/user/kernels/matmul_kernels.cpp index 63247a5e75a..e8584f58e62 100644 --- a/oneflow/user/kernels/matmul_kernels.cpp +++ b/oneflow/user/kernels/matmul_kernels.cpp @@ -126,26 +126,27 @@ class MatmulKernel final : public user_op::OpKernel, public user_op::CudaGraphSu const auto trans_a = GetBlasTransposeType(ctx, "transpose_a"); const auto trans_b = GetBlasTransposeType(ctx, "transpose_b"); const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("a", 0); - CHECK_EQ(a->shape().NumAxes(), 2); + CHECK_EQ(a->shape_view().NumAxes(), 2); const DataType data_type = a->data_type(); const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("b", 0); - CHECK_EQ(b->shape().NumAxes(), 2); + CHECK_EQ(b->shape_view().NumAxes(), 2); CHECK_EQ(b->data_type(), data_type); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(out->shape().NumAxes(), 2); + CHECK_EQ(out->shape_view().NumAxes(), 2); CHECK_EQ(out->data_type(), data_type); size_t m = 0, n = 0, k = 0; - InferMatmulMNK(a->shape(), b->shape(), out->shape(), trans_a, trans_b, &m, &n, &k); + InferMatmulMNK(a->shape_view(), b->shape_view(), out->shape_view(), trans_a, trans_b, &m, &n, + &k); const double alpha = ctx->Attr("alpha"); double beta = 0.0; if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), data_type); - CHECK_EQ(add_to_output->shape(), out->shape()); + CHECK_EQ(add_to_output->shape_view(), out->shape_view()); auto memcpy = NewMemcpyPrimitive(ctx); CHECK(memcpy); memcpy->Launch(ctx->stream(), out->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(data_type)); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(data_type)); beta = 1.0; } auto matmul = NewMatmulPrimitive(ctx); @@ -178,24 +179,25 @@ class BatchMatmulKernel final : public user_op::OpKernel, public user_op::CudaGr const auto trans_b = GetBlasTransposeType(ctx, "transpose_b"); const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("a", 0); const DataType data_type = 
a->data_type(); - const int64_t num_axes = a->shape().NumAxes(); + const int64_t num_axes = a->shape_view().NumAxes(); CHECK_GT(num_axes, 2); const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("b", 0); CHECK_EQ(b->data_type(), data_type); - CHECK_EQ(b->shape().NumAxes(), num_axes); + CHECK_EQ(b->shape_view().NumAxes(), num_axes); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(out->data_type(), data_type); - CHECK_EQ(out->shape().NumAxes(), num_axes); + CHECK_EQ(out->shape_view().NumAxes(), num_axes); size_t m = 0; size_t n = 0; size_t k = 0; - InferMatmulMNK(a->shape(), b->shape(), out->shape(), trans_a, trans_b, &m, &n, &k); + InferMatmulMNK(a->shape_view(), b->shape_view(), out->shape_view(), trans_a, trans_b, &m, &n, + &k); size_t batch_size = 1; for (size_t i = 0; i < num_axes - 2; ++i) { - const int64_t dim_size = a->shape().At(i); + const int64_t dim_size = a->shape_view().At(i); CHECK_GT(dim_size, 0); - CHECK_EQ(b->shape().At(i), dim_size); - CHECK_EQ(out->shape().At(i), dim_size); + CHECK_EQ(b->shape_view().At(i), dim_size); + CHECK_EQ(out->shape_view().At(i), dim_size); batch_size *= dim_size; } const double alpha = ctx->Attr("alpha"); @@ -203,11 +205,11 @@ class BatchMatmulKernel final : public user_op::OpKernel, public user_op::CudaGr if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), data_type); - CHECK_EQ(add_to_output->shape(), out->shape()); + CHECK_EQ(add_to_output->shape_view(), out->shape_view()); auto memcpy = NewMemcpyPrimitive(ctx); CHECK(memcpy); memcpy->Launch(ctx->stream(), out->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(data_type)); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(data_type)); beta = 1.0; } auto batch_matmul = NewBatchMatmulPrimitive(ctx); @@ -250,26 +252,26 @@ class BroadcastMatmulKernel final : public user_op::OpKernel, public user_op::Cu double beta = 0.0; if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->shape(), out->shape()); + CHECK_EQ(add_to_output->shape_view(), out->shape_view()); auto memcpy = NewMemcpyPrimitive(ctx); CHECK(memcpy); memcpy->Launch( ctx->stream(), out->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); beta = 1.0; } - CHECK_EQ(b->shape().NumAxes(), 2); - CHECK_GT(a->shape().NumAxes(), b->shape().NumAxes()); - int64_t m = a->shape().Count(0, a->shape().NumAxes() - 1); - int64_t k = a->shape().At(a->shape().NumAxes() - 1); + CHECK_EQ(b->shape_view().NumAxes(), 2); + CHECK_GT(a->shape_view().NumAxes(), b->shape_view().NumAxes()); + int64_t m = a->shape_view().Count(0, a->shape_view().NumAxes() - 1); + int64_t k = a->shape_view().At(a->shape_view().NumAxes() - 1); int64_t n = -1; if (!transpose_b) { - n = b->shape().At(1); - CHECK_EQ(k, b->shape().At(0)); + n = b->shape_view().At(1); + CHECK_EQ(k, b->shape_view().At(0)); } else { - n = b->shape().At(0); - CHECK_EQ(k, b->shape().At(1)); + n = b->shape_view().At(0); + CHECK_EQ(k, b->shape_view().At(1)); } auto matmul = NewMatmulPrimitive(ctx); CHECK(matmul); @@ -312,20 +314,20 @@ class BroadcastMatmulGradBKernel final : public user_op::OpKernel, double beta = 0.0; if (ctx->has_input("_add_to_output", 0)) { const 
user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->shape(), out->shape()); + CHECK_EQ(add_to_output->shape_view(), out->shape_view()); auto memcpy = NewMemcpyPrimitive(ctx); CHECK(memcpy); memcpy->Launch( ctx->stream(), out->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); beta = 1.0; } - CHECK_EQ(a->shape().NumAxes(), b->shape().NumAxes()); - int64_t k = a->shape().Count(0, a->shape().NumAxes() - 1); - CHECK_EQ(b->shape().Count(0, b->shape().NumAxes() - 1), k); - int64_t m = a->shape().At(a->shape().NumAxes() - 1); - int64_t n = b->shape().At(b->shape().NumAxes() - 1); + CHECK_EQ(a->shape_view().NumAxes(), b->shape_view().NumAxes()); + int64_t k = a->shape_view().Count(0, a->shape_view().NumAxes() - 1); + CHECK_EQ(b->shape_view().Count(0, b->shape_view().NumAxes() - 1), k); + int64_t m = a->shape_view().At(a->shape_view().NumAxes() - 1); + int64_t n = b->shape_view().At(b->shape_view().NumAxes() - 1); auto matmul = NewMatmulPrimitiveForBroadcastMatmulGradB(ctx); CHECK(matmul); diff --git a/oneflow/user/kernels/max_pool_kernel.cpp b/oneflow/user/kernels/max_pool_kernel.cpp index b507c9b124a..c6a85638c1c 100644 --- a/oneflow/user/kernels/max_pool_kernel.cpp +++ b/oneflow/user/kernels/max_pool_kernel.cpp @@ -205,14 +205,14 @@ class MaxPool1dKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const MaxPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape_view().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); int64_t* indice_ptr = indice->mut_dptr(); DimVector y_vector(2); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); + y_vector.at(0) = y->shape_view().At(0) * y->shape_view().At(1); + y_vector.at(1) = y->shape_view().At(2); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); PoolKernelUtil::Maxpool1dForward( @@ -247,14 +247,14 @@ class MaxPool1dGradKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const MaxPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = dy->shape().elem_cnt(); + const int64_t elem_num = dy->shape_view().elem_cnt(); const T* src = dy->dptr(); const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector(2); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + dy_vector.at(0) = dy->shape_view().At(0) * dy->shape_view().At(1); + dy_vector.at(1) = dy->shape_view().At(2); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); if (elem_num < GetMaxVal()) { @@ -291,7 +291,7 @@ class MaxPool2dKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const MaxPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape_view().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); @@ -300,9 +300,9 @@ class MaxPool2dKernel final : public user_op::OpKernel { const std::string& data_format = ctx->Attr("data_format"); if (data_format == 
"channels_first") { DimVector y_vector(3); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - y_vector.at(2) = y->shape().At(3); + y_vector.at(0) = y->shape_view().At(0) * y->shape_view().At(1); + y_vector.at(1) = y->shape_view().At(2); + y_vector.at(2) = y->shape_view().At(3); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); PoolKernelUtil::Maxpool2dForwardCFirst( @@ -314,7 +314,7 @@ class MaxPool2dKernel final : public user_op::OpKernel { } } else if (data_format == "channels_last") { DimVector y_vector; - y->shape().ToDimVector(&y_vector); + y->shape_view().ToDimVector(&y_vector); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); PoolKernelUtil::Maxpool2dForwardCLast( @@ -352,21 +352,21 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const MaxPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = dy->shape().elem_cnt(); + const int64_t elem_num = dy->shape_view().elem_cnt(); const T* src = dy->dptr(); const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); const std::string& data_format = ctx->Attr("data_format"); if (data_format == "channels_first") { DimVector dy_vector(3); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); - dy_vector.at(2) = dy->shape().At(3); + dy_vector.at(0) = dy->shape_view().At(0) * dy->shape_view().At(1); + dy_vector.at(1) = dy->shape_view().At(2); + dy_vector.at(2) = dy->shape_view().At(3); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); PoolKernelUtil::Maxpool2dBackwardCFirst( @@ -378,7 +378,7 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { } } else if (data_format == "channels_last") { DimVector dy_vector; - dy->shape().ToDimVector(&dy_vector); + dy->shape_view().ToDimVector(&dy_vector); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); PoolKernelUtil::Maxpool2dBackwardCLast( @@ -416,16 +416,16 @@ class MaxPool3dKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const MaxPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape_view().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); int64_t* indice_ptr = indice->mut_dptr(); DimVector y_vector(4); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - y_vector.at(2) = y->shape().At(3); - y_vector.at(3) = y->shape().At(4); + y_vector.at(0) = y->shape_view().At(0) * y->shape_view().At(1); + y_vector.at(1) = y->shape_view().At(2); + y_vector.at(2) = y->shape_view().At(3); + y_vector.at(3) = y->shape_view().At(4); if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); @@ -461,18 +461,18 @@ class MaxPool3dGradKernel final : public user_op::OpKernel { const auto* pool_cache = dynamic_cast(cache); const MaxPoolParams3D& params_3d = pool_cache->GetParams3D(); - const int64_t elem_num = dy->shape().elem_cnt(); + const int64_t elem_num = dy->shape_view().elem_cnt(); const T* src = dy->dptr(); const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector 
dy_vector(4); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); - dy_vector.at(2) = dy->shape().At(3); - dy_vector.at(3) = dy->shape().At(4); + dy_vector.at(0) = dy->shape_view().At(0) * dy->shape_view().At(1); + dy_vector.at(1) = dy->shape_view().At(2); + dy_vector.at(2) = dy->shape_view().At(3); + dy_vector.at(3) = dy->shape_view().At(4); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); if (elem_num < GetMaxVal()) { diff --git a/oneflow/user/kernels/median_kernel.cpp b/oneflow/user/kernels/median_kernel.cpp index e3ded5b3fd9..3a8238edc62 100644 --- a/oneflow/user/kernels/median_kernel.cpp +++ b/oneflow/user/kernels/median_kernel.cpp @@ -27,7 +27,7 @@ class CpuMedianKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - const int64_t size = in->shape().elem_cnt(); + const int64_t size = in->shape_view().elem_cnt(); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("output", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); T* out_ptr = out->mut_dptr(); diff --git a/oneflow/user/kernels/median_kernel.cu b/oneflow/user/kernels/median_kernel.cu index 90929a8776d..022af78d18d 100644 --- a/oneflow/user/kernels/median_kernel.cu +++ b/oneflow/user/kernels/median_kernel.cu @@ -32,12 +32,12 @@ class CudaMedianKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("output", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t instance_size = in->shape().elem_cnt(); + const int32_t instance_size = in->shape_view().elem_cnt(); const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(T)); SortKeysAscending( in->dptr(), 1, instance_size, reinterpret_cast(tmp_buffer->mut_dptr() + sort_tensor_buffer_bytes), - tmp_buffer->shape().elem_cnt() - sort_tensor_buffer_bytes, tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt() - sort_tensor_buffer_bytes, tmp_buffer->mut_dptr(), ctx->stream()->As()->cuda_stream()); Memcpy(ctx->stream(), out->mut_dptr(), tmp_buffer->mut_dptr() + (instance_size - 1) / 2, sizeof(T)); diff --git a/oneflow/user/kernels/median_with_indices_kernel.cpp b/oneflow/user/kernels/median_with_indices_kernel.cpp index a42cc8c2b9c..d61db192206 100644 --- a/oneflow/user/kernels/median_with_indices_kernel.cpp +++ b/oneflow/user/kernels/median_with_indices_kernel.cpp @@ -28,10 +28,10 @@ class CpuMedianWithIndicesKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - const int64_t num_axes = in->shape().NumAxes(); - const int64_t size = in->shape().elem_cnt(); + const int64_t num_axes = in->shape_view().NumAxes(); + const int64_t size = in->shape_view().elem_cnt(); if (size == 0) return; - const int64_t stride = in->shape().At(num_axes - 1); + const int64_t stride = in->shape_view().At(num_axes - 1); const int64_t instance_num = size / stride; user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); diff --git a/oneflow/user/kernels/median_with_indices_kernel.cu 
b/oneflow/user/kernels/median_with_indices_kernel.cu index d111726b426..405f0a1f5ba 100644 --- a/oneflow/user/kernels/median_with_indices_kernel.cu +++ b/oneflow/user/kernels/median_with_indices_kernel.cu @@ -105,15 +105,15 @@ class CudaMedianWithIndicesKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - if (in->shape().elem_cnt() == 0) return; + if (in->shape_view().elem_cnt() == 0) return; user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(tmp_buffer->shape().elem_cnt(), tmp_buffer->mut_dptr(), - in->shape()); + TmpBufferManager buf_manager(tmp_buffer->shape_view().elem_cnt(), + tmp_buffer->mut_dptr(), in->shape_view()); - const int64_t elem_cnt = in->shape().elem_cnt(); - const int64_t instance_size = in->shape().At(in->shape().NumAxes() - 1); + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); const int64_t instance_num = elem_cnt / instance_size; RUN_CUDA_KERNEL(InitializeIndices, ctx->stream(), elem_cnt, elem_cnt, buf_manager.InIndicesPtr(), instance_size); diff --git a/oneflow/user/kernels/min_max_observer_kernel.cpp b/oneflow/user/kernels/min_max_observer_kernel.cpp index 2cea3d89101..84bf9b50867 100644 --- a/oneflow/user/kernels/min_max_observer_kernel.cpp +++ b/oneflow/user/kernels/min_max_observer_kernel.cpp @@ -81,10 +81,10 @@ class CpuMinMaxObserverKernel final : public user_op::OpKernel { if (quantization_formula == "google") { // NOTE(Liang Depeng): per-layer quantization by default int64_t outer_num = 1; - int64_t inner_num = in->shape().elem_cnt(); + int64_t inner_num = in->shape_view().elem_cnt(); if (!per_layer_quantization) { // per-channel quantization - outer_num = in->shape().At(0); - inner_num = in->shape().Count(1); + outer_num = in->shape_view().At(0); + inner_num = in->shape_view().Count(1); } if (quantization_scheme == "symmetric") { @@ -106,7 +106,7 @@ class CpuMinMaxObserverKernel final : public user_op::OpKernel { if (!per_layer_quantization) { UNIMPLEMENTED() << " per-channel mode is not supported in cambricon scheme"; } - GenQuantScaleCambricon(in_ptr, quantization_bit, in->shape().elem_cnt(), scale_ptr, + GenQuantScaleCambricon(in_ptr, quantization_bit, in->shape_view().elem_cnt(), scale_ptr, zero_point_ptr); } else { UNIMPLEMENTED(); diff --git a/oneflow/user/kernels/min_max_observer_kernel.cu b/oneflow/user/kernels/min_max_observer_kernel.cu index fcd9a66e109..786f46d8942 100644 --- a/oneflow/user/kernels/min_max_observer_kernel.cu +++ b/oneflow/user/kernels/min_max_observer_kernel.cu @@ -194,8 +194,8 @@ class GpuMinMaxObserverKernel final : public user_op::OpKernel { const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); const std::string quantization_formula = ctx->Attr("quantization_formula"); - const int64_t elements = in->shape().elem_cnt(); - const int64_t channel = scale->shape().At(0); + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t channel = scale->shape_view().At(0); const int64_t panel_size = elements / channel; T* max_ptr = tmp_buffer->mut_dptr(); T* min_ptr = max_ptr + channel; diff --git a/oneflow/user/kernels/model_update_kernels.cpp 
b/oneflow/user/kernels/model_update_kernels.cpp index eaeb9d9fe75..94627f0f180 100644 --- a/oneflow/user/kernels/model_update_kernels.cpp +++ b/oneflow/user/kernels/model_update_kernels.cpp @@ -134,17 +134,17 @@ class SGDUpdateKernel final : public user_op::OpKernel, public user_op::CudaGrap if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } SGDUpdateKernelUtil::Update( - ctx->stream(), model->shape().elem_cnt(), static_cast(scale), l1, l2, weight_decay, + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, weight_decay, learning_rate_val, learning_rate_ptr, scale_by_ptr, skip_if_ptr, model_diff->dptr(), model->mut_dptr()); } @@ -200,8 +200,8 @@ class IndexedSlicesSGDUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* model_diff_values = ctx->Tensor4ArgNameAndIndex("model_diff_values", 0); user_op::Tensor* model = ctx->Tensor4ArgNameAndIndex("model", 0); const auto weight_decay = ctx->Attr("weight_decay"); - const int64_t num_indices = model_diff_indices->shape().elem_cnt(); - const int64_t num_values = model_diff_values->shape().elem_cnt(); + const int64_t num_indices = model_diff_indices->shape_view().elem_cnt(); + const int64_t num_values = model_diff_values->shape_view().elem_cnt(); if (num_indices == 0) { CHECK_EQ(num_values, 0); return; @@ -211,11 +211,11 @@ class IndexedSlicesSGDUpdateKernel final : public user_op::OpKernel { const int64_t feature_size = num_values / num_indices; auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(model->shape().At(0), kernel_cache->upper() - kernel_cache->lower()); + CHECK_EQ(model->shape_view().At(0), kernel_cache->upper() - kernel_cache->lower()); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_indices, num_values); - CHECK_GE(tmp_buffer->shape().elem_cnt(), buffer_manager.GetTotalBufferSize()); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.GetTotalBufferSize()); ReduceSumUtilT::ReduceSum( ctx->stream(), num_indices, feature_size, model_diff_indices->dptr(), model_diff_values->dptr(), buffer_manager.NumUniqueDiffIndicesPtr(), @@ -274,19 +274,19 @@ class MomentumUpdateKernel final : public user_op::OpKernel, public user_op::Cud if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } MomentumUpdateKernelUtil::Update( - ctx->stream(), 
model->shape().elem_cnt(), static_cast(scale), l1, l2, beta, weight_decay, - learning_rate_val, learning_rate_ptr, scale_by_ptr, skip_if_ptr, model_diff->dptr(), - model->mut_dptr(), momentum->mut_dptr()); + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, beta, + weight_decay, learning_rate_val, learning_rate_ptr, scale_by_ptr, skip_if_ptr, + model_diff->dptr(), model->mut_dptr(), momentum->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; @@ -330,8 +330,8 @@ class IndexedSlicesMomentumUpdateKernel final : public user_op::OpKernel { user_op::Tensor* momentum = ctx->Tensor4ArgNameAndIndex("momentum", 0); const auto beta = ctx->Attr("beta"); const auto weight_decay = ctx->Attr("weight_decay"); - const int64_t num_indices = model_diff_indices->shape().elem_cnt(); - const int64_t num_values = model_diff_values->shape().elem_cnt(); + const int64_t num_indices = model_diff_indices->shape_view().elem_cnt(); + const int64_t num_values = model_diff_values->shape_view().elem_cnt(); if (num_indices == 0) { CHECK_EQ(num_values, 0); return; @@ -339,14 +339,15 @@ class IndexedSlicesMomentumUpdateKernel final : public user_op::OpKernel { CHECK_NE(num_values, 0); CHECK_EQ(num_values % num_indices, 0); const int64_t feature_size = num_values / num_indices; - CHECK_EQ(feature_size, model_diff_values->shape().Count(model_diff_indices->shape().NumAxes())); + CHECK_EQ(feature_size, + model_diff_values->shape_view().Count(model_diff_indices->shape_view().NumAxes())); auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(model->shape().At(0), kernel_cache->upper() - kernel_cache->lower()); + CHECK_EQ(model->shape_view().At(0), kernel_cache->upper() - kernel_cache->lower()); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_indices, num_values); - CHECK_GE(tmp_buffer->shape().elem_cnt(), buffer_manager.GetTotalBufferSize()); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.GetTotalBufferSize()); ReduceSumUtilT::ReduceSum( ctx->stream(), num_indices, feature_size, model_diff_indices->dptr(), model_diff_values->dptr(), buffer_manager.NumUniqueDiffIndicesPtr(), @@ -419,7 +420,8 @@ class AdamUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra const float* bias_correction1_ptr = nullptr; if (ctx->has_input("bias_correction1", 0)) { const user_op::Tensor* bias_correction1 = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0); - CHECK_EQ(bias_correction1->shape().elem_cnt(), 1); // Just for Lazy Optional Input Check. + CHECK_EQ(bias_correction1->shape_view().elem_cnt(), + 1); // Just for Lazy Optional Input Check. bias_correction1_ptr = bias_correction1->dptr(); } @@ -427,7 +429,8 @@ class AdamUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra const float* bias_correction2_ptr = nullptr; if (ctx->has_input("bias_correction2", 0)) { const user_op::Tensor* bias_correction2 = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0); - CHECK_EQ(bias_correction2->shape().elem_cnt(), 1); // Just for Lazy Optional Input Check. + CHECK_EQ(bias_correction2->shape_view().elem_cnt(), + 1); // Just for Lazy Optional Input Check. 
bias_correction2_ptr = bias_correction2->dptr(); } @@ -435,19 +438,19 @@ class AdamUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } AdamUpdateKernelUtil::Update( - ctx->stream(), model->shape().elem_cnt(), static_cast(scale), l1, l2, beta1, beta2, + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate_ptr, scale_by_ptr, skip_if_ptr, bias_correction1_ptr, bias_correction2_ptr, model_diff->dptr(), model->mut_dptr(), m->mut_dptr(), @@ -506,18 +509,18 @@ class AdagradUpdateKernel final : public user_op::OpKernel, public user_op::Cuda if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } AdagradUpdateKernelUtil::Update( - ctx->stream(), model->shape().elem_cnt(), static_cast(scale), l1, l2, lr_decay, epsilon, - weight_decay, learning_rate_val, train_step_val, learning_rate_ptr, train_step_ptr, + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, lr_decay, + epsilon, weight_decay, learning_rate_val, train_step_val, learning_rate_ptr, train_step_ptr, scale_by_ptr, skip_if_ptr, model_diff->dptr(), model->mut_dptr(), sum->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } @@ -562,14 +565,14 @@ class IndexedSlicesAdamUpdateKernel final : public user_op::OpKernel { const float* bias_correction1_ptr = nullptr; if (ctx->has_input("bias_correction1", 0)) { const user_op::Tensor* bias_correction1 = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0); - CHECK_EQ(bias_correction1->shape().elem_cnt(), 1); + CHECK_EQ(bias_correction1->shape_view().elem_cnt(), 1); bias_correction1_ptr = bias_correction1->dptr(); } const float* bias_correction2_ptr = nullptr; if (ctx->has_input("bias_correction2", 0)) { const user_op::Tensor* bias_correction2 = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0); - CHECK_EQ(bias_correction2->shape().elem_cnt(), 1); + CHECK_EQ(bias_correction2->shape_view().elem_cnt(), 1); bias_correction2_ptr = bias_correction2->dptr(); } @@ -595,9 +598,9 @@ class IndexedSlicesAdamUpdateKernel final : public user_op::OpKernel { auto* kernel_cache = dynamic_cast(cache); CHECK_NOTNULL(kernel_cache); - CHECK_EQ(model->shape().At(0), kernel_cache->upper() - kernel_cache->lower()); - const 
int64_t num_indices = model_diff_indices->shape().elem_cnt(); - const int64_t num_values = model_diff_values->shape().elem_cnt(); + CHECK_EQ(model->shape_view().At(0), kernel_cache->upper() - kernel_cache->lower()); + const int64_t num_indices = model_diff_indices->shape_view().elem_cnt(); + const int64_t num_values = model_diff_values->shape_view().elem_cnt(); if (num_indices == 0) { CHECK_EQ(num_values, 0); return; @@ -605,11 +608,12 @@ class IndexedSlicesAdamUpdateKernel final : public user_op::OpKernel { CHECK_NE(num_values, 0); CHECK_EQ(num_values % num_indices, 0); const int64_t feature_size = num_values / num_indices; - CHECK_EQ(feature_size, model_diff_values->shape().Count(model_diff_indices->shape().NumAxes())); + CHECK_EQ(feature_size, + model_diff_values->shape_view().Count(model_diff_indices->shape_view().NumAxes())); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_indices, num_values); - CHECK_GE(tmp_buffer->shape().elem_cnt(), buffer_manager.GetTotalBufferSize()); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.GetTotalBufferSize()); ReduceSumUtilT::ReduceSum( ctx->stream(), num_indices, feature_size, model_diff_indices->dptr(), @@ -692,7 +696,8 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra user_op::Tensor* m = ctx->Tensor4ArgNameAndIndex("m", 0); user_op::Tensor* v = ctx->Tensor4ArgNameAndIndex("v", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - LambTmpBufferManager tbm(tmp_buffer->mut_dptr(), model->shape().elem_cnt()); + LambTmpBufferManager tbm(tmp_buffer->mut_dptr(), + model->shape_view().elem_cnt()); const auto scale = ctx->Attr("scale"); const auto l1 = ctx->Attr("l1"); @@ -708,14 +713,14 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra if (ctx->has_input("bias_correction1", 0)) { const user_op::Tensor* bias_correction1 = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0); // Just for Lazy optional input check. 
- CHECK_EQ(bias_correction1->shape().elem_cnt(), 1); + CHECK_EQ(bias_correction1->shape_view().elem_cnt(), 1); bias_correction1_ptr = bias_correction1->dptr(); } const float bias_correction2_val = ctx->Attr("bias_correction2_val"); const float* bias_correction2_ptr = nullptr; if (ctx->has_input("bias_correction2", 0)) { const user_op::Tensor* bias_correction2 = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0); - CHECK_EQ(bias_correction2->shape().elem_cnt(), 1); + CHECK_EQ(bias_correction2->shape_view().elem_cnt(), 1); bias_correction2_ptr = bias_correction2->dptr(); } @@ -730,23 +735,23 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } LambUpdateKernelUtil::Update( - ctx->stream(), m->shape().elem_cnt(), scale, l1, l2, beta1, beta2, epsilon, weight_decay, - learning_rate_val, do_bias_correction, bias_correction1_val, bias_correction2_val, - learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if_ptr, - model_diff->dptr(), tbm.AdamDiffPtr(), model->mut_dptr(), m->mut_dptr(), - v->mut_dptr(), tbm.NormBufferPtr()); + ctx->stream(), m->shape_view().elem_cnt(), scale, l1, l2, beta1, beta2, epsilon, + weight_decay, learning_rate_val, do_bias_correction, bias_correction1_val, + bias_correction2_val, learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, + scale_by_ptr, skip_if_ptr, model_diff->dptr(), tbm.AdamDiffPtr(), model->mut_dptr(), + m->mut_dptr(), v->mut_dptr(), tbm.NormBufferPtr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; @@ -831,13 +836,13 @@ class RmsPropUpdateKernel final : public user_op::OpKernel, public user_op::Cuda if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } T* mean_gradient_ptr = nullptr; @@ -846,9 +851,10 @@ class RmsPropUpdateKernel final : public user_op::OpKernel, public user_op::Cuda mean_gradient_ptr = mean_gradient->mut_dptr(); } RmsPropUpdateKernelUtil::Update( - ctx->stream(), model->shape().elem_cnt(), static_cast(scale), l1, l2, centered, epsilon, - weight_decay, decay_rate, learning_rate_val, learning_rate_ptr, scale_by_ptr, skip_if_ptr, - model_diff->dptr(), model->mut_dptr(), mean_square->mut_dptr(), mean_gradient_ptr); + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, centered, + epsilon, weight_decay, decay_rate, learning_rate_val, 
learning_rate_ptr, scale_by_ptr, + skip_if_ptr, model_diff->dptr(), model->mut_dptr(), mean_square->mut_dptr(), + mean_gradient_ptr); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; @@ -916,7 +922,8 @@ class LarsUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra user_op::Tensor* model = ctx->Tensor4ArgNameAndIndex("model", 0); user_op::Tensor* momentum = ctx->Tensor4ArgNameAndIndex("momentum", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - LarsTmpBufferManager tlm(tmp_buffer->mut_dptr(), model->shape().elem_cnt()); + LarsTmpBufferManager tlm(tmp_buffer->mut_dptr(), + model->shape_view().elem_cnt()); const auto scale = ctx->Attr("scale"); const auto l1 = ctx->Attr("l1"); const auto l2 = ctx->Attr("l2"); @@ -928,17 +935,17 @@ class LarsUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } LarsUpdateKernelUtil::Update( - ctx->stream(), model->shape().elem_cnt(), static_cast(scale), l1, l2, momentum_beta, + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, momentum_beta, epsilon, lars_coefficient, weight_decay, learning_rate->dptr(), scale_by_ptr, skip_if_ptr, model_diff->dptr(), model->mut_dptr(), momentum->mut_dptr(), tlm.DataTmpPtr(), tlm.ModelDiffPtr()); @@ -1007,18 +1014,18 @@ class FtrlUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), model->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } FtrlUpdateKernelUtil::Update( - ctx->stream(), model->shape().elem_cnt(), static_cast(scale), l1, l2, lr_power, lambda1, - lambda2, beta, weight_decay, learning_rate_val, learning_rate_ptr, scale_by_ptr, + ctx->stream(), model->shape_view().elem_cnt(), static_cast(scale), l1, l2, lr_power, + lambda1, lambda2, beta, weight_decay, learning_rate_val, learning_rate_ptr, scale_by_ptr, skip_if_ptr, model_diff->dptr(), model->mut_dptr(), accumulate->mut_dptr(), z->mut_dptr()); } diff --git a/oneflow/user/kernels/moving_average_min_max_observer_kernel.cpp b/oneflow/user/kernels/moving_average_min_max_observer_kernel.cpp index 6d9d045d5ee..834adc52421 100644 --- a/oneflow/user/kernels/moving_average_min_max_observer_kernel.cpp +++ b/oneflow/user/kernels/moving_average_min_max_observer_kernel.cpp @@ -136,7 +136,7 @@ class CpuMovingAverageMinMaxObserverKernel final : public user_op::OpKernel { T* 
scale_ptr = scale->mut_dptr(); T* zero_point_ptr = zero_point->mut_dptr(); - int64_t num_elements = in->shape().elem_cnt(); + int64_t num_elements = in->shape_view().elem_cnt(); if (quantization_formula == "google") { if (quantization_scheme == "symmetric") { diff --git a/oneflow/user/kernels/moving_average_min_max_observer_kernel.cu b/oneflow/user/kernels/moving_average_min_max_observer_kernel.cu index d0398fa97c1..2db5a2ef984 100644 --- a/oneflow/user/kernels/moving_average_min_max_observer_kernel.cu +++ b/oneflow/user/kernels/moving_average_min_max_observer_kernel.cu @@ -241,13 +241,13 @@ class GpuMovingAverageMinMaxObserverKernel final : public user_op::OpKernel { const float momentum = ctx->Attr("momentum"); const std::string quantization_formula = ctx->Attr("quantization_formula"); - int64_t elements = in->shape().elem_cnt(); + int64_t elements = in->shape_view().elem_cnt(); T* max_ptr = tmp_buffer->mut_dptr(); T* min_ptr = max_ptr + 1; - int64_t* host_current_train_step_ptr = new int64_t[current_train_step->shape().elem_cnt()]; + int64_t* host_current_train_step_ptr = new int64_t[current_train_step->shape_view().elem_cnt()]; OF_CUDA_CHECK(cudaMemcpy(host_current_train_step_ptr, current_train_step->dptr(), - current_train_step->shape().elem_cnt() * sizeof(int64_t), + current_train_step->shape_view().elem_cnt() * sizeof(int64_t), cudaMemcpyDefault)); auto* cuda_stream = ctx->stream()->As(); if (*host_current_train_step_ptr <= stop_update_after_iters && is_training) { diff --git a/oneflow/user/kernels/multi_reduce_kernels.h b/oneflow/user/kernels/multi_reduce_kernels.h index 276532380f0..32f3c4c6193 100644 --- a/oneflow/user/kernels/multi_reduce_kernels.h +++ b/oneflow/user/kernels/multi_reduce_kernels.h @@ -38,7 +38,7 @@ class MultiReduceSumPowAbsKernel final : public user_op::OpKernel, params.resize(ctx->input_size("x")); for (size_t i = 0; i < params.size(); ++i) { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", i); - params[i].size = x->shape().elem_cnt(); + params[i].size = x->shape_view().elem_cnt(); params[i].data = x->dptr(); } user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); @@ -88,7 +88,7 @@ class MultiReduceXimumAbsKernel final : public user_op::OpKernel, public user_op params.resize(ctx->input_size("x")); for (size_t i = 0; i < params.size(); ++i) { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", i); - params[i].size = x->shape().elem_cnt(); + params[i].size = x->shape_view().elem_cnt(); params[i].data = x->dptr(); } user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); diff --git a/oneflow/user/kernels/narrow_kernel.cpp b/oneflow/user/kernels/narrow_kernel.cpp index f7db7230c4a..a7bc1794874 100644 --- a/oneflow/user/kernels/narrow_kernel.cpp +++ b/oneflow/user/kernels/narrow_kernel.cpp @@ -55,12 +55,12 @@ class NarrowKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape().elem_cnt() == 0) { return; } + if (in->shape_view().elem_cnt() == 0) { return; } user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); const int64_t& dim = ctx->Attr("dim"); const int64_t& start = ctx->Attr("start"); - int64_t length = out->shape().At(dim); - const ShapeView in_shape = in->shape(); + int64_t length = out->shape_view().At(dim); + const ShapeView in_shape = in->shape_view(); auto copy_nd_primitive = NewCopyNdPrimitive(ctx); CHECK(copy_nd_primitive); @@ -92,9 +92,9 @@ class NarrowGradKernel final : public 
user_op::OpKernel { user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const int64_t& dim = ctx->Attr("dim"); const int64_t& start = ctx->Attr("start"); - int64_t length = dy->shape().At(dim); + int64_t length = dy->shape_view().At(dim); - size_t dx_byte_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t dx_byte_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); void* dst = dx->mut_dptr(); std::unique_ptr memset_primitive = ep::primitive::NewPrimitive(ctx->device_type()); @@ -103,7 +103,7 @@ class NarrowGradKernel final : public user_op::OpKernel { auto copy_nd_primitive = NewCopyNdPrimitive(ctx); CHECK(copy_nd_primitive); - const ShapeView dx_shape = dx->shape(); + const ShapeView dx_shape = dx->shape_view(); const int64_t outer_dim = dx_shape.Count(0, dim); const int64_t inner_dim = dx_shape.Count(dim + 1); diff --git a/oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp b/oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp index 38e23836980..b15c1eb851a 100644 --- a/oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp +++ b/oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp @@ -32,11 +32,10 @@ class NcclLogical2DSameDim0KernelCommState : public user_op::OpKernelState { public: explicit NcclLogical2DSameDim0KernelCommState(user_op::KernelInitContext* ctx) : is_init_(false), - has_independent_stream_(ctx->op_conf().has_stream_name_hint()), - stream_name_("NONE"), + stream_name_(EagerNcclCommMgr::kDefaultStreamName), parallel_desc_(ctx->parallel_desc()), this_parallel_id_(ctx->parallel_ctx().parallel_id()) { - if (has_independent_stream_) { stream_name_ = ctx->op_conf().stream_name_hint(); } + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } } ~NcclLogical2DSameDim0KernelCommState() override = default; @@ -71,17 +70,12 @@ class NcclLogical2DSameDim0KernelCommState : public user_op::OpKernelState { device_set.emplace(std::make_pair(machine_id, device_id)); } EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); - if (has_independent_stream_) { - comm_ = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - } else { - comm_ = comm_mgr->GetCommForDevice(device_set); - } + comm_ = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); num_ranks_ = group_size; is_init_ = true; } bool is_init_; - bool has_independent_stream_; std::string stream_name_; ParallelDesc parallel_desc_; int64_t this_parallel_id_; @@ -136,11 +130,11 @@ class NcclLogical2DSameDim0AllReduce final : public user_op::OpKernel { CHECK(nccl_comm != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); VLOG(3) << "[NcclLogical2D][SameDim0AllReduce] " << nccl_comm->stream_name() << " " << ctx->op_name() << std::endl; - OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), ncclRedOp_t::ncclSum, nccl_comm->comm(), ctx->stream()->As()->cuda_stream())); @@ -168,10 +162,10 @@ class NcclLogical2DSameDim0AllGather final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = nccl_comm->num_ranks(); - CHECK_EQ(in->shape().elem_cnt() * 
num_ranks, out->shape().elem_cnt()); + CHECK_EQ(in->shape_view().elem_cnt() * num_ranks, out->shape_view().elem_cnt()); VLOG(3) << "[NcclLogical2D][SameDim0AllGather] " << nccl_comm->stream_name() << " " << ctx->op_name() << std::endl; - OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), nccl_comm->comm(), ctx->stream()->As()->cuda_stream())); }; @@ -206,24 +200,24 @@ class NcclLogical2DSameDim0AllGatherNoncontinuous final : public user_op::OpKern user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(out->shape().elem_cnt() * dtype_size); + int64_t data_size = GetCudaAlignedSize(out->shape_view().elem_cnt() * dtype_size); void* unpack_from_ptr = tmp_buffer->mut_dptr(); - CHECK_EQ(tmp_buffer->shape().elem_cnt(), data_size); + CHECK_EQ(tmp_buffer->shape_view().elem_cnt(), data_size); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = kernel_state->num_ranks(); const int64_t in_split_axis = kernel_state->src_split_axis(); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; VLOG(3) << "[NcclLogical2D][SameDim0AllGatherNoncontinuous] " << kernel_state->stream_name() << " " << ctx->op_name() << std::endl; // NOTE(chengcheng): Do AllGather - CHECK_EQ(in->shape().elem_cnt() * num_ranks, out->shape().elem_cnt()); - OF_NCCL_CHECK(ncclAllGather(in->dptr(), unpack_from_ptr, in->shape().elem_cnt(), + CHECK_EQ(in->shape_view().elem_cnt() * num_ranks, out->shape_view().elem_cnt()); + OF_NCCL_CHECK(ncclAllGather(in->dptr(), unpack_from_ptr, in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), kernel_state->comm(), ctx->stream()->As()->cuda_stream())); @@ -285,22 +279,22 @@ class NcclLogical2DSameDim0All2All final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int64_t tmp_size = 0; const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(in->shape().elem_cnt() * dtype_size); + int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out const char* pack_to_ptr = in->dptr(); char* unpack_from_ptr = out->mut_dptr(); - if (tmp_buffer) { tmp_size = tmp_buffer->shape().elem_cnt(); } + if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = kernel_state->num_ranks(); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()); - const int64_t elem_cnt = in->shape().elem_cnt(); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()); + const int64_t elem_cnt = in->shape_view().elem_cnt(); const int64_t in_split_axis = kernel_state->src_split_axis(); const int64_t out_split_axis = kernel_state->dst_split_axis(); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); logical_shape_dim_vec[in_split_axis] = 
logical_shape_dim_vec.at(in_split_axis) * num_ranks; VLOG(3) << "[NcclLogical2D][SameDim0All2All] " << kernel_state->stream_name() << " " @@ -309,7 +303,7 @@ class NcclLogical2DSameDim0All2All final : public user_op::OpKernel { if (out_split_axis != 0) { // NOTE(chengcheng): Do pack. Need transpose in -> pack_to // pack use temp buffer offset: [0, data_size] - pack_to_ptr = tmp_buffer->dptr(); + pack_to_ptr = CHECK_NOTNULL(tmp_buffer)->dptr(); DimVector transpose_in_dim_vec = logical_shape_dim_vec; CHECK_EQ(transpose_in_dim_vec.at(in_split_axis) % num_ranks, 0); transpose_in_dim_vec[in_split_axis] = transpose_in_dim_vec.at(in_split_axis) / num_ranks; @@ -332,7 +326,7 @@ class NcclLogical2DSameDim0All2All final : public user_op::OpKernel { if (in_split_axis != 0) { // NOTE(chengcheng): Do unpack. Need transpose unpack_from -> out // unpack use temp buffer offset: [tmp_size - data_size, tmp_size] - unpack_from_ptr = tmp_buffer->mut_dptr() + (tmp_size - data_size); + unpack_from_ptr = CHECK_NOTNULL(tmp_buffer)->mut_dptr() + (tmp_size - data_size); } { @@ -399,11 +393,10 @@ class NcclLogical2DSameDim1KernelCommState final : public user_op::OpKernelState public: explicit NcclLogical2DSameDim1KernelCommState(user_op::KernelInitContext* ctx) : is_init_(false), - has_independent_stream_(ctx->op_conf().has_stream_name_hint()), - stream_name_("NONE"), + stream_name_(EagerNcclCommMgr::kDefaultStreamName), parallel_desc_(ctx->parallel_desc()), this_parallel_id_(ctx->parallel_ctx().parallel_id()) { - if (has_independent_stream_) { stream_name_ = ctx->op_conf().stream_name_hint(); } + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } } ~NcclLogical2DSameDim1KernelCommState() = default; @@ -425,12 +418,7 @@ class NcclLogical2DSameDim1KernelCommState final : public user_op::OpKernelState device_set.emplace(std::make_pair(machine_id, device_id)); } EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); - CHECK_NOTNULL(comm_mgr); - if (has_independent_stream_) { - comm_ = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - } else { - comm_ = comm_mgr->GetCommForDevice(device_set); - } + comm_ = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); is_init_ = true; } return comm_; @@ -440,7 +428,6 @@ class NcclLogical2DSameDim1KernelCommState final : public user_op::OpKernelState private: bool is_init_; - bool has_independent_stream_; std::string stream_name_; ParallelDesc parallel_desc_; int64_t this_parallel_id_; @@ -464,11 +451,11 @@ class NcclLogical2DSameDim1AllReduce final : public user_op::OpKernel { CHECK(nccl_comm != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); VLOG(3) << "[NcclLogical2D][SameDim1AllReduce] " << nccl_comm->stream_name() << " " << ctx->op_name() << std::endl; - OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), ncclRedOp_t::ncclSum, nccl_comm->comm(), ctx->stream()->As()->cuda_stream())); @@ -521,6 +508,12 @@ REGISTER_USER_KERNEL("_nccl_logical_2D_same_dim1_all_reduce") .SetCreateFn() .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_2D_same_dim0_all_reduce"); 
+REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_2D_same_dim0_all_gather"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_2D_same_dim0_all_gather_noncontinuous"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_2D_same_dim0_all2all"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_2D_same_dim1_all_reduce"); + } // namespace oneflow #endif // WITH_CUDA && NCCL_VERSION_CODE > 2700 diff --git a/oneflow/user/kernels/nccl_logical_kernels.cpp b/oneflow/user/kernels/nccl_logical_kernels.cpp index 3b1f95e2289..34dec5804ef 100644 --- a/oneflow/user/kernels/nccl_logical_kernels.cpp +++ b/oneflow/user/kernels/nccl_logical_kernels.cpp @@ -32,10 +32,9 @@ class NcclLogicalKernelCommState : public user_op::OpKernelState { public: explicit NcclLogicalKernelCommState(user_op::KernelInitContext* ctx) : is_init_(false), - has_independent_stream_(ctx->op_conf().has_stream_name_hint()), - stream_name_("NONE"), + stream_name_(EagerNcclCommMgr::kDefaultStreamName), parallel_desc_(ctx->parallel_desc()) { - if (has_independent_stream_) { stream_name_ = ctx->op_conf().stream_name_hint(); } + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } } ~NcclLogicalKernelCommState() override = default; @@ -48,11 +47,7 @@ class NcclLogicalKernelCommState : public user_op::OpKernelState { device_set.emplace(std::make_pair(machine_id, device_id)); } EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); - if (has_independent_stream_) { - comm_ = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - } else { - comm_ = comm_mgr->GetCommForDevice(device_set); - } + comm_ = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); is_init_ = true; } return comm_; @@ -62,7 +57,6 @@ class NcclLogicalKernelCommState : public user_op::OpKernelState { private: bool is_init_; - bool has_independent_stream_; std::string stream_name_; ParallelDesc parallel_desc_; ncclComm_t comm_{}; @@ -127,11 +121,11 @@ class NcclLogicalAllReduceKernel final : public user_op::OpKernel { CHECK(nccl_comm != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->shape_view(), out->shape_view()); CHECK_EQ(in->data_type(), out->data_type()); VLOG(3) << "[NcclLogical][AllReduce] " << nccl_comm->stream_name() << " " << ctx->op_name() << std::endl; - OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), ncclRedOp_t::ncclSum, nccl_comm->comm(), ctx->stream()->As()->cuda_stream())); @@ -159,10 +153,10 @@ class NcclLogicalReduceScatterKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = ctx->parallel_ctx().parallel_num(); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt() * num_ranks); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt() * num_ranks); VLOG(3) << "[NcclLogical][ReduceScatter] " << nccl_comm->stream_name() << " " << ctx->op_name() << std::endl; - OF_NCCL_CHECK(ncclReduceScatter(in->dptr(), out->mut_dptr(), out->shape().elem_cnt(), + OF_NCCL_CHECK(ncclReduceScatter(in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), ncclRedOp_t::ncclSum, nccl_comm->comm(), 
ctx->stream()->As()->cuda_stream())); @@ -190,10 +184,10 @@ class NcclLogicalAllGatherKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = ctx->parallel_ctx().parallel_num(); - CHECK_EQ(in->shape().elem_cnt() * num_ranks, out->shape().elem_cnt()); + CHECK_EQ(in->shape_view().elem_cnt() * num_ranks, out->shape_view().elem_cnt()); VLOG(3) << "[NcclLogical][AllGather] " << nccl_comm->stream_name() << " " << ctx->op_name() << std::endl; - OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), + OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), nccl_comm->comm(), ctx->stream()->As()->cuda_stream())); }; @@ -227,24 +221,24 @@ class NcclLogicalAllGatherNoncontinuous final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(out->shape().elem_cnt() * dtype_size); + int64_t data_size = GetCudaAlignedSize(out->shape_view().elem_cnt() * dtype_size); void* unpack_from_ptr = tmp_buffer->mut_dptr(); - CHECK_EQ(tmp_buffer->shape().elem_cnt(), data_size); + CHECK_EQ(tmp_buffer->shape_view().elem_cnt(), data_size); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = ctx->parallel_ctx().parallel_num(); const int64_t in_split_axis = kernel_state->src_split_axis(); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; VLOG(3) << "[NcclLogical][AllGatherNoncontinuous] " << kernel_state->stream_name() << " " << ctx->op_name() << std::endl; // NOTE(chengcheng): Do AllGather - CHECK_EQ(in->shape().elem_cnt() * num_ranks, out->shape().elem_cnt()); - OF_NCCL_CHECK(ncclAllGather(in->dptr(), unpack_from_ptr, in->shape().elem_cnt(), + CHECK_EQ(in->shape_view().elem_cnt() * num_ranks, out->shape_view().elem_cnt()); + OF_NCCL_CHECK(ncclAllGather(in->dptr(), unpack_from_ptr, in->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), kernel_state->comm(), ctx->stream()->As()->cuda_stream())); @@ -299,15 +293,15 @@ class NcclLogicalReduceScatterNoncontinuous final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(in->shape().elem_cnt() * dtype_size); - CHECK_EQ(tmp_buffer->shape().elem_cnt(), data_size); + int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); + CHECK_EQ(tmp_buffer->shape_view().elem_cnt(), data_size); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = ctx->parallel_ctx().parallel_num(); const int64_t out_split_axis = kernel_state->dst_split_axis(); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); DimVector transpose_in_dim_vec = logical_shape_dim_vec; transpose_in_dim_vec[out_split_axis] = transpose_in_dim_vec.at(out_split_axis) / num_ranks; @@ -327,9 +321,9 @@ class NcclLogicalReduceScatterNoncontinuous final : public 
user_op::OpKernel { << ctx->op_name() << std::endl; ncclRedOp_t reduce_type = ncclRedOp_t::ncclSum; if (in->data_type() == kBool) { reduce_type = ncclRedOp_t::ncclMax; } - OF_NCCL_CHECK(ncclReduceScatter(tmp_buffer->dptr(), out->mut_dptr(), out->shape().elem_cnt(), - GetNcclDataType(in->data_type()), reduce_type, - kernel_state->comm(), + OF_NCCL_CHECK(ncclReduceScatter(tmp_buffer->dptr(), out->mut_dptr(), + out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), + reduce_type, kernel_state->comm(), ctx->stream()->As()->cuda_stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -374,22 +368,22 @@ class NcclLogicalS2SKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int64_t tmp_size = 0; const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(in->shape().elem_cnt() * dtype_size); + int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out const char* pack_to_ptr = in->dptr(); char* unpack_from_ptr = out->mut_dptr(); - if (tmp_buffer) { tmp_size = tmp_buffer->shape().elem_cnt(); } + if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); CHECK_EQ(in->data_type(), out->data_type()); const int64_t num_ranks = ctx->parallel_ctx().parallel_num(); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()); - const int64_t elem_cnt = in->shape().elem_cnt(); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()); + const int64_t elem_cnt = in->shape_view().elem_cnt(); const int64_t in_split_axis = kernel_state->src_split_axis(); const int64_t out_split_axis = kernel_state->dst_split_axis(); DimVector logical_shape_dim_vec; - in->shape().ToDimVector(&logical_shape_dim_vec); + in->shape_view().ToDimVector(&logical_shape_dim_vec); logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; VLOG(3) << "[NcclLogical][S2S] " << kernel_state->stream_name() << " " << ctx->op_name() @@ -398,7 +392,7 @@ class NcclLogicalS2SKernel final : public user_op::OpKernel { if (out_split_axis != 0) { // NOTE(chengcheng): Do pack. Need transpose in -> pack_to // pack use temp buffer offset: [0, data_size] - pack_to_ptr = tmp_buffer->dptr(); + pack_to_ptr = CHECK_NOTNULL(tmp_buffer)->dptr(); DimVector transpose_in_dim_vec = logical_shape_dim_vec; CHECK_EQ(transpose_in_dim_vec.at(in_split_axis) % num_ranks, 0); transpose_in_dim_vec[in_split_axis] = transpose_in_dim_vec.at(in_split_axis) / num_ranks; @@ -421,7 +415,7 @@ class NcclLogicalS2SKernel final : public user_op::OpKernel { if (in_split_axis != 0) { // NOTE(chengcheng): Do unpack. 
Need transpose unpack_from -> out // unpack use temp buffer offset: [tmp_size - data_size, tmp_size] - unpack_from_ptr = tmp_buffer->mut_dptr() + (tmp_size - data_size); + unpack_from_ptr = CHECK_NOTNULL(tmp_buffer)->mut_dptr() + (tmp_size - data_size); } { @@ -545,6 +539,12 @@ REGISTER_S2S_KERNEL(float) REGISTER_S2S_KERNEL(double) REGISTER_S2S_KERNEL(float16) +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_all_reduce"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_reduce_scatter"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_all_gather"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_all_gather_noncontinuous"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("_nccl_logical_s2s"); + } // namespace oneflow #endif // WITH_CUDA && NCCL_VERSION_CODE > 2700 diff --git a/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp b/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp index 0dcce716725..c0a8ecb8a0d 100644 --- a/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp +++ b/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp @@ -56,7 +56,6 @@ class NcclLogicalSendRecvState final : public user_op::OpKernelState { return *comm_; } - bool has_independent_stream_; std::string stream_name_; std::unique_ptr parallel_desc_; mutable std::unique_ptr comm_; @@ -68,8 +67,8 @@ class NcclLogicalSendRecvState final : public user_op::OpKernelState { }; NcclLogicalSendRecvState::NcclLogicalSendRecvState(user_op::KernelInitContext* ctx) - : has_independent_stream_(ctx->op_conf().has_stream_name_hint()) { - if (has_independent_stream_) { stream_name_ = ctx->op_conf().stream_name_hint(); } + : stream_name_(EagerNcclCommMgr::kDefaultStreamName) { + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); parallel_desc_ = std::make_unique(ctx->parallel_desc()); NdSbp src_nd_sbp; @@ -129,11 +128,7 @@ void NcclLogicalSendRecvState::InitComm() const { } EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); ncclComm_t comm = nullptr; - if (has_independent_stream_) { - comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - } else { - comm = comm_mgr->GetCommForDevice(device_set); - } + comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); comm_.reset(new Comm(comm)); } @@ -227,24 +222,24 @@ void NcclLogicalSendRecv::Compute(user_op::KernelComputeContext* ctx, user_op::O if (out_tensor_slice_copier_vec.at(i)) { if (is_first_slice) { is_first_slice = false; - if (recv_elem_cnts.at(i) != out->shape().elem_cnt()) { + if (recv_elem_cnts.at(i) != out->shape_view().elem_cnt()) { // if not same shape, memset out memset_primitive->Launch(ctx->stream(), out->mut_dptr(), 0, - out->shape().elem_cnt() * GetSizeOfDataType(data_type)); + out->shape_view().elem_cnt() * GetSizeOfDataType(data_type)); } out_tensor_slice_copier_vec.at(i)->Copy(ctx->stream(), out->mut_dptr(), recv_out_ptr.at(i)); } else { - if (recv_elem_cnts.at(i) == out->shape().elem_cnt()) { + if (recv_elem_cnts.at(i) == out->shape_view().elem_cnt()) { add_primitive->Launch(ctx->stream(), out->dptr(), recv_out_ptr.at(i), out->mut_dptr(), - out->shape().elem_cnt()); + out->shape_view().elem_cnt()); } else { void* out_buf = reinterpret_cast(buf_ptr + offset); memset_primitive->Launch(ctx->stream(), out_buf, 0, - out->shape().elem_cnt() * GetSizeOfDataType(data_type)); + out->shape_view().elem_cnt() * GetSizeOfDataType(data_type)); 
out_tensor_slice_copier_vec.at(i)->Copy(ctx->stream(), out_buf, recv_out_ptr.at(i)); add_primitive->Launch(ctx->stream(), out->dptr(), out_buf, out->mut_dptr(), - out->shape().elem_cnt()); + out->shape_view().elem_cnt()); } } } diff --git a/oneflow/user/kernels/nd_index_slice_kernels.h b/oneflow/user/kernels/nd_index_slice_kernels.h index 871c73f47eb..7df6eadcde8 100644 --- a/oneflow/user/kernels/nd_index_slice_kernels.h +++ b/oneflow/user/kernels/nd_index_slice_kernels.h @@ -73,7 +73,7 @@ void GatherNdKernel::Compute(user_op::KernelComputeContext* c const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); const user_op::Tensor* params = ctx->Tensor4ArgNameAndIndex("params", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - if (indices->shape().elem_cnt() == 0) { return; } + if (indices->shape_view().elem_cnt() == 0) { return; } auto args = ConstructNdIndexSliceArgs(*params, *out, *indices); GatherNdFunctor()(ctx->stream(), args, indices->dptr(), params->dptr(), out->mut_dptr()); @@ -84,9 +84,9 @@ void ScatterNdKernel::Compute(user_op::KernelComputeContext* const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); const user_op::Tensor* updates = ctx->Tensor4ArgNameAndIndex("updates", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - size_t out_bytes_size = out->shape().elem_cnt() * GetSizeOfDataType(out->data_type()); + size_t out_bytes_size = out->shape_view().elem_cnt() * GetSizeOfDataType(out->data_type()); Memset(ctx->stream(), out->mut_dptr(), 0, out_bytes_size); - if (indices->shape().elem_cnt() == 0) { return; } + if (indices->shape_view().elem_cnt() == 0) { return; } auto args = ConstructNdIndexSliceArgs(*out, *updates, *indices); ScatterNdAddFunctor()(ctx->stream(), args, indices->dptr(), updates->dptr(), out->mut_dptr()); @@ -99,9 +99,9 @@ void TensorScatterNdUpdateKernel::Compute( const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); const user_op::Tensor* updates = ctx->Tensor4ArgNameAndIndex("updates", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - size_t out_bytes_size = out->shape().elem_cnt() * GetSizeOfDataType(out->data_type()); + size_t out_bytes_size = out->shape_view().elem_cnt() * GetSizeOfDataType(out->data_type()); Memcpy(ctx->stream(), out->mut_dptr(), params->dptr(), out_bytes_size); - if (indices->shape().elem_cnt() == 0) { return; } + if (indices->shape_view().elem_cnt() == 0) { return; } auto args = ConstructNdIndexSliceArgs(*params, *updates, *indices); ScatterNdUpdateFunctor()(ctx->stream(), args, indices->dptr(), updates->dptr(), out->mut_dptr()); @@ -114,9 +114,9 @@ void TensorScatterNdAddKernel::Compute( const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); const user_op::Tensor* updates = ctx->Tensor4ArgNameAndIndex("updates", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - size_t out_bytes_size = out->shape().elem_cnt() * GetSizeOfDataType(out->data_type()); + size_t out_bytes_size = out->shape_view().elem_cnt() * GetSizeOfDataType(out->data_type()); Memcpy(ctx->stream(), out->mut_dptr(), params->dptr(), out_bytes_size); - if (indices->shape().elem_cnt() == 0) { return; } + if (indices->shape_view().elem_cnt() == 0) { return; } auto args = ConstructNdIndexSliceArgs(*params, *updates, *indices); ScatterNdAddFunctor()(ctx->stream(), args, indices->dptr(), updates->dptr(), out->mut_dptr()); diff --git a/oneflow/user/kernels/nd_index_slice_util.h b/oneflow/user/kernels/nd_index_slice_util.h index 
167dd0cba29..22cc9c836a7 100644 --- a/oneflow/user/kernels/nd_index_slice_util.h +++ b/oneflow/user/kernels/nd_index_slice_util.h @@ -36,10 +36,12 @@ inline NdIndexSliceArgs ConstructNdIndexSliceArgs(const user_op::Tensor& d const user_op::Tensor& indices) { NdIndexSliceArgs args; std::memset(&args, 0, sizeof(NdIndexSliceArgs)); - args.num_slices = indices.shape().Count(0, indices.shape().NumAxes() - 1); - args.index_ndims = indices.shape().At(indices.shape().NumAxes() - 1); - args.slice_size = slices.shape().Count(indices.shape().NumAxes() - 1); - FOR_RANGE(int64_t, i, 0, dense.shape().NumAxes()) { args.dense_shape[i] = dense.shape().At(i); } + args.num_slices = indices.shape_view().Count(0, indices.shape_view().NumAxes() - 1); + args.index_ndims = indices.shape_view().At(indices.shape_view().NumAxes() - 1); + args.slice_size = slices.shape_view().Count(indices.shape_view().NumAxes() - 1); + FOR_RANGE(int64_t, i, 0, dense.shape_view().NumAxes()) { + args.dense_shape[i] = dense.shape_view().At(i); + } return args; } diff --git a/oneflow/user/kernels/nll_kernel.cpp b/oneflow/user/kernels/nll_kernel.cpp index f71df661167..8204a95e874 100644 --- a/oneflow/user/kernels/nll_kernel.cpp +++ b/oneflow/user/kernels/nll_kernel.cpp @@ -14,130 +14,180 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/user/kernels/nll_kernel_util.h" namespace oneflow { -namespace user_op { + namespace { -using namespace loss; - -template -void ComputeNllOut(int64_t num_instances, K num_classes, K ignore_index, const T* input, - const K* target, T* out, const T* weight, T* total_weight) { - *total_weight = 0; - FOR_RANGE(int64_t, i, 0, num_instances) { - K label = target[i]; - if (label == ignore_index) { - out[i] = 0; - continue; +class NLLKernelCache final : public user_op::OpKernelCache { + public: + NLLKernelCache(int64_t class_start, int64_t num_classes) + : class_start_(class_start), num_classes_(num_classes) {} + ~NLLKernelCache() override = default; + + int64_t class_start() const { return class_start_; } + int64_t num_classes() const { return num_classes_; } + + private: + const int64_t class_start_; + const int64_t num_classes_; +}; + +std::shared_ptr CreateNLLKernelCache(user_op::KernelCacheContext* ctx) { + CHECK_GT(ctx->parallel_ctx().parallel_num(), 0) << ctx->op_name() << ": invalid parallel_ctx"; + if (ctx->parallel_ctx().parallel_num() == 1) { return nullptr; } + + const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("input", 0); + const Shape& hierarchy = *ctx->parallel_desc().hierarchy(); + CHECK_EQ(nd_sbp.sbp_parallel_size(), hierarchy.NumAxes()) + << ctx->op_name() << ": Expected input sbp " << NdSbpToString(nd_sbp) << " match hierarchy " + << hierarchy.ToString(); + + const Shape& shape = ctx->LogicalTensorDesc4ArgNameAndIndex("input", 0)->shape(); + const int64_t class_axis = shape.NumAxes() - 1; + + bool split_class_dim = false; + for (const auto& sbp : nd_sbp.sbp_parallel()) { + if (sbp.has_split_parallel() && sbp.split_parallel().axis() == class_axis) { + split_class_dim = true; + break; } - CHECK_GE(label, 0); - CHECK_LT(label, num_classes); - T cur_weight = weight == nullptr ? 
1 : weight[label]; - *total_weight += cur_weight; - out[i] = -input[i * num_classes + label] * cur_weight; - } -} -template -void ComputeNllGradOut(int64_t num_instances, K num_classes, K ignore_index, const K* target, - const T* dy, T* dx, const T* weight, const T* total_weight) { - FOR_RANGE(int64_t, i, 0, num_instances) { - K label = target[i]; - if (label == ignore_index) { continue; } - CHECK_GE(label, 0); - CHECK_LT(label, num_classes); - T cur_weight = weight == nullptr ? -1 : -weight[label]; - dx[i * num_classes + label] = dy[i] * cur_weight; } + + if (!split_class_dim) { return nullptr; } + + TensorSliceView view = + GetTensorSliceView4ParallelId(hierarchy, nd_sbp, shape, ctx->parallel_ctx().parallel_id()); + return std::make_shared(view.At(class_axis).begin(), view.At(class_axis).size()); } -template -class NllKernel final : public user_op::OpKernel { + +} // namespace + +template +class NLLKernel final : public user_op::OpKernel { public: - NllKernel() = default; - ~NllKernel() = default; + NLLKernel() = default; + ~NLLKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateNLLKernelCache(ctx); + } private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - auto* total_weight_blob = ctx->Tensor4ArgNameAndIndex("total_weight", 0); - - const int64_t num_instances = target_blob->shape().elem_cnt(); - CHECK_EQ(input_blob->shape().elem_cnt() % num_instances, 0); - const K num_classes = static_cast(input_blob->shape().elem_cnt() / num_instances); + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* output = ctx->Tensor4ArgNameAndIndex("output", 0); + auto* out_weight = ctx->Tensor4ArgNameAndIndex("out_weight", 0); + + const int64_t N = target->shape_view().elem_cnt(); + const int64_t C = input->shape_view().At(input->shape_view().NumAxes() - 1); + CHECK_LE(N, std::numeric_limits::max()) + << "Expected batch size not exceed int32 numeric limits"; + + K class_start = 0; + if (cache) { + const auto* spec_cache = dynamic_cast(cache); + CHECK_NOTNULL(spec_cache); + CHECK_EQ(spec_cache->num_classes(), C) << ctx->op_name() << ": expected num_classes " << C + << ", got " << spec_cache->num_classes(); + class_start = spec_cache->class_start(); + } + const K ignore_index = static_cast(ctx->Attr("ignore_index")); - const T* input = input_blob->dptr(); - const K* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - T* total_weight = total_weight_blob->mut_dptr(); - const T* weight = - ctx->has_input("weight", 0) ? 
ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr() : nullptr; + const T* weight_dptr = nullptr; + if (ctx->has_input("weight", 0)) { + weight_dptr = CHECK_NOTNULL(ctx->Tensor4ArgNameAndIndex("weight", 0))->dptr(); + } - ComputeNllOut(num_instances, num_classes, ignore_index, input, target, out, weight, - total_weight); + NLLKernelUtil::Forward(ctx->stream(), static_cast(N), + static_cast(C), class_start, ignore_index, + input->dptr(), target->dptr(), weight_dptr, + output->mut_dptr(), out_weight->mut_dptr()); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -template -class NllGradKernel final : public user_op::OpKernel { +template +class NLLGradKernel final : public user_op::OpKernel { public: - NllGradKernel() = default; - ~NllGradKernel() = default; + NLLGradKernel() = default; + ~NLLGradKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateNLLKernelCache(ctx); + } private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - auto* total_weight_blob = ctx->Tensor4ArgNameAndIndex("total_weight", 0); - - const int64_t num_instances = target_blob->shape().elem_cnt(); - const int64_t input_elem_cnt = input_blob->shape().elem_cnt(); - CHECK_EQ(input_elem_cnt % num_instances, 0); - const K num_classes = static_cast(input_elem_cnt / num_instances); + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + const auto* target = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* out_grad = ctx->Tensor4ArgNameAndIndex("out_grad", 0); + auto* in_grad = ctx->Tensor4ArgNameAndIndex("in_grad", 0); + + const int64_t N = target->shape_view().elem_cnt(); + const int64_t C = in_grad->shape_view().At(in_grad->shape_view().NumAxes() - 1); + CHECK_LE(N, std::numeric_limits::max()) + << "Expected batch size not exceed int32 numeric limits"; + + K class_start = 0; + if (cache) { + const auto* spec_cache = dynamic_cast(cache); + CHECK_NOTNULL(spec_cache); + CHECK_EQ(spec_cache->num_classes(), C) << ctx->op_name() << ": expected num_classes " << C + << ", got " << spec_cache->num_classes(); + class_start = spec_cache->class_start(); + } + const K ignore_index = static_cast(ctx->Attr("ignore_index")); - const T* dy = dy_blob->dptr(); - const K* target = target_blob->dptr(); - const T* total_weight = total_weight_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - const T* weight = - ctx->has_input("weight", 0) ? 
ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr() : nullptr; - Memset(ctx->stream(), dx, 0, GetCudaAlignedSize(input_elem_cnt * sizeof(T))); - ComputeNllGradOut(num_instances, num_classes, ignore_index, target, dy, dx, weight, - total_weight); + const T* weight_dptr = nullptr; + if (ctx->has_input("weight", 0)) { + weight_dptr = CHECK_NOTNULL(ctx->Tensor4ArgNameAndIndex("weight", 0))->dptr(); + } + + NLLKernelUtil::Backward( + ctx->stream(), static_cast(N), static_cast(C), class_start, ignore_index, + out_grad->dptr(), target->dptr(), weight_dptr, in_grad->mut_dptr()); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -} // namespace -#define REGISTER_NLL_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("nll") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \ - && (user_op::HobDataType("target", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair))); - -#define REGISTER_NLL_GRAD_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("nll_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \ - && (user_op::HobDataType("target", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("dy", 0) == OF_PP_PAIR_SECOND(dtype_pair)) \ - && (user_op::HobDataType("dx", 0) == OF_PP_PAIR_SECOND(dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_NLL_KERNEL, FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_NLL_GRAD_KERNEL, FLOATING_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) -} // namespace user_op +#define REGISTER_NLL_KERNELS(device, dtype, ltype) \ + REGISTER_USER_KERNEL("nll").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("nll_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out_grad", 0) == GetDataType::value)) + +REGISTER_NLL_KERNELS(DeviceType::kCPU, float, int32_t); +REGISTER_NLL_KERNELS(DeviceType::kCPU, float, int64_t); +REGISTER_NLL_KERNELS(DeviceType::kCPU, double, int32_t); +REGISTER_NLL_KERNELS(DeviceType::kCPU, double, int64_t); + +#ifdef WITH_CUDA + +REGISTER_NLL_KERNELS(DeviceType::kCUDA, float, int32_t); +REGISTER_NLL_KERNELS(DeviceType::kCUDA, float, int64_t); +REGISTER_NLL_KERNELS(DeviceType::kCUDA, double, int32_t); +REGISTER_NLL_KERNELS(DeviceType::kCUDA, double, int64_t); +REGISTER_NLL_KERNELS(DeviceType::kCUDA, half, int32_t); +REGISTER_NLL_KERNELS(DeviceType::kCUDA, half, int64_t); + +#endif // WITH_CUDA + } // namespace oneflow diff --git a/oneflow/user/kernels/nll_kernel.cu b/oneflow/user/kernels/nll_kernel.cu deleted file mode 100644 index 9e78cf52257..00000000000 --- a/oneflow/user/kernels/nll_kernel.cu +++ /dev/null @@ -1,207 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include -#include "oneflow/core/cuda/atomic.cuh" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/cuda/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -#define RETURN_VOID_IF_NOT_HALF typename std::enable_if_t::value, void> -#define RETURN_VOID_IF_HALF typename std::enable_if_t::value, void> - -template -__global__ RETURN_VOID_IF_NOT_HALF ComputeNllOutNone(const int64_t num_instances, - const K num_classes, const K ignore_index, - const T* input, const K* target, T* out, - const T* weight, T* total_weight) { - const T zero_val = GetZeroVal(); - const T one_val = GetOneVal(); - CUDA_1D_KERNEL_LOOP(i, num_instances) { - K label = target[i]; - if (label == ignore_index) { - out[i] = zero_val; - continue; - } - assert(label >= 0); - assert(label < num_classes); - const T cur_weight = weight == nullptr ? one_val : weight[label]; - cuda::atomic::Add(total_weight, cur_weight); - out[i] = -input[i * num_classes + label] * cur_weight; - } -} - -template -__global__ RETURN_VOID_IF_HALF ComputeNllOutNone(const int64_t num_instances, const K num_classes, - const K ignore_index, const T* input, - const K* target, T* out, const T* weight, - T* total_weight) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) - const T zero_val = __float2half(0.0); - const T one_val = __float2half(1.0); - CUDA_1D_KERNEL_LOOP(i, num_instances) { - K label = target[i]; - if (label == ignore_index) { - out[i] = zero_val; - continue; - } - assert(label >= 0); - assert(label < num_classes); - const half cur_weight = weight == nullptr ? one_val : weight[label]; - cuda::atomic::Add(total_weight, cur_weight); - out[i] = __float2half(-__half2float(input[i * num_classes + label] * cur_weight)); - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ RETURN_VOID_IF_NOT_HALF ComputeNllGradOut(const int64_t num_instances, - const K num_classes, const K ignore_index, - const K* target, const T* dy, T* dx, - const T* weight, const T* total_weight) { - CUDA_1D_KERNEL_LOOP(i, num_instances) { - K label = target[i]; - if (label == ignore_index) { continue; } - assert(label >= 0); - assert(label < num_classes); - const T cur_weight = weight == nullptr ? -GetOneVal() : -weight[label]; - dx[i * num_classes + label] = dy[i] * cur_weight; - } -} - -template -__global__ RETURN_VOID_IF_HALF ComputeNllGradOut(const int64_t num_instances, const K num_classes, - const K ignore_index, const K* target, const T* dy, - T* dx, const T* weight, const T* total_weight) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) - CUDA_1D_KERNEL_LOOP(i, num_instances) { - K label = target[i]; - if (label == ignore_index) { continue; } - assert(label >= 0); - assert(label < num_classes); - const half cur_weight = weight == nullptr ? 
__float2half(-1.0) : __hneg(weight[label]); - dx[i * num_classes + label] = __hmul(dy[i], cur_weight); - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -class NllKernel final : public user_op::OpKernel { - public: - NllKernel() = default; - ~NllKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - auto* total_weight_blob = ctx->Tensor4ArgNameAndIndex("total_weight", 0); - - const int64_t num_instances = target_blob->shape().elem_cnt(); - CHECK_EQ(input_blob->shape().elem_cnt() % num_instances, 0); - const K num_classes = static_cast(input_blob->shape().elem_cnt() / num_instances); - const K ignore_index = static_cast(ctx->Attr("ignore_index")); - - const T* input = input_blob->dptr(); - const K* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - T* total_weight = total_weight_blob->mut_dptr(); - const T* weight = - ctx->has_input("weight", 0) ? ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr() : nullptr; - Memset(ctx->stream(), total_weight, 0, sizeof(T)); - - ComputeNllOutNone<<stream()->As()->cuda_stream()>>>( - num_instances, num_classes, ignore_index, input, target, out, weight, total_weight); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class NllGradKernel final : public user_op::OpKernel { - public: - NllGradKernel() = default; - ~NllGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - auto* total_weight_blob = ctx->Tensor4ArgNameAndIndex("total_weight", 0); - - const int64_t num_instances = target_blob->shape().elem_cnt(); - const int64_t input_elem_cnt = input_blob->shape().elem_cnt(); - CHECK_EQ(input_elem_cnt % num_instances, 0); - const K num_classes = static_cast(input_elem_cnt / num_instances); - const K ignore_index = static_cast(ctx->Attr("ignore_index")); - - const T* dy = dy_blob->dptr(); - const K* target = target_blob->dptr(); - const T* total_weight = total_weight_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - const T* weight = - ctx->has_input("weight", 0) ? 
ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr() : nullptr; - - Memset(ctx->stream(), dx, 0, input_elem_cnt * sizeof(T)); - - ComputeNllGradOut<<stream()->As()->cuda_stream()>>>( - num_instances, num_classes, ignore_index, target, dy, dx, weight, total_weight); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace -#define REGISTER_NLL_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("nll") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("target", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair))); - -#define REGISTER_NLL_GRAD_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("nll_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("target", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("dy", 0) == OF_PP_PAIR_SECOND(dtype_pair)) \ - && (user_op::HobDataType("dx", 0) == OF_PP_PAIR_SECOND(dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_NLL_KERNEL, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_NLL_GRAD_KERNEL, - FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -} // namespace user_op -} // namespace oneflow diff --git a/oneflow/user/kernels/nll_kernel_util.cpp b/oneflow/user/kernels/nll_kernel_util.cpp new file mode 100644 index 00000000000..bbaf4265975 --- /dev/null +++ b/oneflow/user/kernels/nll_kernel_util.cpp @@ -0,0 +1,63 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/nll_kernel_util.h" + +namespace oneflow { + +template +struct NLLKernelUtil { + static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* input, const K* target, + const T* weight, T* out, T* out_weight) { + FOR_RANGE(int32_t, i, 0, num_samples) { + K label = target[i]; + T w = T{0}; + T y = T{0}; + if (label != ignore_index) { + label -= class_start; + if (label >= 0 && label < num_classes) { + w = weight ? weight[label] : T{1}; + y = -(input[i * num_classes + label] * w); + } + } + out[i] = y; + out_weight[i] = w; + } + } + + static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* out_grad, + const K* target, const T* weight, T* in_grad) { + Memset(stream, in_grad, 0, + RoundUp(num_samples * num_classes * sizeof(T), kBlobBodyAlignSize)); + FOR_RANGE(int32_t, i, 0, num_samples) { + K label = target[i]; + if (label == ignore_index) { continue; } + label -= class_start; + if (label >= 0 && label < num_classes) { + const T w = weight ? 
-weight[label] : T(-1); + in_grad[i * num_classes + label] = out_grad[i] * w; + } + } + } +}; + +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; + +} // namespace oneflow diff --git a/oneflow/user/kernels/nll_kernel_util.cu b/oneflow/user/kernels/nll_kernel_util.cu new file mode 100644 index 00000000000..5e01b7697d1 --- /dev/null +++ b/oneflow/user/kernels/nll_kernel_util.cu @@ -0,0 +1,92 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/nll_kernel_util.h" +#include "oneflow/core/cuda/atomic.cuh" + +namespace oneflow { + +namespace { + +template +__global__ void NLLForward(const int32_t num_samples, const K num_classes, const K class_start, + const K ignore_index, const T* input, const K* target, const T* weight, + T* out, T* out_weight) { + const T zero = GetZeroVal(); + const T one = GetOneVal(); + CUDA_1D_KERNEL_LOOP(i, num_samples) { + K label = target[i]; + T w = zero; + T y = zero; + if (label != ignore_index) { + label -= class_start; + if (label >= 0 && label < num_classes) { + w = weight ? weight[label] : one; + y = -(input[i * num_classes + label] * w); + } + } + out[i] = y; + out_weight[i] = w; + } +} + +template +__global__ void NLLBackward(const int32_t num_samples, const K num_classes, const K class_start, + const K ignore_index, const T* out_grad, const K* target, + const T* weight, T* in_grad) { + const T one = GetOneVal(); + const T zero = GetZeroVal(); + CUDA_1D_KERNEL_LOOP_T(K, i, num_samples * num_classes) { + const K n = i / num_classes; + const K idx = i - n * num_classes; + const K label = target[n]; + if (label != ignore_index && idx == label - class_start) { + in_grad[i] = out_grad[n] * (weight ? 
-weight[idx] : -one); + } else { + in_grad[i] = zero; + } + } +} + +} // namespace + +template +struct NLLKernelUtil { + static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* input, const K* target, + const T* weight, T* out, T* out_weight) { + NLLForward<<As()->cuda_stream()>>>(num_samples, num_classes, + class_start, ignore_index, input, + target, weight, out, out_weight); + } + + static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* out_grad, + const K* target, const T* weight, T* in_grad) { + NLLBackward<<As()->cuda_stream()>>>( + num_samples, num_classes, class_start, ignore_index, out_grad, target, weight, in_grad); + } +}; + +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; + +} // namespace oneflow diff --git a/oneflow/user/kernels/nll_kernel_util.h b/oneflow/user/kernels/nll_kernel_util.h new file mode 100644 index 00000000000..25953d9b64f --- /dev/null +++ b/oneflow/user/kernels/nll_kernel_util.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_USER_KERNELS_NLL_KERNEL_UTIL_H_ +#define ONEFLOW_USER_KERNELS_NLL_KERNEL_UTIL_H_ + +#include "oneflow/core/kernel/kernel_util.h" + +namespace oneflow { + +template +struct NLLKernelUtil { + static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* input, const K* target, + const T* weight, T* out, T* out_weight); + + static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* out_grad, + const K* target, const T* weight, T* in_grad); +}; + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_NLL_KERNEL_UTIL_H_ diff --git a/oneflow/user/kernels/nms_kernel.cu b/oneflow/user/kernels/nms_kernel.cu index 5b92dedcdc2..8a1f1785e0e 100644 --- a/oneflow/user/kernels/nms_kernel.cu +++ b/oneflow/user/kernels/nms_kernel.cu @@ -105,7 +105,7 @@ class NmsGpuKernel final : public user_op::OpKernel { int8_t* keep = keep_blob->mut_dptr(); int64_t* suppression_mask = tmp_blob->mut_dptr(); - const int num_boxes = boxes_blob->shape().At(0); + const int num_boxes = boxes_blob->shape_view().At(0); int num_keep = ctx->Attr("keep_n"); if (num_keep <= 0 || num_keep > num_boxes) { num_keep = num_boxes; } const int num_blocks = CeilDiv(num_boxes, kBlockSize); diff --git a/oneflow/user/kernels/normalization_kernel.cpp b/oneflow/user/kernels/normalization_kernel.cpp index 09bdcf3c46b..3e30aace6dc 100644 --- a/oneflow/user/kernels/normalization_kernel.cpp +++ b/oneflow/user/kernels/normalization_kernel.cpp @@ -289,10 +289,10 @@ class NormalizationInferenceCpuKernel final : public user_op::OpKernel { const auto epsilon = ctx->Attr("epsilon"); const DataType data_type = x->data_type(); - CHECK_EQ(x->shape(), y->shape()); + CHECK_EQ(x->shape_view(), y->shape_view()); CHECK_EQ(y->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); if (axis == 1) { // NOTE(Liang Depeng): NCHW format const T* input_ptr = x->dptr(); @@ -303,9 +303,9 @@ class NormalizationInferenceCpuKernel final : public user_op::OpKernel { T* moving_mean_ptr = moving_mean->mut_dptr(); T* moving_variance_ptr = moving_variance->mut_dptr(); - const int64_t batch_size = x->shape().At(0); - const int64_t channel_size = x->shape().At(axis); - const int64_t spatial_size = x->shape().Count(axis + 1); + const int64_t batch_size = x->shape_view().At(0); + const int64_t channel_size = x->shape_view().At(axis); + const int64_t spatial_size = x->shape_view().Count(axis + 1); // NOTE(Liang Depeng): // compute the normalization result @@ -315,8 +315,8 @@ class NormalizationInferenceCpuKernel final : public user_op::OpKernel { if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape(), y->shape()); - AddToOutput(add_to_output->dptr(), output_ptr, x->shape().elem_cnt()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); + AddToOutput(add_to_output->dptr(), output_ptr, x->shape_view().elem_cnt()); } } else { // TODO(Liang Depeng): NHWC format @@ -365,10 +365,10 @@ class NormalizationTrainCpuKernel final : public user_op::OpKernel { const auto momentum = ctx->Attr("momentum"); const DataType data_type = x->data_type(); - CHECK_EQ(x->shape(), y->shape()); + CHECK_EQ(x->shape_view(), y->shape_view()); CHECK_EQ(y->data_type(), data_type); 
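// A minimal standalone sketch of the per-channel NCHW normalization these CPU
// batch-norm kernels compute, assuming mean and inv_std have already been
// derived from either the batch statistics (training) or the moving statistics
// (inference). The helper name and signature are hypothetical; it only shows
// how batch_size / channel_size / spatial_size index the flat buffer.
#include <cstdint>

template<typename T>
void NormalizeNchw(const T* x, const T* mean, const T* inv_std, const T* gamma,
                   const T* beta, T* y, int64_t batch_size, int64_t channel_size,
                   int64_t spatial_size) {
  for (int64_t n = 0; n < batch_size; ++n) {
    for (int64_t c = 0; c < channel_size; ++c) {
      const int64_t offset = (n * channel_size + c) * spatial_size;
      for (int64_t s = 0; s < spatial_size; ++s) {
        // y = gamma * (x - mean) * inv_std + beta, with inv_std = 1 / sqrt(var + eps)
        y[offset + s] = gamma[c] * (x[offset + s] - mean[c]) * inv_std[c] + beta[c];
      }
    }
  }
}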
CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); @@ -399,9 +399,9 @@ class NormalizationTrainCpuKernel final : public user_op::OpKernel { moving_variance_ptr = moving_variance->mut_dptr(); } - const int64_t batch_size = x->shape().At(0); - const int64_t channel_size = x->shape().At(axis); - const int64_t spatial_size = x->shape().Count(axis + 1); + const int64_t batch_size = x->shape_view().At(0); + const int64_t channel_size = x->shape_view().At(axis); + const int64_t spatial_size = x->shape_view().Count(axis + 1); // NOTE(Liang Depeng): // Compute mean & inv_variance and update moving_mean & moving_variance for each channel. @@ -416,8 +416,8 @@ class NormalizationTrainCpuKernel final : public user_op::OpKernel { if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape(), y->shape()); - AddToOutput(add_to_output->dptr(), output_ptr, x->shape().elem_cnt()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); + AddToOutput(add_to_output->dptr(), output_ptr, x->shape_view().elem_cnt()); } if (ctx->op_type_name() == "normalization_add_relu") { @@ -426,9 +426,10 @@ class NormalizationTrainCpuKernel final : public user_op::OpKernel { if (ctx->has_input("addend", 0)) { const auto* addend = ctx->Tensor4ArgNameAndIndex("addend", 0); - AddRelu(addend->dptr(), mask->mut_dptr(), output_ptr, x->shape().elem_cnt()); + AddRelu(addend->dptr(), mask->mut_dptr(), output_ptr, + x->shape_view().elem_cnt()); } else { - Relu(mask->mut_dptr(), output_ptr, x->shape().elem_cnt()); + Relu(mask->mut_dptr(), output_ptr, x->shape_view().elem_cnt()); } } } else { // TODO(Liang Depeng): NHWC format @@ -490,12 +491,12 @@ class NormalizationGradCpuKernel final : public user_op::OpKernel { const auto axis = ctx->Attr("axis"); const DataType data_type = x->data_type(); - CHECK_EQ(dy->shape(), x->shape()); + CHECK_EQ(dy->shape_view(), x->shape_view()); CHECK_EQ(dy->data_type(), data_type); - CHECK_EQ(dx->shape(), x->shape()); + CHECK_EQ(dx->shape_view(), x->shape_view()); CHECK_EQ(dx->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); const T* dy_ptr = nullptr; if (ctx->op_type_name() == "normalization_grad") { @@ -505,11 +506,11 @@ class NormalizationGradCpuKernel final : public user_op::OpKernel { if (ctx->has_output("addend_diff", 0)) { user_op::Tensor* addend_diff = ctx->Tensor4ArgNameAndIndex("addend_diff", 0); AddReluGrad(dy->dptr(), mask->dptr(), addend_diff->mut_dptr(), - dy->shape().elem_cnt()); + dy->shape_view().elem_cnt()); dy_ptr = addend_diff->dptr(); } else { ReluGrad(dy->dptr(), mask->dptr(), tmp_buffer->mut_dptr(), - dy->shape().elem_cnt()); + dy->shape_view().elem_cnt()); dy_ptr = tmp_buffer->dptr(); } @@ -527,9 +528,9 @@ class NormalizationGradCpuKernel final : public user_op::OpKernel { T* gamma_diff_ptr = gamma_diff->mut_dptr(); T* beta_diff_ptr = beta_diff->mut_dptr(); - const int64_t batch_size = x->shape().At(0); - const int64_t channel_size = x->shape().At(axis); - const int64_t spatial_size = x->shape().Count(axis + 1); + const int64_t batch_size = x->shape_view().At(0); + const int64_t channel_size = x->shape_view().At(axis); + const int64_t spatial_size = x->shape_view().Count(axis + 
1); const int64_t jump_step = spatial_size * channel_size; const int64_t reduce_count = batch_size * spatial_size; diff --git a/oneflow/user/kernels/normalization_kernel.cu b/oneflow/user/kernels/normalization_kernel.cu index 54eae9b4382..8589ca56239 100644 --- a/oneflow/user/kernels/normalization_kernel.cu +++ b/oneflow/user/kernels/normalization_kernel.cu @@ -112,8 +112,8 @@ class CudnnTensorDescHelper final { void CheckParamTensor(const user_op::Tensor* tensor) const { CHECK_NOTNULL(tensor); - CHECK_EQ(tensor->shape().NumAxes(), 1); - CHECK_EQ(tensor->shape().At(0), param_size_); + CHECK_EQ(tensor->shape_view().NumAxes(), 1); + CHECK_EQ(tensor->shape_view().At(0), param_size_); CHECK_EQ(GetCudnnDataType(tensor->data_type()), param_data_type_); } @@ -196,12 +196,13 @@ class NormalizationInferenceKernel final : public user_op::OpKernel, const auto epsilon = ctx->Attr("epsilon"); const DataType data_type = x->data_type(); - CHECK_EQ(x->shape(), y->shape()); + CHECK_EQ(x->shape_view(), y->shape_view()); CHECK_EQ(y->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape(), data_type, axis, CUDNN_BATCHNORM_SPATIAL); + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + CUDNN_BATCHNORM_SPATIAL); desc_helper.CheckParamTensor(gamma); desc_helper.CheckParamTensor(beta); desc_helper.CheckParamTensor(moving_mean); @@ -212,10 +213,10 @@ class NormalizationInferenceKernel final : public user_op::OpKernel, if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape(), y->shape()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); Memcpy( ctx->stream(), y->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); sp_beta = CudnnSPOnePtr(); } else { sp_beta = CudnnSPZeroPtr(); @@ -369,11 +370,11 @@ class NormalizationTrainKernel final : public user_op::OpKernel, public user_op: const auto momentum = ctx->Attr("momentum"); const DataType data_type = x->data_type(); - CHECK_EQ(x->shape(), y->shape()); + CHECK_EQ(x->shape_view(), y->shape_view()); CHECK_EQ(y->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape(), data_type, axis, + CHECK_LT(axis, x->shape_view().NumAxes()); + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, CUDNN_BATCHNORM_SPATIAL_PERSISTENT); const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); @@ -400,10 +401,10 @@ class NormalizationTrainKernel final : public user_op::OpKernel, public user_op: if (ctx->has_input("_add_to_output", 0)) { const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape(), y->shape()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); Memcpy( ctx->stream(), y->mut_dptr(), add_to_output->dptr(), - add_to_output->shape().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); sp_beta = CudnnSPOnePtr(); } else { sp_beta = CudnnSPZeroPtr(); @@ -420,15 +421,15 @@ class 
NormalizationTrainKernel final : public user_op::OpKernel, public user_op: ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, CUDNN_BATCHNORM_OPS_BN, nullptr, desc_helper.xy_desc(), &reserve_space_size)); auto* workspace = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - if (reserve_space_size == 0 && workspace_size <= workspace->shape().elem_cnt()) { + if (reserve_space_size == 0 && workspace_size <= workspace->shape_view().elem_cnt()) { OF_CUDNN_CHECK(cudnnBatchNormalizationForwardTrainingEx( ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, CUDNN_BATCHNORM_OPS_BN, sp_alpha, sp_beta, desc_helper.xy_desc(), x->dptr(), nullptr, nullptr, desc_helper.xy_desc(), y->mut_dptr(), desc_helper.param_desc(), gamma->dptr(), beta->dptr(), 1.0 - momentum, moving_mean ? moving_mean->mut_dptr() : NULL, moving_variance ? moving_variance->mut_dptr() : NULL, epsilon, mean->mut_dptr(), - inv_variance->mut_dptr(), nullptr, workspace->mut_dptr(), workspace->shape().elem_cnt(), - nullptr, 0)); + inv_variance->mut_dptr(), nullptr, workspace->mut_dptr(), + workspace->shape_view().elem_cnt(), nullptr, 0)); } else { OF_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining( ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, @@ -450,7 +451,7 @@ class NormalizationTrainKernel final : public user_op::OpKernel, public user_op: if (ctx->op_type_name() == "normalization_add_relu") { CHECK(!ctx->has_input("_add_to_output", 0)); - const int64_t elem_cnt = x->shape().elem_cnt(); + const int64_t elem_cnt = x->shape_view().elem_cnt(); auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); if (ctx->has_input("addend", 0)) { const auto* addend = ctx->Tensor4ArgNameAndIndex("addend", 0); @@ -518,14 +519,14 @@ class NormalizationGradUserKernel final : public user_op::OpKernel, const auto epsilon = ctx->Attr("epsilon"); const DataType data_type = x->data_type(); - CHECK_EQ(dy->shape(), x->shape()); + CHECK_EQ(dy->shape_view(), x->shape_view()); CHECK_EQ(dy->data_type(), data_type); - CHECK_EQ(dx->shape(), x->shape()); + CHECK_EQ(dx->shape_view(), x->shape_view()); CHECK_EQ(dx->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape(), data_type, axis, + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, CUDNN_BATCHNORM_SPATIAL_PERSISTENT); desc_helper.CheckParamTensor(gamma); desc_helper.CheckParamTensor(gamma_diff); @@ -539,10 +540,10 @@ class NormalizationGradUserKernel final : public user_op::OpKernel, if (ctx->op_type_name() == "normalization_grad") { bn_workspace_ptr = tmp_buffer->mut_dptr(); - bn_workspace_size = tmp_buffer->shape().elem_cnt(); + bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); bn_dy_ptr = dy->dptr(); } else if (ctx->op_type_name() == "normalization_add_relu_grad") { - const int64_t elem_cnt = dy->shape().elem_cnt(); + const int64_t elem_cnt = dy->shape_view().elem_cnt(); const auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); if (ctx->has_output("addend_diff", 0)) { @@ -550,12 +551,12 @@ class NormalizationGradUserKernel final : public user_op::OpKernel, ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), addend_diff->mut_dptr()); bn_workspace_ptr = tmp_buffer->mut_dptr(); - bn_workspace_size = tmp_buffer->shape().elem_cnt(); + bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); bn_dy_ptr = 
addend_diff->dptr(); } else { - const size_t tmp_buffer_size = tmp_buffer->shape().elem_cnt(); + const size_t tmp_buffer_size = tmp_buffer->shape_view().elem_cnt(); const size_t relu_dx_size = - GetCudaAlignedSize(dy->shape().elem_cnt() * GetSizeOfDataType(dy->data_type())); + GetCudaAlignedSize(dy->shape_view().elem_cnt() * GetSizeOfDataType(dy->data_type())); CHECK_GE(tmp_buffer_size, relu_dx_size); ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), reinterpret_cast(tmp_buffer->mut_dptr())); @@ -703,12 +704,12 @@ class FusedNormalizationAddReluKernel final : public user_op::OpKernel, auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const DataType data_type = x->data_type(); - CHECK_EQ(x->shape(), y->shape()); + CHECK_EQ(x->shape_view(), y->shape_view()); CHECK_EQ(y->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape(), data_type, axis, + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, CUDNN_BATCHNORM_SPATIAL_PERSISTENT); desc_helper.CheckParamTensor(gamma); desc_helper.CheckParamTensor(beta); @@ -736,13 +737,13 @@ class FusedNormalizationAddReluKernel final : public user_op::OpKernel, ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, ops, desc_helper.xy_desc(), z_desc, desc_helper.xy_desc(), desc_helper.param_desc(), activation_desc.Get(), &min_workspace_size)); - const size_t workspace_size = tmp_buffer->shape().elem_cnt(); + const size_t workspace_size = tmp_buffer->shape_view().elem_cnt(); CHECK_GE(workspace_size, min_workspace_size); size_t min_reserve_space_size; OF_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, ops, activation_desc.Get(), desc_helper.xy_desc(), &min_reserve_space_size)); - const size_t reserve_space_size = reserve_space->shape().elem_cnt(); + const size_t reserve_space_size = reserve_space->shape_view().elem_cnt(); CHECK_GE(reserve_space_size, min_reserve_space_size); OF_CUDNN_CHECK(cudnnBatchNormalizationForwardTrainingEx( @@ -792,14 +793,14 @@ class FusedNormalizationAddReluGradUserKernel final : public user_op::OpKernel, const auto epsilon = ctx->Attr("epsilon"); const DataType data_type = x->data_type(); - CHECK_EQ(dy->shape(), x->shape()); + CHECK_EQ(dy->shape_view(), x->shape_view()); CHECK_EQ(dy->data_type(), data_type); - CHECK_EQ(dx->shape(), x->shape()); + CHECK_EQ(dx->shape_view(), x->shape_view()); CHECK_EQ(dx->data_type(), data_type); CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape().NumAxes()); + CHECK_LT(axis, x->shape_view().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape(), data_type, axis, + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, CUDNN_BATCHNORM_SPATIAL_PERSISTENT); desc_helper.CheckParamTensor(gamma); desc_helper.CheckParamTensor(beta); @@ -828,13 +829,13 @@ class FusedNormalizationAddReluGradUserKernel final : public user_op::OpKernel, ops, desc_helper.xy_desc(), desc_helper.xy_desc(), desc_helper.xy_desc(), dz_desc, desc_helper.xy_desc(), desc_helper.param_desc(), activation_desc.Get(), &min_workspace_size)); - const size_t workspace_size = tmp_buffer->shape().elem_cnt(); + const size_t workspace_size = tmp_buffer->shape_view().elem_cnt(); CHECK_GE(workspace_size, min_workspace_size); size_t min_reserve_space_size; OF_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( 
ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, ops, activation_desc.Get(), desc_helper.xy_desc(), &min_reserve_space_size)); - const size_t reserve_space_size = reserve_space->shape().elem_cnt(); + const size_t reserve_space_size = reserve_space->shape_view().elem_cnt(); CHECK_GE(reserve_space_size, min_reserve_space_size); OF_CUDNN_CHECK(cudnnBatchNormalizationBackwardEx( ctx->stream()->As()->cudnn_handle(), CUDNN_BATCHNORM_SPATIAL_PERSISTENT, diff --git a/oneflow/user/kernels/nvtx_range_kernel.cu b/oneflow/user/kernels/nvtx_range_kernel.cu index 95bcdced4a2..9efe3f52a92 100644 --- a/oneflow/user/kernels/nvtx_range_kernel.cu +++ b/oneflow/user/kernels/nvtx_range_kernel.cu @@ -62,8 +62,8 @@ class NvtxStartKernel final : public user_op::OpKernel { const user_op::OpKernelCache*) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); - CHECK_EQ(out->shape(), in_shape); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); const DataType in_data_type = in->data_type(); CHECK_EQ(out->data_type(), in_data_type); Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), @@ -105,8 +105,8 @@ class NvtxEndKernel final : public user_op::OpKernel { const user_op::OpKernelCache*) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); - CHECK_EQ(out->shape(), in_shape); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); const DataType in_data_type = in->data_type(); CHECK_EQ(out->data_type(), in_data_type); #ifdef OF_ENABLE_PROFILER diff --git a/oneflow/user/kernels/ofrecord_decoder_kernels.cpp b/oneflow/user/kernels/ofrecord_decoder_kernels.cpp index ab4a34ecd9e..d684a5b3911 100644 --- a/oneflow/user/kernels/ofrecord_decoder_kernels.cpp +++ b/oneflow/user/kernels/ofrecord_decoder_kernels.cpp @@ -88,8 +88,8 @@ class OFRecordRawDecoderKernel final : public user_op::OpKernel { user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); // TODO(chengcheng): remove record num in record blob, fix by shape elem cnt - int64_t record_num = in_blob->shape().At(0); - int64_t sample_elem_cnt = out_blob->shape().Count(1); + int64_t record_num = in_blob->shape_view().At(0); + int64_t sample_elem_cnt = out_blob->shape_view().Count(1); CHECK(record_num > 0); const OFRecord* records = in_blob->dptr(); T* out_dptr = out_blob->mut_dptr(); @@ -134,10 +134,10 @@ class OFRecordBytesDecoderKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(out->shape(), in->shape()); + CHECK_EQ(out->shape_view(), in->shape_view()); CHECK_EQ(in->data_type(), DataType::kOFRecord); CHECK_EQ(out->data_type(), DataType::kTensorBuffer); - const int64_t num_instances = in->shape().elem_cnt(); + const int64_t num_instances = in->shape_view().elem_cnt(); const auto* records = in->dptr(); auto* buffers = out->mut_dptr(); const std::string& name = ctx->Attr("name"); @@ -223,10 +223,10 @@ class OFRecordImageDecoderRandomCropKernel final : public user_op::OpKernel { auto* crop_window_generators = dynamic_cast(state); 
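// The hunks in this patch consistently switch Tensor::shape() to
// Tensor::shape_view(); the accessors used afterwards (NumAxes, At, Count,
// elem_cnt, ToDimVector) keep their meaning. A toy, non-owning view sketch
// (hypothetical class, not the OneFlow ShapeView) showing what each accessor
// computes:
#include <cstdint>
#include <vector>

class ToyShapeView {
 public:
  ToyShapeView(const int64_t* dims, int64_t num_axes) : dims_(dims), num_axes_(num_axes) {}
  int64_t NumAxes() const { return num_axes_; }
  int64_t At(int64_t i) const { return dims_[i]; }
  // Product of the dimensions in [begin, NumAxes); e.g. Count(1) is the
  // per-sample element count when axis 0 is the batch dimension.
  int64_t Count(int64_t begin) const {
    int64_t c = 1;
    for (int64_t i = begin; i < num_axes_; ++i) { c *= dims_[i]; }
    return c;
  }
  int64_t elem_cnt() const { return Count(0); }
  void ToDimVector(std::vector<int64_t>* out) const { out->assign(dims_, dims_ + num_axes_); }

 private:
  const int64_t* dims_;  // non-owning pointer: the view does not copy the dims
  int64_t num_axes_;
};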
CHECK_NOTNULL(crop_window_generators); user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t record_num = out_blob->shape().At(0); + int64_t record_num = out_blob->shape_view().At(0); CHECK(record_num > 0); user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - CHECK_EQ(out_blob->shape(), in_blob->shape()); + CHECK_EQ(out_blob->shape_view(), in_blob->shape_view()); const OFRecord* records = in_blob->dptr(); TensorBuffer* buffers = out_blob->mut_dptr(); const std::string& name = ctx->Attr("name"); @@ -256,10 +256,10 @@ class OFRecordImageDecoderKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t record_num = out_blob->shape().At(0); + int64_t record_num = out_blob->shape_view().At(0); CHECK(record_num > 0); user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - CHECK_EQ(out_blob->shape(), in_blob->shape()); + CHECK_EQ(out_blob->shape_view(), in_blob->shape_view()); const OFRecord* records = in_blob->dptr(); TensorBuffer* buffers = out_blob->mut_dptr(); const std::string& name = ctx->Attr("name"); diff --git a/oneflow/user/kernels/one_embedding_kernels.cu b/oneflow/user/kernels/one_embedding_kernels.cu index b35d4173f92..231cc250e18 100644 --- a/oneflow/user/kernels/one_embedding_kernels.cu +++ b/oneflow/user/kernels/one_embedding_kernels.cu @@ -488,10 +488,10 @@ class EmbeddingPrefetchKernel final : public user_op::OpKernel { const int64_t line_size = ctx->Attr("line_size"); uint32_t num_unique; T* values_ptr = nullptr; - LookupAndInitMissing(ctx->stream(), embedding_state, unique_ids->shape().elem_cnt(), - embedding_size, line_size, num_unique_ids->dptr(), - unique_ids->dptr(), table_ids->dptr(), values_ptr, - tmp_buffer->mut_dptr(), &num_unique, true); + LookupAndInitMissing(ctx->stream(), embedding_state, + unique_ids->shape_view().elem_cnt(), embedding_size, line_size, + num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), + values_ptr, tmp_buffer->mut_dptr(), &num_unique, true); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -556,9 +556,9 @@ class EmbeddingLookupKernel final : public user_op::OpKernel { const int64_t line_size = ctx->Attr("line_size"); uint32_t num_unique; LookupAndInitMissing( - ctx->stream(), embedding_state, unique_ids->shape().elem_cnt(), embedding_size, line_size, - num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), unique_values->mut_dptr(), - tmp_buffer->mut_dptr(), &num_unique, false); + ctx->stream(), embedding_state, unique_ids->shape_view().elem_cnt(), embedding_size, + line_size, num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), + unique_values->mut_dptr(), tmp_buffer->mut_dptr(), &num_unique, false); if (ctx->has_output("embeddings", 0)) { user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); CopyValuesToEmbeddings(ctx->stream(), num_unique, embedding_size, line_size, diff --git a/oneflow/user/kernels/one_embedding_update_kernels.cu b/oneflow/user/kernels/one_embedding_update_kernels.cu index 91dc6acf1a5..fd5c0cddd66 100644 --- a/oneflow/user/kernels/one_embedding_update_kernels.cu +++ b/oneflow/user/kernels/one_embedding_update_kernels.cu @@ -219,10 +219,10 @@ class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); user_op::Tensor* updated_unique_embeddings = 
ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape().NumAxes(), 2); - const int64_t line_size = unique_embeddings->shape().At(1); - const int64_t embedding_size = embedding_grad->shape().At(1); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); CHECK_EQ(line_size, embedding_size); const auto scale = ctx->Attr("scale"); const float l1 = ctx->Attr("l1"); @@ -234,7 +234,7 @@ class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const T* down_scale_by_ptr = nullptr; @@ -242,19 +242,19 @@ class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* down_scale_by_tensor = ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); down_scale_by_ptr = down_scale_by_tensor->dptr(); } const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } // update kernel SGDUpdateKernel - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( embedding_size, scale, l1, l2, weight_decay, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), @@ -295,11 +295,11 @@ class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); user_op::Tensor* updated_unique_embeddings = ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape().At(0); - const int64_t line_size = unique_embeddings->shape().At(1); - const int64_t embedding_size = embedding_grad->shape().At(1); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); CHECK_EQ(line_size, embedding_size * 2); const float l1 = ctx->Attr("l1"); const float l2 = ctx->Attr("l2"); @@ -310,7 +310,7 @@ class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), 
unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const T* down_scale_by_ptr = nullptr; @@ -318,7 +318,7 @@ class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* down_scale_by_tensor = ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); down_scale_by_ptr = down_scale_by_tensor->dptr(); } const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); @@ -326,13 +326,13 @@ class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } // update kernel MomentumUpdateKernel - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( line_size, embedding_size, scale, l1, l2, weight_decay, beta, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), @@ -370,11 +370,11 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); user_op::Tensor* updated_unique_embeddings = ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape().At(0); - const int64_t line_size = unique_embeddings->shape().At(1); - const int64_t embedding_size = embedding_grad->shape().At(1); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); CHECK_EQ(line_size, embedding_size * 3); const float l1 = ctx->Attr("l1"); @@ -389,7 +389,7 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const T* down_scale_by_ptr = nullptr; @@ -397,7 +397,7 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* down_scale_by_tensor = ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); down_scale_by_ptr = down_scale_by_tensor->dptr(); } const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); @@ -405,7 
+405,7 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } const float* bias_correction1_ptr = nullptr; @@ -418,8 +418,8 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { } // update kernel AdamUpdateKernel - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, beta1, beta2, epsilon, bias_correction1_ptr, bias_correction2_ptr, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, @@ -457,11 +457,11 @@ class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); user_op::Tensor* updated_unique_embeddings = ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape().At(0); - const int64_t line_size = unique_embeddings->shape().At(1); - const int64_t embedding_size = embedding_grad->shape().At(1); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); CHECK_EQ(line_size, embedding_size * 2); const float l1 = ctx->Attr("l1"); @@ -474,7 +474,7 @@ class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } const T* down_scale_by_ptr = nullptr; @@ -482,7 +482,7 @@ class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* down_scale_by_tensor = ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); down_scale_by_ptr = down_scale_by_tensor->dptr(); } const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); @@ -491,13 +491,13 @@ class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } // update kernel AdagradUpdateKernel - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( line_size, embedding_size, static_cast(scale), 
l1, l2, weight_decay, lr_decay, epsilon, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, train_step_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), @@ -535,13 +535,13 @@ class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); user_op::Tensor* updated_unique_embeddings = ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape().NumAxes(), 2) + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2) << "The NumAxes of unique_embedding should be equal to 2. "; - CHECK_EQ(embedding_grad->shape().NumAxes(), 2) + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2) << "The NumAxes of embedding_grad should be equal to 2. "; - const int64_t num_keys = unique_embeddings->shape().At(0); - const int64_t line_size = unique_embeddings->shape().At(1); - const int64_t embedding_size = embedding_grad->shape().At(1); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); CHECK_EQ(line_size, embedding_size * 3) << "The line_size should be equal to 3 x embedding_size. "; const float l1 = 0.0; @@ -561,7 +561,7 @@ class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { const user_op::Tensor* down_scale_by_tensor = ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape().elem_cnt(), 1); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); down_scale_by_ptr = down_scale_by_tensor->dptr(); } const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); @@ -569,13 +569,13 @@ class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } // update kernel FtrlUpdateKernel - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_power, lambda1, lambda2, beta, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), diff --git a/oneflow/user/kernels/one_hot_kernel.cpp b/oneflow/user/kernels/one_hot_kernel.cpp index 6dca45985e9..e926bae72bf 100644 --- a/oneflow/user/kernels/one_hot_kernel.cpp +++ b/oneflow/user/kernels/one_hot_kernel.cpp @@ -29,7 +29,7 @@ class CpuOneHotKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_indices = indices->shape().elem_cnt(); + const int64_t num_indices = indices->shape_view().elem_cnt(); const int64_t depth = ctx->Attr("depth"); const DataType dtype = ctx->Attr("dtype"); const T on_value = IsFloatingDataType(dtype) @@ -44,7 +44,7 @@ class CpuOneHotKernel final : public user_op::OpKernel { ep::primitive::NewPrimitive(ctx->stream()->device_type(), 
out->data_type()); CHECK(fill); - fill->Launch(ctx->stream(), out->mut_dptr(), off_value, out->shape().elem_cnt()); + fill->Launch(ctx->stream(), out->mut_dptr(), off_value, out->shape_view().elem_cnt()); FOR_RANGE(int64_t, i, 0, num_indices) { const int64_t idx = indices_dptr[i]; CHECK_GE(idx, 0); diff --git a/oneflow/user/kernels/one_hot_kernel.cu b/oneflow/user/kernels/one_hot_kernel.cu index 19c34d043f2..f687d144d78 100644 --- a/oneflow/user/kernels/one_hot_kernel.cu +++ b/oneflow/user/kernels/one_hot_kernel.cu @@ -46,7 +46,7 @@ class GpuOneHotKernel final : public user_op::OpKernel, public user_op::CudaGrap void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_indices = indices->shape().elem_cnt(); + const int64_t num_indices = indices->shape_view().elem_cnt(); const int64_t depth = ctx->Attr("depth"); const DataType dtype = ctx->Attr("dtype"); const T on_value = IsFloatingDataType(dtype) diff --git a/oneflow/user/kernels/onerec_decoder_kernels.cpp b/oneflow/user/kernels/onerec_decoder_kernels.cpp index 2a4cfb7f095..6fd45618687 100644 --- a/oneflow/user/kernels/onerec_decoder_kernels.cpp +++ b/oneflow/user/kernels/onerec_decoder_kernels.cpp @@ -51,7 +51,7 @@ void GetTensorDimsWithoutReshape(const std::vectorresize(num_axes); for (int32_t d = 0; d < num_axes; ++d) { (*tensor_dims)[d].resize(tensors.size()); } for (int32_t j = 0; j < tensors.size(); ++j) { - const flatbuffers::Vector* shape_vec = tensors.at(j)->shape(); + const flatbuffers::Vector* shape_vec = tensors.at(j)->shape_view(); CHECK_NOTNULL(shape_vec); CHECK_EQ(shape_vec->size(), num_axes); for (int32_t d = 0; d < num_axes; ++d) { (*tensor_dims)[d][j] = shape_vec->Get(d); } @@ -79,7 +79,7 @@ void GetTensorDimsWithReshape(const std::vector& } } for (int32_t j = 0; j < tensors.size(); ++j) { - const flatbuffers::Vector* shape_vec = tensors.at(j)->shape(); + const flatbuffers::Vector* shape_vec = tensors.at(j)->shape_view(); CHECK_NOTNULL(shape_vec); int32_t elem_cnt = 1; for (int32_t d = 0; d < shape_vec->size(); ++d) { elem_cnt *= shape_vec->Get(d); } @@ -165,7 +165,7 @@ void DecodeField(const TensorBuffer* records, const int64_t record_num, const st const Shape& batch_padding, user_op::Tensor* out_blob) { const int32_t batch_size = record_num; char* out_ptr = out_blob->mut_dptr(); - const int64_t out_bytes = out_blob->shape().elem_cnt() * GetSizeOfDataType(data_type); + const int64_t out_bytes = out_blob->shape_view().elem_cnt() * GetSizeOfDataType(data_type); std::vector tensors; GetTensorsFromRecords(records, record_num, key, &tensors); std::vector> tensor_dims; @@ -212,15 +212,15 @@ void DecodeField(const TensorBuffer* records, const int64_t record_num, const st const Shape instance_shape = Shape(instance_dim_vec); if (is_dynamic) { CHECK_LE(instance_shape.elem_cnt(), static_shape.elem_cnt()); - out_blob->mut_shape().Set(0, record_num); + out_blob->mut_shape_view().Set(0, record_num); for (int64_t d = 0; d < instance_shape.NumAxes(); ++d) { - out_blob->mut_shape().Set(d + 1, instance_shape.At(d)); + out_blob->mut_shape_view().Set(d + 1, instance_shape.At(d)); } } else { CHECK(instance_shape == static_shape); - CHECK_EQ(out_blob->shape().At(0), record_num); + CHECK_EQ(out_blob->shape_view().At(0), record_num); for (int64_t d = 0; d < instance_shape.NumAxes(); ++d) { - CHECK_EQ(out_blob->shape().At(d + 1), instance_shape.At(d)); + 
CHECK_EQ(out_blob->shape_view().At(d + 1), instance_shape.At(d)); } } const int64_t buffer_size = GetBatchSizeInBytes(batch_size, instance_shape, data_type); @@ -244,7 +244,7 @@ class OneRecDecoderKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t record_num = in_blob->shape().At(0); + int64_t record_num = in_blob->shape_view().At(0); CHECK(record_num > 0); const TensorBuffer* records = in_blob->dptr(); diff --git a/oneflow/user/kernels/ones_like_kernel.cpp b/oneflow/user/kernels/ones_like_kernel.cpp index 65f9c59787a..5b9b83ed2b6 100644 --- a/oneflow/user/kernels/ones_like_kernel.cpp +++ b/oneflow/user/kernels/ones_like_kernel.cpp @@ -41,7 +41,7 @@ class OnesLikeKernel final : public user_op::OpKernel { ep::primitive::NewPrimitive(ctx->stream()->device_type(), out->data_type()); CHECK(fill); - fill->Launch(ctx->stream(), out->mut_dptr(), 1, out->shape().elem_cnt()); + fill->Launch(ctx->stream(), out->mut_dptr(), 1, out->shape_view().elem_cnt()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/p2p_comm_kernel.cpp b/oneflow/user/kernels/p2p_comm_kernel.cpp index af60085e482..0e21933147d 100644 --- a/oneflow/user/kernels/p2p_comm_kernel.cpp +++ b/oneflow/user/kernels/p2p_comm_kernel.cpp @@ -34,7 +34,7 @@ class SendKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); const auto& dst_process_id = ctx->Attr("dst_process_id"); - CHECK_JUST(ccl::Send(in->dptr(), in->shape().elem_cnt(), in->data_type(), + CHECK_JUST(ccl::Send(in->dptr(), in->shape_view().elem_cnt(), in->data_type(), dst_process_id, ctx->stream())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -50,8 +50,8 @@ class RecvKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); const auto& src_process_id = ctx->Attr("src_process_id"); - CHECK_JUST(ccl::Recv(out->mut_dptr(), out->shape().elem_cnt(), out->data_type(), - src_process_id, ctx->stream())); + CHECK_JUST(ccl::Recv(out->mut_dptr(), out->shape_view().elem_cnt(), + out->data_type(), src_process_id, ctx->stream())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/pack_kernel.cpp b/oneflow/user/kernels/pack_kernel.cpp index 72df505f6e2..ea342c27029 100644 --- a/oneflow/user/kernels/pack_kernel.cpp +++ b/oneflow/user/kernels/pack_kernel.cpp @@ -40,20 +40,20 @@ class PackKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); const auto pack_num = ctx->Attr("pack_num"); - if (in->shape().NumAxes() > 0) { - CHECK_EQ(in->shape().NumAxes(), out->shape().NumAxes()); - CHECK_EQ(out->shape().At(0), in->shape().At(0) * pack_num); - for (int64_t i = 1; i < in->shape().NumAxes(); ++i) { - CHECK_EQ(out->shape().At(i), in->shape().At(i)); + if (in->shape_view().NumAxes() > 0) { + CHECK_EQ(in->shape_view().NumAxes(), out->shape_view().NumAxes()); + CHECK_EQ(out->shape_view().At(0), in->shape_view().At(0) * pack_num); + for (int64_t i = 1; i < in->shape_view().NumAxes(); ++i) { + CHECK_EQ(out->shape_view().At(i), 
in->shape_view().At(i)); } } else { // NOTE(chengcheng): for Scalar input pack - CHECK_EQ(in->shape().NumAxes(), 0); - CHECK_EQ(out->shape().NumAxes(), 1); - CHECK_EQ(in->shape().elem_cnt(), 1); - CHECK_EQ(out->shape().elem_cnt(), pack_num); + CHECK_EQ(in->shape_view().NumAxes(), 0); + CHECK_EQ(out->shape_view().NumAxes(), 1); + CHECK_EQ(in->shape_view().elem_cnt(), 1); + CHECK_EQ(out->shape_view().elem_cnt(), pack_num); } - const int64_t copy_size = in->shape().elem_cnt() * GetSizeOfDataType(out->data_type()); + const int64_t copy_size = in->shape_view().elem_cnt() * GetSizeOfDataType(out->data_type()); auto* state_wrapper = dynamic_cast>*>(state); CHECK_NOTNULL(state_wrapper); const size_t index = state_wrapper->Get().first; diff --git a/oneflow/user/kernels/pad2d_kernels.cpp b/oneflow/user/kernels/pad2d_kernels.cpp index 3569efb3d5e..74a1ab27ca9 100644 --- a/oneflow/user/kernels/pad2d_kernels.cpp +++ b/oneflow/user/kernels/pad2d_kernels.cpp @@ -70,7 +70,7 @@ class ReflectionPad2dKernel final : public OpKernel { const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); const auto& padding = ctx->Attr>("padding"); - const int64_t ndims = x->shape().NumAxes(); + const int64_t ndims = x->shape_view().NumAxes(); CHECK_EQ(padding.size(), ndims); const int64_t n_idx = 0; const int64_t c_idx = 1; @@ -80,17 +80,17 @@ class ReflectionPad2dKernel final : public OpKernel { const int64_t pad_left = padding[0]; const int64_t pad_top = padding[2]; - const int64_t n_batch = y->shape().At(n_idx); - const int64_t n_channel = y->shape().At(c_idx); - const int64_t y_height = y->shape().At(h_idx); - const int64_t y_width = y->shape().At(w_idx); - const int64_t x_height = x->shape().At(h_idx); - const int64_t x_width = x->shape().At(w_idx); + const int64_t n_batch = y->shape_view().At(n_idx); + const int64_t n_channel = y->shape_view().At(c_idx); + const int64_t y_height = y->shape_view().At(h_idx); + const int64_t y_width = y->shape_view().At(w_idx); + const int64_t x_height = x->shape_view().At(h_idx); + const int64_t x_width = x->shape_view().At(w_idx); IN_T* dest = y->mut_dptr(); const IN_T* src = x->dptr(); DimVector y_vector; - y->shape().ToDimVector(&y_vector); + y->shape_view().ToDimVector(&y_vector); NdIndexOffsetHelper index_helper(y_vector.data()); ReflectionPad2dFunctor()(ctx->stream(), src, dest, index_helper, n_batch, @@ -111,7 +111,7 @@ class ReflectionPad2dGradKernel final : public OpKernel { const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const auto& padding = ctx->Attr>("padding"); - const int64_t ndims = dy->shape().NumAxes(); + const int64_t ndims = dy->shape_view().NumAxes(); CHECK_EQ(padding.size(), ndims); const int64_t n_idx = 0; @@ -121,20 +121,20 @@ class ReflectionPad2dGradKernel final : public OpKernel { int64_t pad_left = padding[0]; int64_t pad_top = padding[2]; - int64_t n_batch = dy->shape().At(n_idx); - int64_t n_channel = dy->shape().At(c_idx); - int64_t dy_height = dy->shape().At(h_idx); - int64_t dy_width = dy->shape().At(w_idx); - int64_t dx_height = dx->shape().At(h_idx); - int64_t dx_width = dx->shape().At(w_idx); + int64_t n_batch = dy->shape_view().At(n_idx); + int64_t n_channel = dy->shape_view().At(c_idx); + int64_t dy_height = dy->shape_view().At(h_idx); + int64_t dy_width = dy->shape_view().At(w_idx); + int64_t dx_height = dx->shape_view().At(h_idx); + int64_t dx_width = dx->shape_view().At(w_idx); const IN_T* src = dy->dptr(); IN_T* dest = dx->mut_dptr(); 
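// Sketch of the index mapping the 2D pad kernels above rely on: an output
// coordinate in the padded tensor is folded back into the valid input range,
// by mirroring across the edges for reflection padding (border element not
// repeated) or by clamping for replication padding. Helper names are
// hypothetical and assume pad sizes smaller than the input extent, so a single
// reflection suffices.
#include <cstdint>

inline int64_t ReflectIndex(int64_t out_idx, int64_t pad_before, int64_t in_size) {
  int64_t i = out_idx - pad_before;                 // position relative to the unpadded input
  if (i < 0) { i = -i; }                            // mirror across the low edge
  if (i >= in_size) { i = 2 * (in_size - 1) - i; }  // mirror across the high edge
  return i;
}

inline int64_t ClampIndex(int64_t out_idx, int64_t pad_before, int64_t in_size) {
  int64_t i = out_idx - pad_before;
  if (i < 0) { i = 0; }                             // repeat the first row/column
  if (i >= in_size) { i = in_size - 1; }            // repeat the last row/column
  return i;
}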
DimVector dy_vector; - dy->shape().ToDimVector(&dy_vector); + dy->shape_view().ToDimVector(&dy_vector); NdIndexOffsetHelper index_helper(dy_vector.data()); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); ReflectionPad2dGradFunctor()(ctx->stream(), src, dest, index_helper, n_batch, @@ -176,7 +176,7 @@ class ReplicationPad2dKernel final : public OpKernel { const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); const auto& padding = ctx->Attr>("padding"); - const int64_t ndims = x->shape().NumAxes(); + const int64_t ndims = x->shape_view().NumAxes(); CHECK_EQ(padding.size(), ndims); const int64_t n_idx = 0; const int64_t c_idx = 1; @@ -186,17 +186,17 @@ class ReplicationPad2dKernel final : public OpKernel { const int64_t pad_left = padding[0]; const int64_t pad_top = padding[2]; - const int64_t n_batch = y->shape().At(n_idx); - const int64_t n_channel = y->shape().At(c_idx); - const int64_t y_height = y->shape().At(h_idx); - const int64_t y_width = y->shape().At(w_idx); - const int64_t x_height = x->shape().At(h_idx); - const int64_t x_width = x->shape().At(w_idx); + const int64_t n_batch = y->shape_view().At(n_idx); + const int64_t n_channel = y->shape_view().At(c_idx); + const int64_t y_height = y->shape_view().At(h_idx); + const int64_t y_width = y->shape_view().At(w_idx); + const int64_t x_height = x->shape_view().At(h_idx); + const int64_t x_width = x->shape_view().At(w_idx); IN_T* dest = y->mut_dptr(); const IN_T* src = x->dptr(); DimVector y_vector; - y->shape().ToDimVector(&y_vector); + y->shape_view().ToDimVector(&y_vector); NdIndexOffsetHelper index_helper(y_vector.data()); ReplicationPad2dFunctor()(ctx->stream(), src, dest, index_helper, n_batch, @@ -217,7 +217,7 @@ class ReplicationPad2dGradKernel final : public OpKernel { const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); const auto& padding = ctx->Attr>("padding"); - const int64_t ndims = dy->shape().NumAxes(); + const int64_t ndims = dy->shape_view().NumAxes(); CHECK_EQ(padding.size(), ndims); const int64_t n_idx = 0; @@ -227,20 +227,20 @@ class ReplicationPad2dGradKernel final : public OpKernel { int64_t pad_left = padding[0]; int64_t pad_top = padding[2]; - int64_t n_batch = dy->shape().At(n_idx); - int64_t n_channel = dy->shape().At(c_idx); - int64_t dy_height = dy->shape().At(h_idx); - int64_t dy_width = dy->shape().At(w_idx); - int64_t dx_height = dx->shape().At(h_idx); - int64_t dx_width = dx->shape().At(w_idx); + int64_t n_batch = dy->shape_view().At(n_idx); + int64_t n_channel = dy->shape_view().At(c_idx); + int64_t dy_height = dy->shape_view().At(h_idx); + int64_t dy_width = dy->shape_view().At(w_idx); + int64_t dx_height = dx->shape_view().At(h_idx); + int64_t dx_width = dx->shape_view().At(w_idx); const IN_T* src = dy->dptr(); IN_T* dest = dx->mut_dptr(); DimVector dy_vector; - dy->shape().ToDimVector(&dy_vector); + dy->shape_view().ToDimVector(&dy_vector); NdIndexOffsetHelper index_helper(dy_vector.data()); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); + size_t out_bytes_size = dx->shape_view().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); ReplicationPad2dGradFunctor()(ctx->stream(), src, dest, index_helper, diff --git 
a/oneflow/user/kernels/pad_kernel.cpp b/oneflow/user/kernels/pad_kernel.cpp index ebd57d0c48f..7f4a1c793eb 100644 --- a/oneflow/user/kernels/pad_kernel.cpp +++ b/oneflow/user/kernels/pad_kernel.cpp @@ -48,7 +48,7 @@ class PadKernel final : public OpKernel, public CudaGraphSupport { void Compute(KernelComputeContext* ctx) const override { const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - if (y->shape().NumAxes() > 0 && y->shape().elem_cnt() == 0) { + if (y->shape_view().NumAxes() > 0 && y->shape_view().elem_cnt() == 0) { // if output is 0-shape tensor, than do nothing and return return; } @@ -62,14 +62,14 @@ class PadKernel final : public OpKernel, public CudaGraphSupport { const auto& padding_before = ctx->Attr>("padding_before"); const auto& padding_after = ctx->Attr>("padding_after"); - const int64_t ndims = x->shape().NumAxes(); + const int64_t ndims = x->shape_view().NumAxes(); CHECK_EQ(padding_before.size(), ndims); std::unique_ptr pad_primitive = NewConstantPadPrimitive(ctx); CHECK(pad_primitive); - pad_primitive->Launch(ctx->stream(), ndims, x->shape().ptr(), x->dptr(), padding_before.data(), - padding_after.data(), value, y->mut_dptr()); + pad_primitive->Launch(ctx->stream(), ndims, x->shape_view().ptr(), x->dptr(), + padding_before.data(), padding_after.data(), value, y->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/partial_fc_sample_kernel.cu b/oneflow/user/kernels/partial_fc_sample_kernel.cu index b1ed2b4ac5e..2a7b898c636 100644 --- a/oneflow/user/kernels/partial_fc_sample_kernel.cu +++ b/oneflow/user/kernels/partial_fc_sample_kernel.cu @@ -322,8 +322,8 @@ class DistributedPartialFcSampleGpuKernel final : public user_op::OpKernel { user_op::Tensor* sampled_weight = ctx->Tensor4ArgNameAndIndex("sampled_weight", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t batch_size = label->shape().At(0); - const int64_t num_classes = weight->shape().At(0); + const int64_t batch_size = label->shape_view().At(0); + const int64_t num_classes = weight->shape_view().At(0); const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_classes, batch_size, parallel_num); @@ -355,7 +355,7 @@ class DistributedPartialFcSampleGpuKernel final : public user_op::OpKernel { GatherKernelUtilImpl::Forward( ctx->stream(), buffer_manager.CubSortValuesOutPtr(), num_sample, weight->dptr(), - Shape({1, num_classes, weight->shape().Count(1)}), sampled_weight->mut_dptr(), 0); + Shape({1, num_classes, weight->shape_view().Count(1)}), sampled_weight->mut_dptr(), 0); MapLabel(ctx->stream(), num_classes, batch_size, lower_bound, parallel_num, num_sample, buffer_manager.GetCubTmpStorageSize(), label->dptr(), @@ -406,11 +406,11 @@ class DistributedPartialFcSampleDisableBoxingGpuKernel final : public user_op::O ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_label", 0); Memcpy(ctx->stream(), boxing_disabled_sampled_weight_diff->mut_dptr(), sampled_weight_diff->dptr(), - sampled_weight_diff->shape().elem_cnt() + sampled_weight_diff->shape_view().elem_cnt() * GetSizeOfDataType(sampled_weight_diff->data_type())); Memcpy( ctx->stream(), boxing_disabled_sampled_label->mut_dptr(), sampled_label->dptr(), - sampled_label->shape().elem_cnt() * GetSizeOfDataType(sampled_label->data_type())); + sampled_label->shape_view().elem_cnt() * GetSizeOfDataType(sampled_label->data_type())); } 
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/prelu_kernel.cpp b/oneflow/user/kernels/prelu_kernel.cpp index 538cafabcca..b1f5678ce22 100644 --- a/oneflow/user/kernels/prelu_kernel.cpp +++ b/oneflow/user/kernels/prelu_kernel.cpp @@ -32,10 +32,10 @@ class CpuPReluKernel final : public user_op::OpKernel { const T* x_ptr = x->dptr(); const T* alpha_ptr = alpha->dptr(); T* y_ptr = y->mut_dptr(); - const int32_t elem_cnt = x->shape().elem_cnt(); - const int32_t alpha_size = alpha->shape().elem_cnt(); - const int batch = x->shape().At(0); - const int channels = (x->shape().NumAxes() == 1) ? 1 : x->shape().At(1); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int batch = x->shape_view().At(0); + const int channels = (x->shape_view().NumAxes() == 1) ? 1 : x->shape_view().At(1); const int32_t inner_size = elem_cnt / batch / channels; FOR_RANGE(int32_t, i, 0, elem_cnt) { y_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : x_ptr[i] * alpha_ptr[(i / inner_size) % alpha_size]; @@ -71,14 +71,14 @@ class CpuPReluGradKernel final : public user_op::OpKernel { T* dx_ptr = dx->mut_dptr(); T* alpha_diff_ptr = alpha_diff->mut_dptr(); - const int32_t elem_cnt = x->shape().elem_cnt(); - const int32_t alpha_size = alpha->shape().elem_cnt(); - const int batch = x->shape().At(0); - const int channels = (x->shape().NumAxes() == 1) ? 1 : x->shape().At(1); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int batch = x->shape_view().At(0); + const int channels = (x->shape_view().NumAxes() == 1) ? 1 : x->shape_view().At(1); const int32_t inner_size = elem_cnt / batch / channels; Memset(ctx->stream(), alpha_diff->mut_dptr(), 0, - alpha_diff->shape().elem_cnt() * sizeof(T)); + alpha_diff->shape_view().elem_cnt() * sizeof(T)); for (int i = 0; i < elem_cnt; i++) { const T x_i = x_ptr[i]; diff --git a/oneflow/user/kernels/prelu_kernel.cu b/oneflow/user/kernels/prelu_kernel.cu index 7e71bdb173b..48dc3c150f8 100644 --- a/oneflow/user/kernels/prelu_kernel.cu +++ b/oneflow/user/kernels/prelu_kernel.cu @@ -409,10 +409,10 @@ class GpuPReluKernel final : public user_op::OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); - const int32_t batch = x->shape().At(0); - const int32_t channels = (x->shape().NumAxes() == 1) ? 1 : x->shape().At(1); - const int32_t alpha_size = alpha->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + const int32_t batch = x->shape_view().At(0); + const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); const int32_t inner_size = elem_cnt / batch / channels; if (alpha_size == 1) { @@ -454,16 +454,16 @@ class GpuPReluGradKernel final : public user_op::OpKernel { user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const bool alpha_requires_grad = ctx->Attr("alpha_requires_grad"); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + GetCudaAlignedSize(elem_cnt * sizeof(T))); - const Shape& left_extended_shape = CreatePreluLeftExtendedShape(ShapeView(x->shape())); + const Shape& left_extended_shape = CreatePreluLeftExtendedShape(ShapeView(x->shape_view())); - const int32_t batch = x->shape().At(0); - const int32_t channels = (x->shape().NumAxes() == 1) ? 1 : x->shape().At(1); - const int32_t alpha_size = alpha->shape().elem_cnt(); + const int32_t batch = x->shape_view().At(0); + const int32_t channels = (x->shape_view().NumAxes() == 1) ? 1 : x->shape_view().At(1); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); const int32_t inner_size = elem_cnt / batch / channels; if (alpha_size == 1) { DispatchPreluBackwardSingleAlphaIndex(ctx->stream(), elem_cnt, x->dptr(), @@ -477,8 +477,8 @@ class GpuPReluGradKernel final : public user_op::OpKernel { if (alpha_requires_grad) { NdarrayUtil::ReduceSum( ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape(), reduce_sum_tmp_buf)); + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/quantization_kernel.cpp b/oneflow/user/kernels/quantization_kernel.cpp index 7c7e75e3fa4..29bba3220e9 100644 --- a/oneflow/user/kernels/quantization_kernel.cpp +++ b/oneflow/user/kernels/quantization_kernel.cpp @@ -88,10 +88,10 @@ class CpuQuantizationKernel final : public user_op::OpKernel { if (quantization_formula == "google") { int64_t outer_num = 1; - int64_t inner_num = in->shape().elem_cnt(); - if (scale->shape().elem_cnt() > 1) { // per-channel quantization - outer_num = in->shape().At(0); - inner_num = in->shape().Count(1); + int64_t inner_num = in->shape_view().elem_cnt(); + if (scale->shape_view().elem_cnt() > 1) { // per-channel quantization + outer_num = in->shape_view().At(0); + inner_num = in->shape_view().Count(1); } if (quantization_scheme == "symmetric") { @@ -110,8 +110,8 @@ class CpuQuantizationKernel final : public user_op::OpKernel { } } } else if (quantization_formula == "cambricon") { - QuantizationPerLayerCambricon(in_ptr, scale_ptr[0], quantization_bit, in->shape().elem_cnt(), - out_ptr); + QuantizationPerLayerCambricon(in_ptr, scale_ptr[0], quantization_bit, + in->shape_view().elem_cnt(), out_ptr); } else { UNIMPLEMENTED(); } diff --git a/oneflow/user/kernels/quantization_kernel.cu b/oneflow/user/kernels/quantization_kernel.cu index 45cadef864c..2b0cfa1826b 100644 --- a/oneflow/user/kernels/quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_kernel.cu @@ -114,9 +114,9 @@ class GpuQuantizationKernel final : public user_op::OpKernel { const int32_t quantization_bit = ctx->Attr("quantization_bit"); const 
std::string quantization_formula = ctx->Attr("quantization_formula"); - const int64_t elements = in->shape().elem_cnt(); - const int64_t panel_size = in->shape().Count(1); - const int64_t scale_size = scale->shape().elem_cnt(); + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t panel_size = in->shape_view().Count(1); + const int64_t scale_size = scale->shape_view().elem_cnt(); // round to even auto origin_round_mode = std::fegetround(); diff --git a/oneflow/user/kernels/radix_sort_top_k_kernel.cu b/oneflow/user/kernels/radix_sort_top_k_kernel.cu index 29e69749ca3..6c43a0cd704 100644 --- a/oneflow/user/kernels/radix_sort_top_k_kernel.cu +++ b/oneflow/user/kernels/radix_sort_top_k_kernel.cu @@ -83,14 +83,14 @@ class GpuRadixSortTopKKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape().elem_cnt() == 0) { return; } + if (in->shape_view().elem_cnt() == 0) { return; } user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(static_cast(tmp_buffer->shape().elem_cnt()), - tmp_buffer->mut_dptr(), in->shape()); + TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr(), in->shape_view()); - const int64_t elem_cnt = in->shape().elem_cnt(); - const int64_t instance_size = in->shape().At(in->shape().NumAxes() - 1); + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); const int64_t instance_num = elem_cnt / instance_size; const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); InitializeIndices<<Tensor4ArgNameAndIndex("like", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t elem_cnt = like->shape().elem_cnt(); + int64_t elem_cnt = like->shape_view().elem_cnt(); bool* mask = out->mut_dptr(); auto* random_mask_like_state = dynamic_cast(state); CHECK_NOTNULL(random_mask_like_state); diff --git a/oneflow/user/kernels/reduce_kernel.cpp b/oneflow/user/kernels/reduce_kernel.cpp index bcc0d0aa910..fcd3daaae5c 100644 --- a/oneflow/user/kernels/reduce_kernel.cpp +++ b/oneflow/user/kernels/reduce_kernel.cpp @@ -43,20 +43,20 @@ class ReduceKernel final : public user_op::OpKernel, public user_op::CudaGraphSu user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const auto& axis = ctx->Attr>("axis"); - if (input_tensor->shape().elem_cnt() == 0) { - if (output_tensor->shape().elem_cnt() != 0) { + if (input_tensor->shape_view().elem_cnt() == 0) { + if (output_tensor->shape_view().elem_cnt() != 0) { Memset( ctx->stream(), output_tensor->mut_dptr(), 0, - output_tensor->shape().elem_cnt() * GetSizeOfDataType(output_tensor->data_type())); + output_tensor->shape_view().elem_cnt() * GetSizeOfDataType(output_tensor->data_type())); } return; } const Shape& reduced_shape = - CreateReducedShape(input_tensor->shape(), {axis.begin(), axis.end()}); + CreateReducedShape(input_tensor->shape_view(), {axis.begin(), axis.end()}); NdarrayReduce::Reduce( ctx->stream(), XpuVarNdarray(reduced_shape, output_tensor->mut_dptr()), - XpuVarNdarray(input_tensor->shape(), input_tensor->dptr()), - XpuVarNdarray(tmp_buffer->shape(), tmp_buffer->mut_dptr())); + XpuVarNdarray(input_tensor->shape_view(), input_tensor->dptr()), + 
XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -170,7 +170,7 @@ class ReduceSumHalfKernel final : public user_op::OpKernel, public user_op::Cuda const user_op::Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input_tensor", 0); user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("output_tensor", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = input_tensor->shape(); + const ShapeView& in_shape = input_tensor->shape_view(); bool is_axis_contiguous = false; int64_t outer_size = 0, inner_size = 0, reduce_size = 0; GetReduceSumLayout(axis, in_shape, &is_axis_contiguous, &outer_size, &inner_size, &reduce_size); @@ -211,7 +211,7 @@ class ReduceSumHalfKernel final : public user_op::OpKernel, public user_op::Cuda const size_t reduce_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape().elem_cnt()); + tmp_buffer->shape_view().elem_cnt()); auto h2f = ep::primitive::NewPrimitive( ctx->device_type(), DataType::kFloat16, DataType::kFloat); CHECK(h2f); @@ -226,7 +226,7 @@ class ReduceSumHalfKernel final : public user_op::OpKernel, public user_op::Cuda XpuVarNdarray(in_shape, reduce_tmp_buffer)); f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape().elem_cnt()); + output_tensor->shape_view().elem_cnt()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -265,12 +265,12 @@ class ReduceSumFloatCudaKernel final : public user_op::OpKernel, public user_op: const user_op::Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input_tensor", 0); user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("output_tensor", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = input_tensor->shape(); - if (input_tensor->shape().elem_cnt() == 0) { - if (output_tensor->shape().elem_cnt() != 0) { + const ShapeView& in_shape = input_tensor->shape_view(); + if (input_tensor->shape_view().elem_cnt() == 0) { + if (output_tensor->shape_view().elem_cnt() != 0) { Memset( ctx->stream(), output_tensor->mut_dptr(), 0, - output_tensor->shape().elem_cnt() * GetSizeOfDataType(output_tensor->data_type())); + output_tensor->shape_view().elem_cnt() * GetSizeOfDataType(output_tensor->data_type())); } return; } @@ -306,8 +306,8 @@ class ReduceSumFloatCudaKernel final : public user_op::OpKernel, public user_op: const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); NdarrayReduce::Reduce( ctx->stream(), XpuVarNdarray(reduced_shape, output_tensor->mut_dptr()), - XpuVarNdarray(input_tensor->shape(), input_tensor->dptr()), - XpuVarNdarray(tmp_buffer->shape(), tmp_buffer->mut_dptr())); + XpuVarNdarray(input_tensor->shape_view(), input_tensor->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/reduce_like_kernels.cpp b/oneflow/user/kernels/reduce_like_kernels.cpp index 451a5311c61..62ca53cbd86 100644 --- a/oneflow/user/kernels/reduce_like_kernels.cpp +++ b/oneflow/user/kernels/reduce_like_kernels.cpp @@ -44,28 +44,29 @@ class ReduceSumLikeOpKernel final : public user_op::OpKernel, public user_op::Cu user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); 
user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); const auto& axis = ctx->Attr>("axis"); - if (tensor_x->shape().elem_cnt() == 0) { - if (tensor_y->shape().elem_cnt() != 0) { + if (tensor_x->shape_view().elem_cnt() == 0) { + if (tensor_y->shape_view().elem_cnt() != 0) { Memset( ctx->stream(), tensor_y->mut_dptr(), 0, - tensor_y->shape().elem_cnt() * GetSizeOfDataType(tensor_y->data_type())); + tensor_y->shape_view().elem_cnt() * GetSizeOfDataType(tensor_y->data_type())); } return; } if (axis.empty()) { - CHECK_EQ(tensor_x->shape(), tensor_y->shape()); - Memcpy(ctx->stream(), tensor_y->mut_dptr(), tensor_x->dptr(), - tensor_x->shape().elem_cnt() * GetSizeOfDataType(tensor_x->data_type())); + CHECK_EQ(tensor_x->shape_view(), tensor_y->shape_view()); + Memcpy( + ctx->stream(), tensor_y->mut_dptr(), tensor_x->dptr(), + tensor_x->shape_view().elem_cnt() * GetSizeOfDataType(tensor_x->data_type())); } else { user_op::Tensor* tensor_tmp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); T* temp_storage = static_cast(tensor_tmp->mut_dptr()); NdarrayUtil::ReduceSum( ctx->stream(), - XpuVarNdarray(CreateReducedShape(tensor_x->shape(), {axis.begin(), axis.end()}), + XpuVarNdarray(CreateReducedShape(tensor_x->shape_view(), {axis.begin(), axis.end()}), tensor_y->mut_dptr()), - XpuVarNdarray(tensor_x->shape(), tensor_x->dptr(), - tensor_x->shape().NumAxes()), - XpuVarNdarray(tensor_x->shape(), temp_storage, tensor_x->shape().NumAxes())); + XpuVarNdarray(tensor_x->shape_view(), tensor_x->dptr(), + tensor_x->shape_view().NumAxes()), + XpuVarNdarray(tensor_x->shape_view(), temp_storage, tensor_x->shape_view().NumAxes())); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -113,13 +114,13 @@ class ReduceSumLikeHalfKernel final : public user_op::OpKernel, public user_op:: const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); if (axis.empty()) { - CHECK_EQ(tensor_x->shape(), tensor_y->shape()); + CHECK_EQ(tensor_x->shape_view(), tensor_y->shape_view()); Memcpy( ctx->stream(), tensor_y->mut_dptr(), tensor_x->dptr(), - tensor_x->shape().elem_cnt() * GetSizeOfDataType(tensor_x->data_type())); + tensor_x->shape_view().elem_cnt() * GetSizeOfDataType(tensor_x->data_type())); } else { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = tensor_x->shape(); + const ShapeView& in_shape = tensor_x->shape_view(); bool is_axis_contiguous = false; int64_t outer_size = 0, inner_size = 0, reduce_size = 0; GetReduceSumLayout(axis, in_shape, &is_axis_contiguous, &outer_size, &inner_size, @@ -152,7 +153,7 @@ class ReduceSumLikeHalfKernel final : public user_op::OpKernel, public user_op:: const size_t reduce_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape().elem_cnt()); + tmp_buffer->shape_view().elem_cnt()); auto h2f = ep::primitive::NewPrimitive( ctx->device_type(), DataType::kFloat16, DataType::kFloat); CHECK(h2f); @@ -167,7 +168,7 @@ class ReduceSumLikeHalfKernel final : public user_op::OpKernel, public user_op:: XpuVarNdarray(in_shape, reduce_tmp_buffer)); f2h->Launch(ctx->stream(), out_tmp_buffer, tensor_y->mut_dptr(), - tensor_y->shape().elem_cnt()); + tensor_y->shape_view().elem_cnt()); } } } diff --git a/oneflow/user/kernels/relu_bfloat16_kernel.cu b/oneflow/user/kernels/relu_bfloat16_kernel.cu index 
b0550d5ccc6..5e63697efed 100644 --- a/oneflow/user/kernels/relu_bfloat16_kernel.cu +++ b/oneflow/user/kernels/relu_bfloat16_kernel.cu @@ -46,7 +46,7 @@ class ReluGradNvBFloat16Kernel final : public OpKernel { const Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t n = y->shape().elem_cnt(); + const int64_t n = y->shape_view().elem_cnt(); ReluBackwardGpu<<stream()->As()->cuda_stream()>>>( n, reinterpret_cast(y->dptr()), diff --git a/oneflow/user/kernels/repeat_interleave_kernel.cpp b/oneflow/user/kernels/repeat_interleave_kernel.cpp index ad048e95022..cdcc7b47599 100644 --- a/oneflow/user/kernels/repeat_interleave_kernel.cpp +++ b/oneflow/user/kernels/repeat_interleave_kernel.cpp @@ -34,7 +34,7 @@ class CpuRepeatInterLeaveKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); const T* cumsum_ptr = cumsum->dptr(); T* out_ptr = out->mut_dptr(); - for (T i = 0; i < in->shape().At(0); i++) { + for (T i = 0; i < in->shape_view().At(0); i++) { T end = cumsum_ptr[i]; T size = in_ptr[i]; T start = end - size; diff --git a/oneflow/user/kernels/repeat_interleave_kernel.cu b/oneflow/user/kernels/repeat_interleave_kernel.cu index 5ec32f35df8..9a547be5014 100644 --- a/oneflow/user/kernels/repeat_interleave_kernel.cu +++ b/oneflow/user/kernels/repeat_interleave_kernel.cu @@ -52,9 +52,9 @@ class GpuRepeatInterLeaveKernel final : public user_op::OpKernel { const T* cumsum_ptr = cumsum->dptr(); T* out_ptr = out->mut_dptr(); - repeat_interleave<<shape().At(0)), kCudaThreadsNumPerBlock, 0, + repeat_interleave<<shape_view().At(0)), kCudaThreadsNumPerBlock, 0, ctx->stream()->As()->cuda_stream()>>>( - in_ptr, cumsum_ptr, out_ptr, in->shape().At(0)); + in_ptr, cumsum_ptr, out_ptr, in->shape_view().At(0)); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/repeat_kernel.cpp b/oneflow/user/kernels/repeat_kernel.cpp index 6c1b8f88bd8..4ea643a787d 100644 --- a/oneflow/user/kernels/repeat_kernel.cpp +++ b/oneflow/user/kernels/repeat_kernel.cpp @@ -30,10 +30,10 @@ class RepeatKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape().elem_cnt(), out->shape().elem_cnt()); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()); CHECK_EQ(in->data_type(), out->data_type()); Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in->shape().elem_cnt() * GetSizeOfDataType(in->data_type())); + in->shape_view().elem_cnt() * GetSizeOfDataType(in->data_type())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/roc_auc_score_kernel.cpp b/oneflow/user/kernels/roc_auc_score_kernel.cpp index a536dfcf38a..cbe22133ab2 100644 --- a/oneflow/user/kernels/roc_auc_score_kernel.cpp +++ b/oneflow/user/kernels/roc_auc_score_kernel.cpp @@ -85,9 +85,9 @@ class RocAucScoreKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); P* out_ptr = out->mut_dptr
(); - CHECK_EQ(label->shape().elem_cnt(), pred->shape().elem_cnt()); - CHECK_EQ(out->shape().elem_cnt(), 1); - out_ptr[0] = RocAucScore(label->shape().elem_cnt(), label->dptr(), pred->dptr(), + CHECK_EQ(label->shape_view().elem_cnt(), pred->shape_view().elem_cnt()); + CHECK_EQ(out->shape_view().elem_cnt(), 1); + out_ptr[0] = RocAucScore(label->shape_view().elem_cnt(), label->dptr(), pred->dptr
(), tmp_buffer->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/roi_align_kernel.cu b/oneflow/user/kernels/roi_align_kernel.cu index 45b5ea6f031..f1c4c6ed5d9 100644 --- a/oneflow/user/kernels/roi_align_kernel.cu +++ b/oneflow/user/kernels/roi_align_kernel.cu @@ -239,7 +239,7 @@ class RoIAlignKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x_blob = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); - if (rois_blob->shape().elem_cnt() == 0) { return; } + if (rois_blob->shape_view().elem_cnt() == 0) { return; } user_op::Tensor* y_blob = ctx->Tensor4ArgNameAndIndex("y", 0); const int32_t pooled_h = ctx->Attr("pooled_h"); const int32_t pooled_w = ctx->Attr("pooled_w"); @@ -247,12 +247,12 @@ class RoIAlignKernel final : public user_op::OpKernel { const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); const bool aligned = ctx->Attr("aligned"); - const int64_t elem_cnt = y_blob->shape().elem_cnt(); + const int64_t elem_cnt = y_blob->shape_view().elem_cnt(); RoiAlignForward<<stream()->As()->cuda_stream()>>>( elem_cnt, x_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, - x_blob->shape().At(1), x_blob->shape().At(2), x_blob->shape().At(3), pooled_h, pooled_w, - aligned, y_blob->mut_dptr()); + x_blob->shape_view().At(1), x_blob->shape_view().At(2), x_blob->shape_view().At(3), + pooled_h, pooled_w, aligned, y_blob->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -269,7 +269,7 @@ class RoIAlignGradKernel final : public user_op::OpKernel { user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); if (dx_blob == nullptr) { return; } Memset(ctx->stream(), dx_blob->mut_dptr(), 0, - dx_blob->shape().elem_cnt() * sizeof(T)); + dx_blob->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); const int32_t pooled_h = ctx->Attr("pooled_h"); @@ -278,13 +278,13 @@ class RoIAlignGradKernel final : public user_op::OpKernel { const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); const bool aligned = ctx->Attr("aligned"); - const int64_t elem_cnt = dy_blob->shape().elem_cnt(); + const int64_t elem_cnt = dy_blob->shape_view().elem_cnt(); if (elem_cnt > 0) { RoiAlignBackward<<stream()->As()->cuda_stream()>>>( elem_cnt, dy_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, - dx_blob->shape().At(1), dx_blob->shape().At(2), dx_blob->shape().At(3), pooled_h, - pooled_w, aligned, dx_blob->mut_dptr()); + dx_blob->shape_view().At(1), dx_blob->shape_view().At(2), dx_blob->shape_view().At(3), + pooled_h, pooled_w, aligned, dx_blob->mut_dptr()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/roll_kernel.cpp b/oneflow/user/kernels/roll_kernel.cpp index b50cd0e5d9a..6476d62508d 100644 --- a/oneflow/user/kernels/roll_kernel.cpp +++ b/oneflow/user/kernels/roll_kernel.cpp @@ -36,11 +36,11 @@ class CpuRollKernel final : public user_op::OpKernel { SHAPE new_shape{}; SHIFTS new_shifts{}; int32_t num_axes = 0; - computeParams(in->shape(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); + computeParams(in->shape_view(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - 
const int32_t size = out->shape().elem_cnt(); + const int32_t size = out->shape_view().elem_cnt(); STRIDE stride{}; initStride(stride, new_shape, num_axes); diff --git a/oneflow/user/kernels/roll_kernel.cu b/oneflow/user/kernels/roll_kernel.cu index 5a2e35c7506..7a34cd32bf0 100644 --- a/oneflow/user/kernels/roll_kernel.cu +++ b/oneflow/user/kernels/roll_kernel.cu @@ -166,7 +166,7 @@ class GpuRollKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int64_t elem_count = out->shape().elem_cnt(); + const int64_t elem_count = out->shape_view().elem_cnt(); if (dims[0] == -1) { // NOTE(Liang Depeng): Borrow the implementation of pytorch and simplify to 1d array case. @@ -179,7 +179,7 @@ class GpuRollKernel final : public user_op::OpKernel { SHAPE new_shape{}; SHIFTS new_shifts{}; int32_t num_axes = 0; - computeParams(in->shape(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); + computeParams(in->shape_view(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); STRIDE stride{}; initStride(stride, new_shape, num_axes); diff --git a/oneflow/user/kernels/roll_kernel_utils.h b/oneflow/user/kernels/roll_kernel_utils.h index 23ca979c6f4..d57db3d7407 100644 --- a/oneflow/user/kernels/roll_kernel_utils.h +++ b/oneflow/user/kernels/roll_kernel_utils.h @@ -89,7 +89,7 @@ static void initStride(STRIDE& stride, const SHAPE& dim_vec, const int32_t dims) } static void transformShifts(int32_t* shifts, int32_t* shape, int n) { - for (int i = 0; i < n; ++i) { shifts[i] = shifts[i] % shape[i]; } + for (int i = 0; i < n; ++i) { shifts[i] = shifts[i] % shape[i]; } // NOLINT } static void computeParams(const ShapeView& in_shape, const std::vector& shifts, diff --git a/oneflow/user/kernels/same_padding_kernel.cpp b/oneflow/user/kernels/same_padding_kernel.cpp index 1f10c36ccc6..e62ce2bb27c 100644 --- a/oneflow/user/kernels/same_padding_kernel.cpp +++ b/oneflow/user/kernels/same_padding_kernel.cpp @@ -59,7 +59,7 @@ class SamePaddingKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t num_axes = x->shape().NumAxes(); + const int64_t num_axes = x->shape_view().NumAxes(); const std::string& padding = ctx->Attr("padding"); const std::string& data_format = ctx->Attr("data_format"); const std::vector kernel_size = ctx->Attr>("kernel_size"); @@ -67,13 +67,13 @@ class SamePaddingKernel final : public user_op::OpKernel { const std::vector dilation_rate = ctx->Attr>("dilation_rate"); std::vector padding_before(num_axes, 0); const size_t idx_offset = IdxOffset(data_format); - const int32_t num_spatial_dims = x->shape().NumAxes() - 2; + const int32_t num_spatial_dims = x->shape_view().NumAxes() - 2; for (int32_t i = 0; i < num_spatial_dims; ++i) { int32_t padding_small = 0; int32_t padding_large = 0; - CHECK_JUST(CalcSamePadding(x->shape().At(idx_offset + i), kernel_size.at(i), - dilation_rate.at(i), strides.at(i), &padding_small, - &padding_large)); + CHECK_JUST(CalcSamePadding(x->shape_view().At(idx_offset + i), kernel_size.at(i), // NOLINT + dilation_rate.at(i), strides.at(i), &padding_small, // NOLINT + &padding_large)); // NOLINT if (padding == "same_lower") { padding_before[idx_offset + i] = padding_large; } else if (padding == "same_upper") { @@ -81,20 +81,20 @@ class SamePaddingKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED(); } - 
CHECK_EQ(y->shape().At(idx_offset + i), - x->shape().At(idx_offset + i) + padding_small + padding_large); + CHECK_EQ(y->shape_view().At(idx_offset + i), + x->shape_view().At(idx_offset + i) + padding_small + padding_large); } CHECK_EQ(padding_before.size(), num_axes); std::unique_ptr fill_primitive = NewFillPrimitive(ctx); CHECK(fill_primitive); - fill_primitive->Launch(ctx->stream(), y->mut_dptr(), Scalar(0), y->shape().elem_cnt()); + fill_primitive->Launch(ctx->stream(), y->mut_dptr(), Scalar(0), y->shape_view().elem_cnt()); DimVector src_pos_vec(num_axes, 0); DimVector dst_pos_vec(padding_before.cbegin(), padding_before.cend()); std::unique_ptr copy_nd_primitive = NewCopyNdPrimitive(ctx); CHECK(copy_nd_primitive); copy_nd_primitive->Launch(ctx->stream(), x->data_type(), num_axes, y->mut_dptr(), - y->shape().ptr(), dst_pos_vec.data(), x->dptr(), x->shape().ptr(), - src_pos_vec.data(), x->shape().ptr()); + y->shape_view().ptr(), dst_pos_vec.data(), x->dptr(), + x->shape_view().ptr(), src_pos_vec.data(), x->shape_view().ptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -112,7 +112,7 @@ class SamePaddingGradKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_axes = dy->shape().NumAxes(); + const int64_t num_axes = dy->shape_view().NumAxes(); const std::string& padding = ctx->Attr("padding"); const std::string& data_format = ctx->Attr("data_format"); const std::vector kernel_size = ctx->Attr>("kernel_size"); @@ -120,13 +120,13 @@ class SamePaddingGradKernel final : public user_op::OpKernel { const std::vector dilation_rate = ctx->Attr>("dilation_rate"); std::vector padding_before(num_axes, 0); const size_t idx_offset = IdxOffset(data_format); - const int32_t num_spatial_dims = dy->shape().NumAxes() - 2; + const int32_t num_spatial_dims = dy->shape_view().NumAxes() - 2; for (int32_t i = 0; i < num_spatial_dims; ++i) { int32_t padding_small = 0; int32_t padding_large = 0; - CHECK_JUST(CalcSamePadding(dx->shape().At(idx_offset + i), kernel_size.at(i), - dilation_rate.at(i), strides.at(i), &padding_small, - &padding_large)); + CHECK_JUST(CalcSamePadding(dx->shape_view().At(idx_offset + i), kernel_size.at(i), // NOLINT + dilation_rate.at(i), strides.at(i), &padding_small, // NOLINT + &padding_large)); // NOLINT if (padding == "same_lower") { padding_before[idx_offset + i] = padding_large; } else if (padding == "same_upper") { @@ -134,16 +134,16 @@ class SamePaddingGradKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED(); } - CHECK_EQ(dy->shape().At(idx_offset + i), - dx->shape().At(idx_offset + i) + padding_small + padding_large); + CHECK_EQ(dy->shape_view().At(idx_offset + i), + dx->shape_view().At(idx_offset + i) + padding_small + padding_large); } DimVector dst_pos_vec(num_axes, 0); DimVector src_pos_vec(padding_before.cbegin(), padding_before.cend()); std::unique_ptr primitive = NewCopyNdPrimitive(ctx); CHECK(primitive); - primitive->Launch(ctx->stream(), dy->data_type(), num_axes, dx->mut_dptr(), dx->shape().ptr(), - dst_pos_vec.data(), dy->dptr(), dy->shape().ptr(), src_pos_vec.data(), - dx->shape().ptr()); + primitive->Launch(ctx->stream(), dy->data_type(), num_axes, dx->mut_dptr(), + dx->shape_view().ptr(), dst_pos_vec.data(), dy->dptr(), + dy->shape_view().ptr(), src_pos_vec.data(), dx->shape_view().ptr()); } bool 
AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/scalar_by_tensor_kernel.cpp b/oneflow/user/kernels/scalar_by_tensor_kernel.cpp index 5e2b864fc8f..ca09d86e20d 100644 --- a/oneflow/user/kernels/scalar_by_tensor_kernel.cpp +++ b/oneflow/user/kernels/scalar_by_tensor_kernel.cpp @@ -50,13 +50,13 @@ class ScalarByTensorKernel final : public user_op::OpKernel, public user_op::Cud const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* scalar = ctx->Tensor4ArgNameAndIndex("scalar", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - int64_t elem_cnt = y->shape().elem_cnt(); + int64_t elem_cnt = y->shape_view().elem_cnt(); if (elem_cnt != 0) { std::unique_ptr primitive = NewBroadcastElementwiseBinaryPrimitive(ctx, op); CHECK(primitive); - primitive->Launch(ctx->stream(), x->shape().NumAxes(), x->shape().ptr(), x->dptr(), - scalar->shape().NumAxes(), scalar->shape().ptr(), scalar->dptr(), + primitive->Launch(ctx->stream(), x->shape_view().NumAxes(), x->shape_view().ptr(), x->dptr(), + scalar->shape_view().NumAxes(), scalar->shape_view().ptr(), scalar->dptr(), y->mut_dptr()); } else { // For 0-size Tensor diff --git a/oneflow/user/kernels/scalar_logical_kernels.cpp b/oneflow/user/kernels/scalar_logical_kernels.cpp index b82352725b4..db64ed7026b 100644 --- a/oneflow/user/kernels/scalar_logical_kernels.cpp +++ b/oneflow/user/kernels/scalar_logical_kernels.cpp @@ -46,7 +46,7 @@ class ScalarLogicalKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); bool* out_ptr = out->mut_dptr(); - int64_t elem_cnt = out->shape().elem_cnt(); + int64_t elem_cnt = out->shape_view().elem_cnt(); if (elem_cnt != 0) { ScalarLogicalFunctor()(ctx->stream(), elem_cnt, scalar_operand, in_ptr, out_ptr); diff --git a/oneflow/user/kernels/scalar_math_kernels.cpp b/oneflow/user/kernels/scalar_math_kernels.cpp index d385bee423e..b2c42b9fff5 100644 --- a/oneflow/user/kernels/scalar_math_kernels.cpp +++ b/oneflow/user/kernels/scalar_math_kernels.cpp @@ -52,7 +52,7 @@ class ScalarMathKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - int64_t elem_cnt = out->shape().elem_cnt(); + int64_t elem_cnt = out->shape_view().elem_cnt(); if (elem_cnt != 0) { ScalarMathFunctor()(ctx->stream(), elem_cnt, scalar_operand, in_ptr, out_ptr); @@ -85,7 +85,7 @@ class ScalarReverseMathKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - int64_t elem_cnt = out->shape().elem_cnt(); + int64_t elem_cnt = out->shape_view().elem_cnt(); if (elem_cnt != 0) { ScalarReverseMathFunctor()(ctx->stream(), elem_cnt, scalar_operand, in_ptr, out_ptr); @@ -169,7 +169,7 @@ class CpuScalarPowGradKernel final : public user_op::OpKernel { UNIMPLEMENTED(); } - const int32_t elem_cnt = x_tensor->shape().elem_cnt(); + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); FOR_RANGE(int32_t, i, 0, elem_cnt) { dx_ptr[i] = scalar_operand * (std::pow(x_ptr[i], scalar_operand - static_cast(1))) * dy_ptr[i]; @@ -210,7 +210,7 @@ class CpuScalarReversePowGradKernel final : public user_op::OpKernel { UNIMPLEMENTED(); } - const int32_t elem_cnt = x_tensor->shape().elem_cnt(); + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); // NOTE: y = a^x ==>> dy/dx = a^x * lna FOR_RANGE(int32_t, i, 0, elem_cnt) { dx_ptr[i] = std::pow(scalar_operand, x_ptr[i]) * std::log(scalar_operand) * dy_ptr[i]; diff --git a/oneflow/user/kernels/scalar_math_kernels.cu 
b/oneflow/user/kernels/scalar_math_kernels.cu index 3d9f605f149..b9cf24cdab5 100644 --- a/oneflow/user/kernels/scalar_math_kernels.cu +++ b/oneflow/user/kernels/scalar_math_kernels.cu @@ -163,7 +163,7 @@ class GpuScalarPowGradKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED(); } - const int32_t elem_cnt = x_tensor->shape().elem_cnt(); + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( ScalarPowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, ctx->stream()->As()->cuda_stream()))); @@ -203,7 +203,7 @@ class GpuScalarReversePowGradKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED(); } - const int32_t elem_cnt = x_tensor->shape().elem_cnt(); + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( ScalarReversePowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, ctx->stream()->As()->cuda_stream()))); diff --git a/oneflow/user/kernels/search_sorted_kernel.cpp b/oneflow/user/kernels/search_sorted_kernel.cpp index 461606b4086..c3de4402ada 100644 --- a/oneflow/user/kernels/search_sorted_kernel.cpp +++ b/oneflow/user/kernels/search_sorted_kernel.cpp @@ -34,11 +34,13 @@ class CpuSearchSortedKernel final : public user_op::OpKernel { const T* values_ptr = values->dptr(); const T* sequence_ptr = sorted_sequence->dptr(); K* out_ptr = out->mut_dptr(); - const int32_t instance_num = values->shape().elem_cnt(); - bool is_values_scalar = (values->shape().elem_cnt() == 1 && values->shape().NumAxes() == 0); - bool is_sequence_1d = (sorted_sequence->shape().NumAxes() == 1); - K values_shape_last = is_values_scalar ? 1 : values->shape().At(values->shape().NumAxes() - 1); - K sequence_shape_last = sorted_sequence->shape().At(sorted_sequence->shape().NumAxes() - 1); + const int32_t instance_num = values->shape_view().elem_cnt(); + bool is_values_scalar = values->shape_view().NumAxes() == 0; + bool is_sequence_1d = (sorted_sequence->shape_view().NumAxes() == 1); + K values_shape_last = + is_values_scalar ? 1 : values->shape_view().At(values->shape_view().NumAxes() - 1); + K sequence_shape_last = + sorted_sequence->shape_view().At(sorted_sequence->shape_view().NumAxes() - 1); FOR_RANGE(int32_t, i, 0, instance_num) { K start_bd = is_sequence_1d ? 0 : i / values_shape_last * sequence_shape_last; K end_bd = start_bd + sequence_shape_last; @@ -81,7 +83,7 @@ class CpuSearchSortedScalarKernel final : public user_op::OpKernel { const T* sequence_ptr = sorted_sequence->dptr(); K* out_ptr = out->mut_dptr(); - K sequence_shape_last = sorted_sequence->shape().At(0); + K sequence_shape_last = sorted_sequence->shape_view().At(0); K pos = !right ? 
cus_lower_bound(0, sequence_shape_last, values, sequence_ptr) : cus_upper_bound(0, sequence_shape_last, values, sequence_ptr); diff --git a/oneflow/user/kernels/search_sorted_kernel.cu b/oneflow/user/kernels/search_sorted_kernel.cu index 23f79e51e7f..6e2e7b66894 100644 --- a/oneflow/user/kernels/search_sorted_kernel.cu +++ b/oneflow/user/kernels/search_sorted_kernel.cu @@ -62,11 +62,13 @@ class GpuSearchSortedKernel final : public user_op::OpKernel { const T* values_ptr = values->dptr(); const T* sequence_ptr = sorted_sequence->dptr(); K* out_ptr = out->mut_dptr(); - const int32_t instance_num = values->shape().elem_cnt(); - bool is_values_scalar = (values->shape().elem_cnt() == 1 && values->shape().NumAxes() == 0); - bool is_sequence_1d = (sorted_sequence->shape().NumAxes() == 1); - K values_shape_last = is_values_scalar ? 1 : values->shape().At(values->shape().NumAxes() - 1); - K sequence_shape_last = sorted_sequence->shape().At(sorted_sequence->shape().NumAxes() - 1); + const int32_t instance_num = values->shape_view().elem_cnt(); + bool is_values_scalar = values->shape_view().NumAxes() == 0; + bool is_sequence_1d = (sorted_sequence->shape_view().NumAxes() == 1); + K values_shape_last = + is_values_scalar ? 1 : values->shape_view().At(values->shape_view().NumAxes() - 1); + K sequence_shape_last = + sorted_sequence->shape_view().At(sorted_sequence->shape_view().NumAxes() - 1); RUN_CUDA_KERNEL((DoSearchSortedLogical), ctx->stream(), instance_num, instance_num, is_sequence_1d, values_shape_last, sequence_shape_last, right, values_ptr, sequence_ptr, out_ptr); @@ -104,7 +106,7 @@ class GpuSearchSortedScalarKernel final : public user_op::OpKernel { const T* sequence_ptr = sorted_sequence->dptr(); K* out_ptr = out->mut_dptr(); - K sequence_shape_last = sorted_sequence->shape().At(0); + K sequence_shape_last = sorted_sequence->shape_view().At(0); RUN_CUDA_KERNEL((DoSearchSortedScalarLogical), ctx->stream(), 1, sequence_shape_last, right, values, sequence_ptr, out_ptr); } diff --git a/oneflow/user/kernels/sigmoid_cross_entropy_kernel.h b/oneflow/user/kernels/sigmoid_cross_entropy_kernel.h index 871da4dacbe..bca66ba8f12 100644 --- a/oneflow/user/kernels/sigmoid_cross_entropy_kernel.h +++ b/oneflow/user/kernels/sigmoid_cross_entropy_kernel.h @@ -66,7 +66,7 @@ class SigmoidCrossEntropyKernel final : public user_op::OpKernel { const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); user_op::Tensor* loss = ctx->Tensor4ArgNameAndIndex("loss", 0); - const auto n = prediction->shape().elem_cnt(); + const auto n = prediction->shape_view().elem_cnt(); ElemwiseSigmoidCrossEntropyFunctor()( ctx->stream(), n, loss->mut_dptr(), prediction->dptr(), label->dptr()); @@ -96,7 +96,7 @@ class SigmoidCrossEntropyGradKernel final : public user_op::OpKernel { const user_op::Tensor* loss_diff = ctx->Tensor4ArgNameAndIndex("loss_diff", 0); const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); user_op::Tensor* prediction_diff = ctx->Tensor4ArgNameAndIndex("prediction_diff", 0); - const int64_t n = prediction->shape().elem_cnt(); + const int64_t n = prediction->shape_view().elem_cnt(); ElemwiseSigmoidCrossEntropyGradFunctor()( ctx->stream(), n, prediction_diff->mut_dptr(), prediction->dptr(), label->dptr(), loss_diff->dptr()); diff --git a/oneflow/user/kernels/slice_kernel.cpp b/oneflow/user/kernels/slice_kernel.cpp index 691ee1b810e..67056ae6d62 100644 --- a/oneflow/user/kernels/slice_kernel.cpp 
+++ b/oneflow/user/kernels/slice_kernel.cpp @@ -165,30 +165,30 @@ SliceParams ConstructSliceParams(user_op::KernelComputeContext* ctx, const user_ const auto& start_vec = ctx->Attr>("start"); const auto& stop_vec = ctx->Attr>("stop"); const auto& step_vec = ctx->Attr>("step"); - const int64_t ndim = entire->shape().NumAxes(); + const int64_t ndim = entire->shape_view().NumAxes(); CHECK_LE(ndim, kSliceMaxDims); - if (entire->shape().NumAxes() == 1) { - CHECK_LE(sliced->shape().NumAxes(), 1); + if (entire->shape_view().NumAxes() == 1) { + CHECK_LE(sliced->shape_view().NumAxes(), 1); } else { - CHECK_EQ(sliced->shape().NumAxes(), ndim); + CHECK_EQ(sliced->shape_view().NumAxes(), ndim); } CHECK_EQ(start_vec.size(), ndim); CHECK_EQ(stop_vec.size(), ndim); CHECK_EQ(step_vec.size(), ndim); SliceParams params; - if (entire->shape().NumAxes() == 1 && sliced->shape().NumAxes() == 0) { + if (entire->shape_view().NumAxes() == 1 && sliced->shape_view().NumAxes() == 0) { params.ndim = ndim; - params.dims[0] = entire->shape().At(0); - params.start[0] = RegulateSliceStart(start_vec.at(0), entire->shape().At(0)); + params.dims[0] = entire->shape_view().At(0); + params.start[0] = RegulateSliceStart(start_vec.at(0), entire->shape_view().At(0)); params.step[0] = step_vec.at(0); params.size[0] = 1; return params; } params.ndim = ndim; FOR_RANGE(int, i, 0, params.ndim) { - const int64_t dim_size = entire->shape().At(i); - const int64_t slice_size = sliced->shape().At(i); + const int64_t dim_size = entire->shape_view().At(i); + const int64_t slice_size = sliced->shape_view().At(i); const int64_t step = step_vec.at(i); CHECK_NE(step, 0); const int64_t start = RegulateSliceStart(start_vec.at(i), dim_size); @@ -208,43 +208,6 @@ SliceParams ConstructSliceParams(user_op::KernelComputeContext* ctx, const user_ } // namespace -template -class SliceKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - SliceKernel() = default; - ~SliceKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - SliceParams params = ConstructSliceParams(ctx, x_tensor, y_tensor); - SliceKernelUtil::Forward(ctx->stream(), params, x_tensor->dptr(), - y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class SliceGradKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - SliceGradKernel() = default; - ~SliceGradKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - size_t dx_byte_size = dx_tensor->shape().elem_cnt() * sizeof(T); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, dx_byte_size); - if (dy_tensor->shape().elem_cnt() == 0) { return; } - SliceParams params = ConstructSliceParams(ctx, dx_tensor, dy_tensor); - SliceKernelUtil::Backward(ctx->stream(), params, dy_tensor->dptr(), - dx_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - template void WriteSlice(user_op::KernelComputeContext* ctx, const user_op::Tensor* src, user_op::Tensor* dst, const SliceContext& slice_ctx, @@ -254,7 +217,7 @@ void WriteSlice(user_op::KernelComputeContext* ctx, const user_op::Tensor* src, // Check 
physical tensor's shape for (const auto& split_info : slice_ctx.GetSplitInfo()) { if (split_info.split_axis != SPLIT_AXIS_FOR_NON_SPLIT) { - CHECK_EQ(large->shape().At(split_info.split_axis), split_info.upper - split_info.lower) + CHECK_EQ(large->shape_view().At(split_info.split_axis), split_info.upper - split_info.lower) << "split_info shape mismatch physical tensor shape"; } } @@ -272,7 +235,7 @@ void WriteSlice(user_op::KernelComputeContext* ctx, const user_op::Tensor* src, for (int i = 0; i < ndim; i++) { if (!slice_ctx.IsAxisPushed(i)) { // axis is not split, logical shape is same as physical shape - logical_dims[i] = large->shape().At(i); + logical_dims[i] = large->shape_view().At(i); } } for (const auto& split_info : slice_ctx.GetSplitInfo()) { @@ -289,9 +252,9 @@ void WriteSlice(user_op::KernelComputeContext* ctx, const user_op::Tensor* src, SliceParams large_slice_param; SliceParams small_slice_param; ConstructSliceParamsLarge(slice_ctx, positive_start_vec, positive_stop_vec, step_attr, - large->shape(), &large_slice_param); + large->shape_view(), &large_slice_param); ConstructSliceParamsSmall(slice_ctx, positive_start_vec, positive_stop_vec, step_attr, - small->shape(), &small_slice_param); + small->shape_view(), &small_slice_param); CHECK_EQ(large_slice_param.elem_cnt(), small_slice_param.elem_cnt()); const int64_t elem_cnt = large_slice_param.elem_cnt(); @@ -330,10 +293,10 @@ DEFINE_STATIC_SWITCH_FUNC( #undef MAKE_WRITE_SLICE_SWITCH_ENTRY template -class LogicalSliceKernel final : public user_op::OpKernel { +class SliceKernel final : public user_op::OpKernel { public: - LogicalSliceKernel() = default; - ~LogicalSliceKernel() = default; + SliceKernel() = default; + ~SliceKernel() = default; std::shared_ptr InitOpKernelCache( user_op::KernelCacheContext* ctx) const override { @@ -362,23 +325,24 @@ class LogicalSliceKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, const user_op::OpKernelCache* cache) const override { user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + if (y_tensor->shape_view().elem_cnt() == 0) { return; } const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); const SliceContext& slice_ctx = dynamic_cast*>(cache)->Get(); AutoMemset(ctx->stream(), y_tensor->mut_dptr(), 0, - y_tensor->shape().elem_cnt() * GetSizeOfDataType(y_tensor->data_type()), + y_tensor->shape_view().elem_cnt() * GetSizeOfDataType(y_tensor->data_type()), y_tensor->mem_case()); - SwitchWriteSlice(SwitchCase(y_tensor->shape().NumAxes(), y_tensor->data_type()), ctx, x_tensor, - y_tensor, slice_ctx, true); + SwitchWriteSlice(SwitchCase(y_tensor->shape_view().NumAxes(), y_tensor->data_type()), ctx, + x_tensor, y_tensor, slice_ctx, true); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; template -class LogicalSliceAssignKernel final : public user_op::OpKernel { +class SliceUpdateKernel final : public user_op::OpKernel { public: - LogicalSliceAssignKernel() = default; - ~LogicalSliceAssignKernel() = default; + SliceUpdateKernel() = default; + ~SliceUpdateKernel() = default; std::shared_ptr InitOpKernelCache( user_op::KernelCacheContext* ctx) const override { @@ -423,92 +387,79 @@ class LogicalSliceAssignKernel final : public user_op::OpKernel { const user_op::Tensor* value_tensor = ctx->Tensor4ArgNameAndIndex("value", 0); user_op::Tensor* ref_tensor = ctx->Tensor4ArgNameAndIndex("ref", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + if 
(y_tensor->shape_view().elem_cnt() == 0) { return; } // When eager executing, y_tensor shared the same memory with ref_tensor if (ref_tensor->dptr() != y_tensor->dptr()) { // lazy run AutoMemcpy(ctx->stream(), y_tensor->mut_dptr(), ref_tensor->dptr(), - y_tensor->shape().elem_cnt() * sizeof(T), ref_tensor->mem_case(), + y_tensor->shape_view().elem_cnt() * sizeof(T), ref_tensor->mem_case(), y_tensor->mem_case()); } const SliceContext& slice_ctx = dynamic_cast*>(cache)->Get(); - SwitchWriteSlice(SwitchCase(value_tensor->shape().NumAxes(), value_tensor->data_type()), ctx, - value_tensor, y_tensor, slice_ctx, false); + SwitchWriteSlice(SwitchCase(value_tensor->shape_view().NumAxes(), value_tensor->data_type()), + ctx, value_tensor, y_tensor, slice_ctx, false); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; +#define REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(dtype) \ + REGISTER_USER_KERNEL("slice_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob(user_op::HobDataType("ref", 0) == GetDataType::value); \ + REGISTER_USER_KERNEL("slice").SetCreateFn>().SetIsMatchedHob( \ + user_op::HobDataType("x", 0) == GetDataType::value); + +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(float) +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(double) +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(int32_t) +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(int64_t) +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(int8_t) +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(uint8_t) +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(bool) +#ifdef WITH_CUDA +REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS(float16) +#endif + template -class SliceUpdateKernel final : public user_op::OpKernel { +class SliceGradKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { public: - SliceUpdateKernel() = default; - ~SliceUpdateKernel() = default; + SliceGradKernel() = default; + ~SliceGradKernel() = default; private: void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* update_tensor = ctx->Tensor4ArgNameAndIndex("update", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - Memcpy(ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - y_tensor->shape().elem_cnt() * sizeof(T)); - SliceParams params = ConstructSliceParams(ctx, y_tensor, update_tensor); - SliceKernelUtil::Backward(ctx->stream(), params, update_tensor->dptr(), - y_tensor->mut_dptr()); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + size_t dx_byte_size = dx_tensor->shape_view().elem_cnt() * sizeof(T); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, dx_byte_size); + if (dy_tensor->shape_view().elem_cnt() == 0) { return; } + SliceParams params = ConstructSliceParams(ctx, dx_tensor, dy_tensor); + SliceKernelUtil::Backward(ctx->stream(), params, dy_tensor->dptr(), + dx_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_SLICE_KERNELS(device, dtype) \ - REGISTER_USER_KERNEL("slice").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("slice_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("slice_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() 
== device) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && (user_op::HobDataType("update", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "x", 0, true)); \ - return Maybe::Ok(); \ - }); - -#define REGISTER_SLICE_KERNELS_WITH_DEVICE(device) \ - REGISTER_SLICE_KERNELS(device, bool) \ - REGISTER_SLICE_KERNELS(device, float) \ - REGISTER_SLICE_KERNELS(device, double) \ - REGISTER_SLICE_KERNELS(device, int32_t) \ - REGISTER_SLICE_KERNELS(device, int64_t) \ - REGISTER_SLICE_KERNELS(device, int8_t) \ - REGISTER_SLICE_KERNELS(device, uint8_t) - -REGISTER_SLICE_KERNELS_WITH_DEVICE(DeviceType::kCPU) -#ifdef WITH_CUDA -REGISTER_SLICE_KERNELS_WITH_DEVICE(DeviceType::kCUDA) -REGISTER_SLICE_KERNELS(DeviceType::kCUDA, float16) -#endif - -#define REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(dtype) \ - REGISTER_USER_KERNEL("logical_slice_assign") \ - .SetCreateFn>() \ - .SetIsMatchedHob(user_op::HobDataType("ref", 0) == GetDataType::value); \ - REGISTER_USER_KERNEL("logical_slice") \ - .SetCreateFn>() \ - .SetIsMatchedHob(user_op::HobDataType("x", 0) == GetDataType::value); - -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(float) -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(double) -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(int32_t) -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(int64_t) -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(int8_t) -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(uint8_t) -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(bool) +#define REGISTER_SLICE_GRAD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("slice_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +#define REGISTER_SLICE_GRAD_KERNEL_WITH_DEVICE(device) \ + REGISTER_SLICE_GRAD_KERNEL(device, bool) \ + REGISTER_SLICE_GRAD_KERNEL(device, float) \ + REGISTER_SLICE_GRAD_KERNEL(device, double) \ + REGISTER_SLICE_GRAD_KERNEL(device, int32_t) \ + REGISTER_SLICE_GRAD_KERNEL(device, int64_t) \ + REGISTER_SLICE_GRAD_KERNEL(device, int8_t) \ + REGISTER_SLICE_GRAD_KERNEL(device, uint8_t) + +REGISTER_SLICE_GRAD_KERNEL_WITH_DEVICE(DeviceType::kCPU) #ifdef WITH_CUDA -REGISTER_LOGICAL_SLICE_ASSIGN_AND_LOGICAL_SLICE_KERNELS(float16) +REGISTER_SLICE_GRAD_KERNEL_WITH_DEVICE(DeviceType::kCUDA) +REGISTER_SLICE_GRAD_KERNEL(DeviceType::kCUDA, float16) #endif } // namespace oneflow diff --git a/oneflow/user/kernels/slice_util.h b/oneflow/user/kernels/slice_util.h index dd4022ccef7..be76c6289d1 100644 --- a/oneflow/user/kernels/slice_util.h +++ b/oneflow/user/kernels/slice_util.h @@ -16,6 +16,7 @@ limitations under the License. 
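// --- Hedged illustration, not part of this patch: a minimal sketch of the per-device,
// per-dtype user-op registration pattern that the REGISTER_SLICE_UPDATE_AND_SLICE_KERNELS
// and REGISTER_SLICE_GRAD_KERNEL macros above follow. The op name "my_relu", the class
// MyReluKernel, and the umbrella include are illustrative assumptions; the
// REGISTER_USER_KERNEL / SetCreateFn / SetIsMatchedHob / HobDeviceType / HobDataType
// usage mirrors what appears in the surrounding diffs.
#include "oneflow/core/framework/framework.h"

namespace oneflow {

template<DeviceType device_type, typename T>
class MyReluKernel final : public user_op::OpKernel {
 public:
  MyReluKernel() = default;
  ~MyReluKernel() = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = in->shape_view().elem_cnt();
    const T* in_ptr = in->dptr<T>();
    T* out_ptr = out->mut_dptr<T>();
    // Plain CPU loop; only CPU registrations are emitted below.
    for (int64_t i = 0; i < elem_cnt; ++i) { out_ptr[i] = in_ptr[i] > T(0) ? in_ptr[i] : T(0); }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

// One registration entry per (device, dtype) pair, matched on device type and output dtype.
#define REGISTER_MY_RELU_KERNEL(device, dtype)                              \
  REGISTER_USER_KERNEL("my_relu")                                           \
      .SetCreateFn<MyReluKernel<device, dtype>>()                           \
      .SetIsMatchedHob((user_op::HobDeviceType() == device)                 \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));

REGISTER_MY_RELU_KERNEL(DeviceType::kCPU, float)
REGISTER_MY_RELU_KERNEL(DeviceType::kCPU, double)

}  // namespace oneflow
// --- end of illustration ---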
#ifndef ONEFLOW_USER_KERNELS_SLICE_UTIL_H_ #define ONEFLOW_USER_KERNELS_SLICE_UTIL_H_ +#include #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/common/util.h" #include "oneflow/core/ep/include/stream.h" @@ -60,6 +61,15 @@ struct SliceParams { if (size[dim] != dims[dim]) { return false; } return true; } + + std::string ToString() { + std::stringstream ss("SliceParams:"); + for (int i = 0; i < ndim; ++i) { + ss << "\n\tdim: " << i << ", start: " << start[i] << ", step: " << step[i] + << ", size: " << size[i]; + } + return ss.str(); + } }; SliceParams FoldContiguousFullSliceDimensions(const SliceParams& params); diff --git a/oneflow/user/kernels/softmax_cross_entropy_kernel.h b/oneflow/user/kernels/softmax_cross_entropy_kernel.h index 00ebb8bdb2e..d1eff26fcbc 100644 --- a/oneflow/user/kernels/softmax_cross_entropy_kernel.h +++ b/oneflow/user/kernels/softmax_cross_entropy_kernel.h @@ -57,9 +57,9 @@ class SoftmaxCrossEntropyKernel final : public user_op::OpKernel { const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const auto num_axes = label->shape().NumAxes(); - const int64_t num_instances = label->shape().Count(0, num_axes - 1); - const int64_t num_classes = label->shape().At(num_axes - 1); + const auto num_axes = label->shape_view().NumAxes(); + const int64_t num_instances = label->shape_view().Count(0, num_axes - 1); + const int64_t num_classes = label->shape_view().At(num_axes - 1); std::unique_ptr primitive = NewSoftmaxPrimitive(ctx); CHECK(primitive); primitive->Launch(ctx->stream(), num_instances, num_classes, prediction->dptr(), @@ -93,12 +93,12 @@ class SoftmaxCrossEntropyGradKernel final : public user_op::OpKernel { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); user_op::Tensor* prediction_diff = ctx->Tensor4ArgNameAndIndex("prediction_diff", 0); - const int64_t num_instances = dy->shape().elem_cnt(); - CHECK_EQ(prob->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prob->shape().elem_cnt() / num_instances; + const int64_t num_instances = dy->shape_view().elem_cnt(); + CHECK_EQ(prob->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prob->shape_view().elem_cnt() / num_instances; CrossEntropyKernelUtil::ComputeDiffWithSoftmax( - ctx->stream(), prediction_diff->shape().elem_cnt(), num_classes, prob->dptr(), + ctx->stream(), prediction_diff->shape_view().elem_cnt(), num_classes, prob->dptr(), label->dptr(), dy->dptr(), prediction_diff->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/softmax_kernel.cpp b/oneflow/user/kernels/softmax_kernel.cpp index 0ab7ad2d7c7..833e0d6a838 100644 --- a/oneflow/user/kernels/softmax_kernel.cpp +++ b/oneflow/user/kernels/softmax_kernel.cpp @@ -60,7 +60,7 @@ class SoftmaxKernel final : public user_op::OpKernel, public user_op::CudaGraphS void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); const int64_t cols = in_shape.At(in_shape.NumAxes() - 1); const int64_t rows = in_shape.Count(0, in_shape.NumAxes() - 1); std::unique_ptr primitive = 
NewSoftmaxPrimitive(ctx); @@ -85,8 +85,8 @@ class SoftmaxGradKernel final : public user_op::OpKernel, public user_op::CudaGr const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_classes = y->shape().At(y->shape().NumAxes() - 1); - const int64_t num_instances = y->shape().elem_cnt() / num_classes; + const int64_t num_classes = y->shape_view().At(y->shape_view().NumAxes() - 1); + const int64_t num_instances = y->shape_view().elem_cnt() / num_classes; std::unique_ptr primitive = NewSoftmaxBackwardPrimitive(ctx); CHECK(primitive); diff --git a/oneflow/user/kernels/sort_kernel.cpp b/oneflow/user/kernels/sort_kernel.cpp index 635a6a29b71..ee4974b4933 100644 --- a/oneflow/user/kernels/sort_kernel.cpp +++ b/oneflow/user/kernels/sort_kernel.cpp @@ -30,9 +30,9 @@ class CpuSortKernel final : public user_op::OpKernel { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in->shape().elem_cnt() * sizeof(T)); - const int32_t instance_size = in->shape().At(in->shape().NumAxes() - 1); - const int32_t instance_num = in->shape().elem_cnt() / instance_size; + in->shape_view().elem_cnt() * sizeof(T)); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; const std::string& direction = ctx->Attr("direction"); const bool is_ascending = direction == "ASCENDING"; const bool is_descending = direction == "DESCENDING"; diff --git a/oneflow/user/kernels/sort_kernel.cu b/oneflow/user/kernels/sort_kernel.cu index 319fac1576d..79f634f20cf 100644 --- a/oneflow/user/kernels/sort_kernel.cu +++ b/oneflow/user/kernels/sort_kernel.cu @@ -34,17 +34,17 @@ class GpuSortKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in->shape().elem_cnt() * sizeof(T)); - const int32_t instance_size = in->shape().At(in->shape().NumAxes() - 1); - const int32_t instance_num = in->shape().elem_cnt() / instance_size; + in->shape_view().elem_cnt() * sizeof(T)); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; const std::string& direction = ctx->Attr("direction"); if (direction == "ASCENDING") { SortKeysAscending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), - tmp_buffer->shape().elem_cnt(), out->mut_dptr(), + tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), ctx->stream()->As()->cuda_stream()); } else if (direction == "DESCENDING") { SortKeysDescending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), - tmp_buffer->shape().elem_cnt(), out->mut_dptr(), + tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), ctx->stream()->As()->cuda_stream()); } else { UNIMPLEMENTED(); diff --git a/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp b/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp index 8f5728a48c7..e97a47e3b26 100644 --- a/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp +++ b/oneflow/user/kernels/sparse_cross_entropy_kernel.cpp @@ -49,9 +49,9 @@ class SparseCrossEntropyKernel final : public user_op::OpKernel { const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); user_op::Tensor* out = 
ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prediction->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape().elem_cnt() / num_instances; + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; const int64_t lower_bound = 0; const int64_t depth = ctx->Attr("depth"); SparseCrossEntropyKernelUtil::ComputeEntropy( @@ -90,9 +90,9 @@ class SparseCrossEntropyMsKernel final : public user_op::OpKernel { const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prediction->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape().elem_cnt() / num_instances; + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; const int64_t depth = ctx->Attr("depth"); int64_t lower_bound = 0; if (cache != nullptr) { @@ -102,7 +102,7 @@ class SparseCrossEntropyMsKernel final : public user_op::OpKernel { lower_bound = kernel_cache->lower(); } Memset(ctx->stream(), out->mut_dptr(), 0, - out->shape().elem_cnt() * GetSizeOfDataType(out->data_type())); + out->shape_view().elem_cnt() * GetSizeOfDataType(out->data_type())); SparseCrossEntropyKernelUtil::ComputeEntropy( ctx->stream(), num_instances, num_classes, depth, lower_bound, prediction->dptr(), label->dptr(), out->mut_dptr()); @@ -150,13 +150,13 @@ class SparseCrossEntropyGradKernel final : public user_op::OpKernel { const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* prediction_diff = ctx->Tensor4ArgNameAndIndex("prediction_diff", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prediction->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape().elem_cnt() / num_instances; + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; const int64_t lower_bound = 0; const int64_t depth = ctx->Attr("depth"); size_t prediction_diff_bytes_size = - prediction_diff->shape().elem_cnt() * GetSizeOfDataType(prediction_diff->data_type()); + prediction_diff->shape_view().elem_cnt() * GetSizeOfDataType(prediction_diff->data_type()); Memset(ctx->stream(), prediction_diff->mut_dptr(), 0, prediction_diff_bytes_size); SparseCrossEntropyKernelUtil::ComputeDiff( @@ -196,9 +196,9 @@ class SparseCrossEntropyMsGradKernel final : public user_op::OpKernel { const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* prediction_diff = ctx->Tensor4ArgNameAndIndex("prediction_diff", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prediction->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape().elem_cnt() / num_instances; + const int64_t num_instances = 
label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; const int64_t depth = ctx->Attr("depth"); int64_t lower_bound = 0; if (cache != nullptr) { @@ -208,7 +208,7 @@ class SparseCrossEntropyMsGradKernel final : public user_op::OpKernel { lower_bound = kernel_cache->lower(); } size_t prediction_diff_bytes_size = - prediction_diff->shape().elem_cnt() * GetSizeOfDataType(prediction_diff->data_type()); + prediction_diff->shape_view().elem_cnt() * GetSizeOfDataType(prediction_diff->data_type()); Memset(ctx->stream(), prediction_diff->mut_dptr(), 0, prediction_diff_bytes_size); SparseCrossEntropyKernelUtil::ComputeDiff( diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp index edd2c9732ea..1ca34f9e02f 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cpp @@ -68,9 +68,9 @@ class SparseSoftmaxCrossEntropyKernel final : public user_op::OpKernel, const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prediction->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape().elem_cnt() / num_instances; + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; const int64_t lower_bound = 0; const int64_t depth = ctx->Attr("depth"); @@ -147,13 +147,13 @@ class SparseSoftmaxCrossEntropyGradKernel final : public user_op::OpKernel, const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); user_op::Tensor* prediction_diff = ctx->Tensor4ArgNameAndIndex("prediction_diff", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prob->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prob->shape().elem_cnt() / num_instances; + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prob->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prob->shape_view().elem_cnt() / num_instances; const int64_t lower_bound = 0; const int64_t depth = ctx->Attr("depth"); SparseSoftmaxCrossEntropyKernelUtil::ComputeDiff( - ctx->stream(), prediction_diff->shape().elem_cnt(), num_classes, depth, lower_bound, + ctx->stream(), prediction_diff->shape_view().elem_cnt(), num_classes, depth, lower_bound, prob->dptr(), label->dptr(), dy->dptr(), prediction_diff->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -187,9 +187,9 @@ class SparseSoftmaxCrossEntropyMsGradKernel final : public user_op::OpKernel { const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); user_op::Tensor* prediction_diff = ctx->Tensor4ArgNameAndIndex("prediction_diff", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prob->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prob->shape().elem_cnt() / num_instances; + const int64_t num_instances = 
label->shape_view().elem_cnt(); + CHECK_EQ(prob->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prob->shape_view().elem_cnt() / num_instances; const int64_t depth = ctx->Attr("depth"); int64_t lower_bound = 0; if (cache != nullptr) { @@ -199,7 +199,7 @@ class SparseSoftmaxCrossEntropyMsGradKernel final : public user_op::OpKernel { lower_bound = kernel_cache->lower(); } SparseCrossEntropyKernelUtil::ComputeDiffWithSoftmax( - ctx->stream(), prediction_diff->shape().elem_cnt(), num_classes, depth, lower_bound, + ctx->stream(), prediction_diff->shape_view().elem_cnt(), num_classes, depth, lower_bound, prob->dptr(), label->dptr(), dy->dptr(), prediction_diff->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cu b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cu index fa4c105f73c..74ebf6332e7 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cu +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.cu @@ -100,9 +100,9 @@ class SparseSoftmaxCrossEntropyKernel final : public user_op::OpKernel, user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_instances = label->shape().elem_cnt(); - CHECK_EQ(prediction->shape().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape().elem_cnt() / num_instances; + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; const int64_t lower_bound = 0; const int64_t depth = ctx->Attr("depth"); diff --git a/oneflow/user/kernels/split_like_kernel.cpp b/oneflow/user/kernels/split_like_kernel.cpp index 738235f74ce..9fc017fe649 100644 --- a/oneflow/user/kernels/split_like_kernel.cpp +++ b/oneflow/user/kernels/split_like_kernel.cpp @@ -65,8 +65,8 @@ class SplitLikeKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); const auto axis = ctx->Attr("axis"); - const int64_t in_cols = in_tensor->shape().Count(axis); - const int64_t rows = in_tensor->shape().elem_cnt() / in_cols; + const int64_t in_cols = in_tensor->shape_view().Count(axis); + const int64_t rows = in_tensor->shape_view().elem_cnt() / in_cols; CHECK_GT(rows, 0); auto primitive = NewCopyNdPrimitive(ctx); @@ -75,8 +75,8 @@ class SplitLikeKernel final : public user_op::OpKernel { for (const auto& out_arg_pair : ctx->outputs()) { user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex(out_arg_pair.first, out_arg_pair.second); - const int64_t out_cols = out_tensor->shape().Count(axis); - CHECK_EQ(out_tensor->shape().elem_cnt(), rows * out_cols); + const int64_t out_cols = out_tensor->shape_view().Count(axis); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), rows * out_cols); if (out_cols > 0) { DimVector dst_shape = {rows, out_cols}; DimVector dst_pos_vec = {0, 0}; diff --git a/oneflow/user/kernels/sqrt_square_sum_kernel.cpp b/oneflow/user/kernels/sqrt_square_sum_kernel.cpp index 4c741594e4b..282ec7b3b5a 100644 --- a/oneflow/user/kernels/sqrt_square_sum_kernel.cpp +++ b/oneflow/user/kernels/sqrt_square_sum_kernel.cpp @@ -43,8 +43,9 @@ class SqrtSquareSumKernel final : public user_op::OpKernel, public user_op::Cuda user_op::Tensor* y = 
ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* tmp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - SqrtSquareSumKernelUtil::SqrtSquareSum( - ctx->stream(), x->shape().elem_cnt(), x->dptr(), y->mut_dptr(), tmp->mut_dptr()); + SqrtSquareSumKernelUtil::SqrtSquareSum(ctx->stream(), + x->shape_view().elem_cnt(), x->dptr(), + y->mut_dptr(), tmp->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/square_sum_kernel.cpp b/oneflow/user/kernels/square_sum_kernel.cpp index 96fe61da092..a84b1f27a14 100644 --- a/oneflow/user/kernels/square_sum_kernel.cpp +++ b/oneflow/user/kernels/square_sum_kernel.cpp @@ -33,7 +33,7 @@ class SquareSumKernel final : public user_op::OpKernel, public user_op::CudaGrap const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - SquareSumKernelUtil::SquareSum(ctx->stream(), x->shape().elem_cnt(), + SquareSumKernelUtil::SquareSum(ctx->stream(), x->shape_view().elem_cnt(), x->dptr(), y->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -60,7 +60,7 @@ class MultiSquareSumKernel final : public user_op::OpKernel, public user_op::Cud params.resize(ctx->input_size("x")); for (int64_t i = 0; i < params.size(); ++i) { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", i); - params[i].count = x->shape().elem_cnt(); + params[i].count = x->shape_view().elem_cnt(); params[i].ptr = x->dptr(); } user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); diff --git a/oneflow/user/kernels/ssp_variable_proxy_kernel.cpp b/oneflow/user/kernels/ssp_variable_proxy_kernel.cpp index 25151f1cbfb..baa3bf6c0c6 100644 --- a/oneflow/user/kernels/ssp_variable_proxy_kernel.cpp +++ b/oneflow/user/kernels/ssp_variable_proxy_kernel.cpp @@ -32,8 +32,8 @@ class SspVariableProxyKernel final : public user_op::OpKernel { const user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); CHECK_EQ(var->dptr(), ref->dptr()); user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); - const ShapeView& in_shape = ref->shape(); - CHECK_EQ(value->shape(), in_shape); + const ShapeView& in_shape = ref->shape_view(); + CHECK_EQ(value->shape_view(), in_shape); const DataType in_data_type = ref->data_type(); CHECK_EQ(value->data_type(), in_data_type); Memcpy(ctx->stream(), value->mut_dptr(), ref->dptr(), diff --git a/oneflow/user/kernels/stack_kernel.cpp b/oneflow/user/kernels/stack_kernel.cpp index c254faff140..57fe4f800dc 100644 --- a/oneflow/user/kernels/stack_kernel.cpp +++ b/oneflow/user/kernels/stack_kernel.cpp @@ -71,10 +71,10 @@ class StackKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - if (out_tensor->shape().elem_cnt() == 0) { return; } + if (out_tensor->shape_view().elem_cnt() == 0) { return; } const int64_t axis = ctx->Attr("axis"); - const int64_t out_cols = out_tensor->shape().Count(axis); - const int64_t rows = out_tensor->shape().Count(0, axis); + const int64_t out_cols = out_tensor->shape_view().Count(axis); + const int64_t rows = out_tensor->shape_view().Count(0, axis); CHECK_GT(rows, 0) << "The multiplicative from axis 0 to axis " << axis - 1 << " should be greater than 0. 
"; auto primitive = NewCopyNdPrimitive(ctx); @@ -83,9 +83,9 @@ class StackKernel final : public user_op::OpKernel { for (const auto& in_arg_pair : ctx->inputs()) { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex(in_arg_pair.first, in_arg_pair.second); - if (in_tensor->shape().elem_cnt() == 0) { continue; } - const int64_t in_cols = in_tensor->shape().Count(axis); - CHECK_EQ(in_tensor->shape().elem_cnt(), rows * in_cols) + if (in_tensor->shape_view().elem_cnt() == 0) { continue; } + const int64_t in_cols = in_tensor->shape_view().Count(axis); + CHECK_EQ(in_tensor->shape_view().elem_cnt(), rows * in_cols) << "The element count of input tensor is not equal to `rows * in_cols`. "; if (in_cols > 0) { DimVector dst_shape = {rows, out_cols}; @@ -172,8 +172,8 @@ class StackGradKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("in", 0); const int64_t axis = ctx->Attr("axis"); - const int64_t in_cols = in_tensor->shape().Count(axis); - const int64_t rows = in_tensor->shape().Count(0, axis); + const int64_t in_cols = in_tensor->shape_view().Count(axis); + const int64_t rows = in_tensor->shape_view().Count(0, axis); CHECK_GT(rows, 0) << "The multiplicative from axis 0 to axis " << axis - 1 << " should be greater than 0. "; auto primitive = NewCopyNdPrimitive(ctx); @@ -182,8 +182,8 @@ class StackGradKernel final : public user_op::OpKernel { for (const auto& out_arg_pair : ctx->outputs()) { user_op::Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex(out_arg_pair.first, out_arg_pair.second); - const int64_t out_cols = out_tensor->shape().Count(axis); - CHECK_EQ(out_tensor->shape().elem_cnt(), rows * out_cols) + const int64_t out_cols = out_tensor->shape_view().Count(axis); + CHECK_EQ(out_tensor->shape_view().elem_cnt(), rows * out_cols) << "The element count of output tensor is not equal to `rows * out_cols`. "; if (out_cols > 0) { DimVector dst_shape = {rows, out_cols}; diff --git a/oneflow/user/kernels/stateful_local_opkernel.cpp b/oneflow/user/kernels/stateful_opkernel.cpp similarity index 96% rename from oneflow/user/kernels/stateful_local_opkernel.cpp rename to oneflow/user/kernels/stateful_opkernel.cpp index 629a795240a..6afbc1bbd07 100644 --- a/oneflow/user/kernels/stateful_local_opkernel.cpp +++ b/oneflow/user/kernels/stateful_opkernel.cpp @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "oneflow/user/kernels/stateful_local_opkernel.h" +#include "oneflow/user/kernels/stateful_opkernel.h" #include "oneflow/core/framework/attr_value_accessor.h" #include "oneflow/core/framework/user_op_conf.h" #include "oneflow/core/framework/user_op_registry_manager.h" @@ -370,12 +370,12 @@ Maybe InitTensorTupleIndexes4Bns(const std::shared_ptr return Maybe::Ok(); } -/* static */ Maybe StatefulLocalOpKernel::New( +/* static */ Maybe StatefulOpKernel::New( const std::shared_ptr& op_conf, const Symbol& stream, const AttrMap& base_attrs, const std::shared_ptr& parallel_desc, const std::shared_ptr& input_arg_tuple, const std::shared_ptr& output_arg_tuple) { - auto opkernel = std::shared_ptr(new StatefulLocalOpKernel()); + auto opkernel = std::shared_ptr(new StatefulOpKernel()); opkernel->op_conf_ = op_conf; opkernel->user_op_conf_.reset(new user_op::UserOpConfWrapper(op_conf)); opkernel->stream_ = stream; @@ -419,9 +419,9 @@ Maybe InitTensorTupleIndexes4Bns(const std::shared_ptr return opkernel; } -StatefulLocalOpKernel::~StatefulLocalOpKernel() = default; +StatefulOpKernel::~StatefulOpKernel() = default; -Maybe StatefulLocalOpKernel::ChooseOpKernel( +Maybe StatefulOpKernel::ChooseOpKernel( const user_op::OpKernel** user_opkernel, bool* need_temp_storage, const AttrMap& attrs, EagerBlobObjectListRawPtr inputs, EagerBlobObjectListRawPtr outputs, ConsistentTensorInferResultRawPtr consistent_tensor_infer_result) { @@ -463,7 +463,7 @@ Maybe StatefulLocalOpKernel::ChooseOpKernel( return Maybe::Ok(); } -void StatefulLocalOpKernel::TryInitOpKernelStateAndCache( +void StatefulOpKernel::TryInitOpKernelStateAndCache( const user_op::OpKernel* op_kernel, DeviceCtx* device_ctx, EagerBlobObjectListRawPtr inputs, EagerBlobObjectListRawPtr outputs, ConsistentTensorInferResultRawPtr consistent_tensor_infer_result, @@ -490,24 +490,20 @@ void StatefulLocalOpKernel::TryInitOpKernelStateAndCache( } } -const user_op::InferTmpSizeFn& StatefulLocalOpKernel::GetInferTmpSizeFn( +const user_op::InferTmpSizeFn& StatefulOpKernel::GetInferTmpSizeFn( const user_op::OpKernel* op_kernel) const { return *infer_tmp_size_fn_map_.at(op_kernel); } -vm::EagerBlobObject* StatefulLocalOpKernel::mut_temp_blob_object() { - return tmp_blob_object_.get(); -} +vm::EagerBlobObject* StatefulOpKernel::mut_temp_blob_object() { return tmp_blob_object_.get(); } -user_op::TensorDescInferFn StatefulLocalOpKernel::TensorDescInferFn() const { +user_op::TensorDescInferFn StatefulOpKernel::TensorDescInferFn() const { return tensor_desc_infer_fn_; } -user_op::DataTypeInferFn StatefulLocalOpKernel::DataTypeInferFn() const { - return data_type_infer_fn_; -} +user_op::DataTypeInferFn StatefulOpKernel::DataTypeInferFn() const { return data_type_infer_fn_; } -LocalUserKernelComputeContext* StatefulLocalOpKernel::UpdateComputeContext( +LocalUserKernelComputeContext* StatefulOpKernel::UpdateComputeContext( EagerBlobObjectListRawPtr inputs, EagerBlobObjectListRawPtr outputs, ConsistentTensorInferResultRawPtr consistent_tensor_infer_result, DeviceCtx* device_ctx) { compute_ctx_->Update(inputs, outputs, consistent_tensor_infer_result, device_ctx); diff --git a/oneflow/user/kernels/stateful_local_opkernel.h b/oneflow/user/kernels/stateful_opkernel.h similarity index 94% rename from oneflow/user/kernels/stateful_local_opkernel.h rename to oneflow/user/kernels/stateful_opkernel.h index c4b0e306169..063e1c07fd0 100644 --- a/oneflow/user/kernels/stateful_local_opkernel.h +++ b/oneflow/user/kernels/stateful_opkernel.h @@ -13,8 +13,8 @@ WITHOUT WARRANTIES 
OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_USER_KERNELS_STATEFUL_LOCAL_OPKERNEL_H_ -#define ONEFLOW_USER_KERNELS_STATEFUL_LOCAL_OPKERNEL_H_ +#ifndef ONEFLOW_USER_KERNELS_STATEFUL_OPKERNEL_H_ +#define ONEFLOW_USER_KERNELS_STATEFUL_OPKERNEL_H_ #include "oneflow/core/eager/eager_blob_object.h" #include "oneflow/core/framework/tensor_meta.h" @@ -30,7 +30,7 @@ namespace oneflow { class AttrMap; namespace vm { -struct LocalCallOpKernelUtil; +struct OpCallInstructionUtil; } // namespace vm namespace one { @@ -52,11 +52,9 @@ class EagerBlobObjectTensorView final : public user_op::Tensor { EagerBlobObjectTensorView(const std::function& mut_eager_blob_object) : mut_eager_blob_object_(mut_eager_blob_object) {} - ShapeView shape() const override { return mut_eager_blob_object_()->shape().ToShapeView(); } + ShapeView shape_view() const override { return mut_eager_blob_object_()->shape(); } - MutShapeView mut_shape() override { - return mut_eager_blob_object_()->mut_shape().ToMutShapeView(); - } + MutShapeView mut_shape_view() override { return mut_eager_blob_object_()->mut_shape(); } const Stride& stride() const override { return mut_eager_blob_object_()->stride(); } @@ -384,15 +382,15 @@ class LocalUserKernelComputeContext final : public user_op::KernelComputeContext LocalUserKernelBaseContext base_ctx_; }; -class StatefulLocalOpKernel final { +class StatefulOpKernel final { public: - OF_DISALLOW_COPY_AND_MOVE(StatefulLocalOpKernel); - static Maybe New(const std::shared_ptr& op_conf, - const Symbol& stream, const AttrMap& base_attrs, - const std::shared_ptr& parallel_desc, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple); - ~StatefulLocalOpKernel(); + OF_DISALLOW_COPY_AND_MOVE(StatefulOpKernel); + static Maybe New(const std::shared_ptr& op_conf, + const Symbol& stream, const AttrMap& base_attrs, + const std::shared_ptr& parallel_desc, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple); + ~StatefulOpKernel(); const Symbol& stream() const { return stream_; } const std::shared_ptr& mem_case() const { return stream_->device()->mem_case(); } const std::string& op_type_name() const { return op_conf_->user_conf().op_type_name(); } @@ -431,8 +429,8 @@ class StatefulLocalOpKernel final { const OperatorConf& op_conf() const { return *op_conf_; } private: - friend struct vm::LocalCallOpKernelUtil; - StatefulLocalOpKernel() = default; + friend struct vm::OpCallInstructionUtil; + StatefulOpKernel() = default; LocalUserKernelComputeContext* UpdateComputeContext( EagerBlobObjectListRawPtr inputs, EagerBlobObjectListRawPtr outputs, ConsistentTensorInferResultRawPtr consistent_tensor_infer_result, DeviceCtx* device_ctx); @@ -489,4 +487,4 @@ class StatefulLocalOpKernel final { } // namespace oneflow -#endif // ONEFLOW_USER_KERNELS_STATEFUL_LOCAL_OPKERNEL_H_ +#endif // ONEFLOW_USER_KERNELS_STATEFUL_OPKERNEL_H_ diff --git a/oneflow/user/kernels/summary_kernels.cpp b/oneflow/user/kernels/summary_kernels.cpp index 5ad7d947fb6..27252c67854 100644 --- a/oneflow/user/kernels/summary_kernels.cpp +++ b/oneflow/user/kernels/summary_kernels.cpp @@ -44,7 +44,7 @@ class SummaryWriteScalar final : public user_op::OpKernel { CHECK_NOTNULL(istep); int8_t* ctag = const_cast(tag->dptr()); CHECK_NOTNULL(ctag); - std::string tag_str(reinterpret_cast(ctag), tag->shape().elem_cnt()); + std::string tag_str(reinterpret_cast(ctag), 
tag->shape_view().elem_cnt()); EventWriterHelper::WriteScalarToFile( istep[0], static_cast(tvalue[0]), tag_str); } @@ -110,7 +110,7 @@ class SummaryWriteHistogram final : public user_op::OpKernel { CHECK_NOTNULL(istep); int8_t* ctag = const_cast(tag->dptr()); CHECK_NOTNULL(ctag); - std::string tag_str(reinterpret_cast(ctag), tag->shape().elem_cnt()); + std::string tag_str(reinterpret_cast(ctag), tag->shape_view().elem_cnt()); EventWriterHelper::WriteHistogramToFile(static_cast(istep[0]), *value, tag_str); } @@ -144,7 +144,7 @@ class SummaryWritePb final : public user_op::OpKernel { CHECK_NOTNULL(istep); int8_t* cvalue = const_cast(value->dptr()); CHECK_NOTNULL(cvalue); - std::string value_str(reinterpret_cast(cvalue), value->shape().elem_cnt()); + std::string value_str(reinterpret_cast(cvalue), value->shape_view().elem_cnt()); EventWriterHelper::WritePbToFile(istep[0], value_str); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } @@ -170,7 +170,7 @@ class SummaryWriteImage final : public user_op::OpKernel { CHECK_NOTNULL(istep); char* ctag = const_cast(tag->dptr()); CHECK_NOTNULL(ctag); - std::string tag_str(ctag, tag->shape().elem_cnt()); + std::string tag_str(ctag, tag->shape_view().elem_cnt()); EventWriterHelper::WriteImageToFile(static_cast(istep[0]), *value, tag_str); } diff --git a/oneflow/user/kernels/tanh_grad_kernel.cu b/oneflow/user/kernels/tanh_grad_kernel.cu index 42d6cfb3d49..725fa2613ac 100644 --- a/oneflow/user/kernels/tanh_grad_kernel.cu +++ b/oneflow/user/kernels/tanh_grad_kernel.cu @@ -74,7 +74,7 @@ class TanhGradGPUKernel final : public OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); const T* x_ptr = reinterpret_cast(x->dptr()); const T* dy_ptr = reinterpret_cast(dy->dptr()); T* dx_ptr = reinterpret_cast(dx->mut_dptr()); diff --git a/oneflow/user/kernels/tanh_kernel.cpp b/oneflow/user/kernels/tanh_kernel.cpp index 6290aa58a3a..70e25f931d1 100644 --- a/oneflow/user/kernels/tanh_kernel.cpp +++ b/oneflow/user/kernels/tanh_kernel.cpp @@ -31,7 +31,7 @@ class CpuTanhGradKernel final : public user_op::OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); const T* x_ptr = x->dptr(); const T* dy_ptr = dy->dptr(); T* dx_ptr = dx->mut_dptr(); diff --git a/oneflow/user/kernels/tensor_buffer_kernels.cpp b/oneflow/user/kernels/tensor_buffer_kernels.cpp index 0d1101fc693..9b6ba9fba97 100644 --- a/oneflow/user/kernels/tensor_buffer_kernels.cpp +++ b/oneflow/user/kernels/tensor_buffer_kernels.cpp @@ -31,9 +31,9 @@ class TensorBufferToTensorKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); CHECK_EQ(in->data_type(), DataType::kTensorBuffer); - const ShapeView& out_shape = out->shape(); + const ShapeView& out_shape = out->shape_view(); const auto& instance_shape = ctx->Attr("instance_shape"); 
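// The shape contract checked just below: the output shape is the input tensor-buffer
// shape extended by the `instance_shape` attribute, e.g. an input of shape (N,) holding
// TensorBuffers with instance_shape (H, W) yields an output of shape (N, H, W). Each
// buffer must then match instance_shape, the expected dtype, and instance_size bytes
// before its payload is memcpy'd into slot i of the output.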
CHECK_EQ(out_shape.NumAxes(), in_shape.NumAxes() + instance_shape.NumAxes()); FOR_RANGE(int64_t, i, 0, in_shape.NumAxes()) { CHECK_EQ(out_shape.At(i), in_shape.At(i)); } @@ -49,7 +49,7 @@ class TensorBufferToTensorKernel final : public user_op::OpKernel { const TensorBuffer* tensor_buffer = in_ptr + i; CHECK_EQ(tensor_buffer->nbytes(), instance_size); CHECK_EQ(tensor_buffer->data_type(), data_type); - CHECK(tensor_buffer->shape() == instance_shape); + CHECK(tensor_buffer->shape_view() == instance_shape); Memcpy(ctx->stream(), out_ptr + i * instance_size, tensor_buffer->data(), instance_size); }); @@ -71,8 +71,8 @@ class TensorToTensorBufferKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); - const ShapeView& out_shape = out->shape(); + const ShapeView& in_shape = in->shape_view(); + const ShapeView& out_shape = out->shape_view(); const auto instance_dims = ctx->Attr("instance_dims"); CHECK_LT(instance_dims, in_shape.NumAxes()); FOR_RANGE(int64_t, i, 0, in_shape.NumAxes() - instance_dims) { @@ -150,21 +150,21 @@ class TensorBufferToListOfTensors final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - CHECK_GT(in->shape().elem_cnt(), 0); + CHECK_GT(in->shape_view().elem_cnt(), 0); CHECK_EQ(in->data_type(), DataType::kTensorBuffer); const DataType out_dtype = ctx->Attr("out_dtype"); CHECK(IsPODDataType(out_dtype)); const bool dynamic_out = ctx->Attr("dynamic_out"); const auto* in_ptr = in->dptr(); - MultiThreadLoop(in->shape().elem_cnt(), [&](size_t i) { + MultiThreadLoop(in->shape_view().elem_cnt(), [&](size_t i) { const TensorBuffer* tensor_buffer = in_ptr + i; user_op::Tensor* out_i = ctx->Tensor4ArgNameAndIndex("out", i); CHECK_EQ(out_dtype, tensor_buffer->data_type()); if (dynamic_out) { - CHECK_LE(tensor_buffer->shape().elem_cnt(), out_i->shape().elem_cnt()); - out_i->mut_shape().set_shape(tensor_buffer->shape()); + CHECK_LE(tensor_buffer->shape_view().elem_cnt(), out_i->shape_view().elem_cnt()); + out_i->mut_shape_view().set_shape(tensor_buffer->shape_view()); } else { - CHECK_EQ(tensor_buffer->shape().elem_cnt(), out_i->shape().elem_cnt()); + CHECK_EQ(tensor_buffer->shape_view().elem_cnt(), out_i->shape_view().elem_cnt()); } Memcpy(ctx->stream(), out_i->mut_dptr(), tensor_buffer->data(), tensor_buffer->nbytes()); @@ -186,21 +186,21 @@ class TensorBufferToListOfTensorsV2 final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - CHECK_GT(in->shape().elem_cnt(), 0); + CHECK_GT(in->shape_view().elem_cnt(), 0); CHECK_EQ(in->data_type(), DataType::kTensorBuffer); const std::vector& out_dtypes = ctx->Attr>("out_dtypes"); const bool dynamic_out = ctx->Attr("dynamic_out"); const auto* in_ptr = in->dptr(); - MultiThreadLoop(in->shape().elem_cnt(), [&](size_t i) { + MultiThreadLoop(in->shape_view().elem_cnt(), [&](size_t i) { CHECK(IsPODDataType(out_dtypes[i])); const TensorBuffer* tensor_buffer = in_ptr + i; user_op::Tensor* out_i = ctx->Tensor4ArgNameAndIndex("out", i); CHECK_EQ(out_dtypes[i], tensor_buffer->data_type()); if (dynamic_out) { - CHECK_LE(tensor_buffer->shape().elem_cnt(), out_i->shape().elem_cnt()); - 
out_i->mut_shape().set_shape(tensor_buffer->shape()); + CHECK_LE(tensor_buffer->shape_view().elem_cnt(), out_i->shape_view().elem_cnt()); + out_i->mut_shape_view().set_shape(tensor_buffer->shape_view()); } else { - CHECK_EQ(tensor_buffer->shape().elem_cnt(), out_i->shape().elem_cnt()); + CHECK_EQ(tensor_buffer->shape_view().elem_cnt(), out_i->shape_view().elem_cnt()); } Memcpy(ctx->stream(), out_i->mut_dptr(), tensor_buffer->data(), tensor_buffer->nbytes()); diff --git a/oneflow/user/kernels/tf_prelu_kernel.cpp b/oneflow/user/kernels/tf_prelu_kernel.cpp index 7caa42f4f62..7e7e8dd90e4 100644 --- a/oneflow/user/kernels/tf_prelu_kernel.cpp +++ b/oneflow/user/kernels/tf_prelu_kernel.cpp @@ -33,11 +33,11 @@ class TfCpuPReluKernel final : public user_op::OpKernel { const T* x_ptr = x->dptr(); T* y_ptr = y->mut_dptr(); T* broadcasted_alpha_ptr = broadcasted_alpha->mut_dptr(); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape()), x->shape().NumAxes()); + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape(), broadcasted_alpha_ptr), + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha_ptr), XpuVarNdarray(left_extended_shape, alpha->dptr())); FOR_RANGE(int32_t, i, 0, elem_cnt) { y_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : x_ptr[i] * broadcasted_alpha_ptr[i]; @@ -76,16 +76,16 @@ class TfCpuPReluGradKernel final : public user_op::OpKernel { const T* x_ptr = x->dptr(); const T* dy_ptr = dy->dptr(); T* dx_ptr = dx->mut_dptr(); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); T* broadcasted_alpha_ptr = tmp_buffer->mut_dptr(); T* broadcasted_alpha_diff = reinterpret_cast(tmp_buffer->mut_dptr() + GetCudaAlignedSize(elem_cnt * sizeof(T))); T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + 2 * GetCudaAlignedSize(elem_cnt * sizeof(T))); const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape()), x->shape().NumAxes()); + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape(), broadcasted_alpha_ptr), + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha_ptr), XpuVarNdarray(left_extended_shape, alpha->dptr())); FOR_RANGE(int32_t, i, 0, elem_cnt) { dx_ptr[i] = x_ptr[i] > 0 ? 
dy_ptr[i] : dy_ptr[i] * broadcasted_alpha_ptr[i]; @@ -93,8 +93,8 @@ class TfCpuPReluGradKernel final : public user_op::OpKernel { } NdarrayUtil::ReduceSum( ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape(), reduce_sum_tmp_buf)); + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/tf_prelu_kernel.cu b/oneflow/user/kernels/tf_prelu_kernel.cu index 931914bdfef..948016aea5a 100644 --- a/oneflow/user/kernels/tf_prelu_kernel.cu +++ b/oneflow/user/kernels/tf_prelu_kernel.cu @@ -139,10 +139,10 @@ class TfGpuPReluKernel final : public user_op::OpKernel { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); - if (IsAlphaShapeContiguous(alpha->shape(), x->shape())) { - const int32_t outer_size = GetOuterSize(alpha->shape(), x->shape()); - const int32_t alpha_size = alpha->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { + const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); const int32_t inner_size = elem_cnt / outer_size / alpha_size; BroadcastPReluForwardGpu<<stream()->As()->cuda_stream()>>>( @@ -150,9 +150,9 @@ class TfGpuPReluKernel final : public user_op::OpKernel { } else { user_op::Tensor* broadcasted_alpha = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape()), x->shape().NumAxes()); + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape(), broadcasted_alpha->mut_dptr()), + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha->mut_dptr()), XpuVarNdarray(left_extended_shape, alpha->dptr())); ElemwisePReluForwardGpu<<stream()->As()->cuda_stream()>>>( @@ -196,15 +196,15 @@ class TfGpuPReluGradKernel final : public user_op::OpKernel { user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t elem_cnt = x->shape().elem_cnt(); + const int32_t elem_cnt = x->shape_view().elem_cnt(); T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + GetCudaAlignedSize(elem_cnt * sizeof(T))); const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape()), x->shape().NumAxes()); - if (IsAlphaShapeContiguous(alpha->shape(), x->shape())) { - const int32_t outer_size = GetOuterSize(alpha->shape(), x->shape()); - const int32_t alpha_size = alpha->shape().elem_cnt(); + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); + if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { + const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); const int32_t inner_size = elem_cnt / outer_size / alpha_size; 
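// --- Hedged, self-contained sketch, not from this patch: the index arithmetic behind the
// outer_size / alpha_size / inner_size split computed above. When a contiguous alpha
// broadcasts over a tensor viewed as (outer, alpha, inner) in row-major order, the alpha
// entry for flat element i is (i / inner_size) % alpha_size. The shapes and values below
// are illustrative only; the actual CUDA indexing lives in the Broadcast PRelu kernels.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // x: (N, C, H, W) = (2, 3, 2, 2), alpha: (C) = (3)  ->  outer = 2, alpha = 3, inner = 4.
  const int64_t outer_size = 2, alpha_size = 3, inner_size = 4;
  const int64_t elem_cnt = outer_size * alpha_size * inner_size;
  const std::vector<float> alpha = {0.1f, 0.2f, 0.3f};
  std::vector<float> x(elem_cnt, -1.0f);  // all negative, so PReLU scales by alpha
  std::vector<float> y(elem_cnt);
  for (int64_t i = 0; i < elem_cnt; ++i) {
    const int64_t alpha_idx = (i / inner_size) % alpha_size;
    y[i] = x[i] > 0 ? x[i] : x[i] * alpha[alpha_idx];
  }
  // Elements 0..3 use alpha[0], 4..7 use alpha[1], 8..11 use alpha[2], then the cycle repeats.
  std::printf("y[0]=%.2f y[4]=%.2f y[8]=%.2f y[12]=%.2f\n", y[0], y[4], y[8], y[12]);
  return 0;
}
// --- end of sketch ---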
BroadcastPReluBackwardGpu<<stream()->As()->cuda_stream()>>>( @@ -215,7 +215,7 @@ class TfGpuPReluGradKernel final : public user_op::OpKernel { + 2 * GetCudaAlignedSize(elem_cnt * sizeof(T))); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape(), broadcasted_alpha), + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha), XpuVarNdarray(left_extended_shape, alpha->dptr())); ElemwisePReluBackwardGpu<<::ReduceSum( ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape(), reduce_sum_tmp_buf)); + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/to_contiguous_kernel.cpp b/oneflow/user/kernels/to_contiguous_kernel.cpp index 007df254be8..be32746d6c2 100644 --- a/oneflow/user/kernels/to_contiguous_kernel.cpp +++ b/oneflow/user/kernels/to_contiguous_kernel.cpp @@ -85,14 +85,12 @@ class ToContiguousKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape(); - CHECK_EQ(out->shape(), in_shape); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); const DataType in_data_type = in->data_type(); CHECK_EQ(out->data_type(), in_data_type); - const DimVector& stride_vec = in->stride().StrideVec(); - std::vector in_stride(in->stride().NumAxes()); - std::copy(stride_vec.begin(), stride_vec.end(), in_stride.begin()); + std::vector in_stride(in->stride().begin(), in->stride().end()); const char* in_dptr = static_cast(in->raw_dptr()); char* out_dptr = static_cast(out->mut_raw_dptr()); diff --git a/oneflow/user/kernels/top_k_kernel.cpp b/oneflow/user/kernels/top_k_kernel.cpp index 46c9834c5ff..ce898b1e70f 100644 --- a/oneflow/user/kernels/top_k_kernel.cpp +++ b/oneflow/user/kernels/top_k_kernel.cpp @@ -84,12 +84,12 @@ class TopKCpuKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape().elem_cnt() == 0) { return; } + if (in->shape_view().elem_cnt() == 0) { return; } user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t instance_size = in->shape().At(in->shape().NumAxes() - 1); - const int64_t instance_num = in->shape().elem_cnt() / instance_size; + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); int64_t* indices_ptr = tmp_buffer ? 
tmp_buffer->mut_dptr() : nullptr; CpuTopK(ctx->stream(), in->dptr(), indices_ptr, instance_num, instance_size, k, diff --git a/oneflow/user/kernels/transpose_kernel.cpp b/oneflow/user/kernels/transpose_kernel.cpp index f8438fbc102..889a96c1844 100644 --- a/oneflow/user/kernels/transpose_kernel.cpp +++ b/oneflow/user/kernels/transpose_kernel.cpp @@ -50,12 +50,12 @@ class TransposeKernel final : public OpKernel, public user_op::CudaGraphSupport const Tensor* tensor_in = ctx->Tensor4ArgNameAndIndex("input", 0); Tensor* tensor_out = ctx->Tensor4ArgNameAndIndex("output", 0); const auto& perm = ctx->Attr>("perm"); - const ShapeView& in_shape = tensor_in->shape(); + const ShapeView& in_shape = tensor_in->shape_view(); DataType dtype = tensor_out->data_type(); - size_t num_dims = tensor_in->shape().NumAxes(); + size_t num_dims = tensor_in->shape_view().NumAxes(); const int64_t* src_dims = in_shape.ptr(); - int64_t elem_cnt = tensor_out->shape().elem_cnt(); + int64_t elem_cnt = tensor_out->shape_view().elem_cnt(); if (elem_cnt != 0) { if (IsIdentity(perm)) { diff --git a/oneflow/user/kernels/tril_kernel.cpp b/oneflow/user/kernels/tril_kernel.cpp index f0a8f1091ee..038ada3bf8e 100644 --- a/oneflow/user/kernels/tril_kernel.cpp +++ b/oneflow/user/kernels/tril_kernel.cpp @@ -28,7 +28,7 @@ class CpuTrilKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape(); + const auto shape = x->shape_view(); const auto diagonal = ctx->Attr("diagonal"); const int64_t num_rows = shape.At(shape.NumAxes() - 2); const int64_t num_cols = shape.At(shape.NumAxes() - 1); diff --git a/oneflow/user/kernels/tril_kernel.cu b/oneflow/user/kernels/tril_kernel.cu index 5f64d4abdf4..9b7b0214cb3 100644 --- a/oneflow/user/kernels/tril_kernel.cu +++ b/oneflow/user/kernels/tril_kernel.cu @@ -151,7 +151,7 @@ class GpuTrilKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape(); + const auto shape = x->shape_view(); const auto diagonal = ctx->Attr("diagonal"); const int64_t num_rows = shape.At(shape.NumAxes() - 2); const int64_t num_cols = shape.At(shape.NumAxes() - 1); @@ -205,7 +205,7 @@ class GpuFusedScaleTrilKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape(); + const auto shape = x->shape_view(); const auto diagonal = ctx->Attr("diagonal"); const int32_t num_rows = shape.At(shape.NumAxes() - 2); const int32_t num_cols = shape.At(shape.NumAxes() - 1); diff --git a/oneflow/user/kernels/triu_kernel.cpp b/oneflow/user/kernels/triu_kernel.cpp index 4add2ee2c92..f6dce625f32 100644 --- a/oneflow/user/kernels/triu_kernel.cpp +++ b/oneflow/user/kernels/triu_kernel.cpp @@ -27,7 +27,7 @@ class CpuTriuKernel final : public user_op::OpKernel { private: void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape(); + const auto shape = x->shape_view(); const auto diagonal = ctx->Attr("diagonal"); const int64_t num_rows = shape.At(shape.NumAxes() - 2); const int64_t num_cols = shape.At(shape.NumAxes() - 1); diff --git 
a/oneflow/user/kernels/triu_kernel.cu b/oneflow/user/kernels/triu_kernel.cu index 79b103d8161..93e53bc4388 100644 --- a/oneflow/user/kernels/triu_kernel.cu +++ b/oneflow/user/kernels/triu_kernel.cu @@ -86,7 +86,7 @@ class GpuTriuKernel final : public user_op::OpKernel { using user_op::OpKernel::Compute; void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape(); + const auto shape = x->shape_view(); const auto diagonal = ctx->Attr("diagonal"); const int64_t num_rows = shape.At(shape.NumAxes() - 2); const int64_t num_cols = shape.At(shape.NumAxes() - 1); diff --git a/oneflow/user/kernels/tuple_identity_kernel.cpp b/oneflow/user/kernels/tuple_identity_kernel.cpp index 7eaaf756481..44d9c4520e2 100644 --- a/oneflow/user/kernels/tuple_identity_kernel.cpp +++ b/oneflow/user/kernels/tuple_identity_kernel.cpp @@ -35,8 +35,8 @@ class TupleIdentityKernel final : public user_op::OpKernel { user_op::Tensor* out_i = ctx->Tensor4ArgNameAndIndex("out", i); const DataType data_type = in_i->data_type(); CHECK_EQ(out_i->data_type(), data_type); - const ShapeView& shape = in_i->shape(); - CHECK_EQ(out_i->shape(), shape); + const ShapeView& shape = in_i->shape_view(); + CHECK_EQ(out_i->shape_view(), shape); Memcpy(ctx->stream(), out_i->mut_dptr(), in_i->dptr(), shape.elem_cnt() * GetSizeOfDataType(data_type)); } diff --git a/oneflow/user/kernels/two_stage_reduce_kernel.cpp b/oneflow/user/kernels/two_stage_reduce_kernel.cpp index a0298d3e19c..c76eaa9749d 100644 --- a/oneflow/user/kernels/two_stage_reduce_kernel.cpp +++ b/oneflow/user/kernels/two_stage_reduce_kernel.cpp @@ -39,28 +39,28 @@ class ReduceDeviceStageKernel final : public OpKernel { T* reduce_tmp_buf = tmp_buffer->mut_dptr(); int32_t* mask_tmp_buf = tmp_buffer->mut_dptr(); const size_t tmp_bytes = - GetCudaAlignedSize(in->shape().elem_cnt() * std::max(sizeof(T), sizeof(int32_t))); + GetCudaAlignedSize(in->shape_view().elem_cnt() * std::max(sizeof(T), sizeof(int32_t))); int32_t* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + tmp_bytes); NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(out->shape(), out->mut_dptr()), - XpuVarNdarray(in->shape(), in->dptr()), - XpuVarNdarray(in->shape(), reduce_tmp_buf)); + ctx->stream(), XpuVarNdarray(out->shape_view(), out->mut_dptr()), + XpuVarNdarray(in->shape_view(), in->dptr()), + XpuVarNdarray(in->shape_view(), reduce_tmp_buf)); NdarrayUtil::BroadcastEQ( - ctx->stream(), XpuVarNdarray(mask->shape(), mask->mut_dptr()), - XpuVarNdarray(in->shape(), in->dptr()), - XpuVarNdarray(out->shape(), out->dptr())); + ctx->stream(), XpuVarNdarray(mask->shape_view(), mask->mut_dptr()), + XpuVarNdarray(in->shape_view(), in->dptr()), + XpuVarNdarray(out->shape_view(), out->dptr())); auto cast = ep::primitive::NewPrimitive( ctx->device_type(), DataType::kInt8, DataType::kInt32); CHECK(cast); - cast->Launch(ctx->stream(), mask->dptr(), mask_tmp_buf, mask->shape().elem_cnt()); + cast->Launch(ctx->stream(), mask->dptr(), mask_tmp_buf, mask->shape_view().elem_cnt()); NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(count->shape(), count->mut_dptr()), - XpuVarNdarray(mask->shape(), mask_tmp_buf), - XpuVarNdarray(mask->shape(), reduce_sum_tmp_buf)); + ctx->stream(), XpuVarNdarray(count->shape_view(), count->mut_dptr()), + XpuVarNdarray(mask->shape_view(), mask_tmp_buf), + XpuVarNdarray(mask->shape_view(), reduce_sum_tmp_buf)); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ 
-104,20 +104,20 @@ class ReduceDeviceStageGradKernel final : public OpKernel { user_op::Tensor* in_diff = ctx->Tensor4ArgNameAndIndex("in_diff", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); T* tmp_buf_ptr = tmp_buffer->mut_dptr(); - const size_t tmp_bytes = GetCudaAlignedSize(out_diff->shape().elem_cnt() * sizeof(T)); + const size_t tmp_bytes = GetCudaAlignedSize(out_diff->shape_view().elem_cnt() * sizeof(T)); T* broadcasted_tmp_buf_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + tmp_bytes); TwoStageReduceKernelUtil::Divide( - ctx->stream(), out_diff->shape().elem_cnt(), out_diff->dptr(), count->dptr(), - tmp_buf_ptr); + ctx->stream(), out_diff->shape_view().elem_cnt(), out_diff->dptr(), + count->dptr(), tmp_buf_ptr); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(in_diff->shape(), broadcasted_tmp_buf_ptr), - XpuVarNdarray(out_diff->shape(), tmp_buf_ptr)); + ctx->stream(), XpuVarNdarray(in_diff->shape_view(), broadcasted_tmp_buf_ptr), + XpuVarNdarray(out_diff->shape_view(), tmp_buf_ptr)); TwoStageReduceKernelUtil::Mask( - ctx->stream(), in_diff->shape().elem_cnt(), broadcasted_tmp_buf_ptr, mask->dptr(), - in_diff->mut_dptr()); + ctx->stream(), in_diff->shape_view().elem_cnt(), broadcasted_tmp_buf_ptr, + mask->dptr(), in_diff->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -161,15 +161,15 @@ class ReduceGlobalStageKernel final : public OpKernel { user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const auto& axis = ctx->Attr>("axis"); - const Shape& reduced_shape = CreateReducedShape(in->shape(), {axis.begin(), axis.end()}); + const Shape& reduced_shape = CreateReducedShape(in->shape_view(), {axis.begin(), axis.end()}); NdarrayReduce::Reduce( ctx->stream(), XpuVarNdarray(reduced_shape, out->mut_dptr()), - XpuVarNdarray(in->shape(), in->dptr()), - XpuVarNdarray(in->shape(), tmp_buffer->mut_dptr())); + XpuVarNdarray(in->shape_view(), in->dptr()), + XpuVarNdarray(in->shape_view(), tmp_buffer->mut_dptr())); NdarrayUtil::BroadcastEQ( - ctx->stream(), XpuVarNdarray(in->shape(), mask->mut_dptr()), - XpuVarNdarray(in->shape(), in->dptr()), + ctx->stream(), XpuVarNdarray(in->shape_view(), mask->mut_dptr()), + XpuVarNdarray(in->shape_view(), in->dptr()), XpuVarNdarray(reduced_shape, out->dptr())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -207,46 +207,47 @@ class ReduceGlobalStageGradKernel final : public OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); int32_t* device_count_with_mask = tmp_buffer->mut_dptr(); const size_t device_count_with_mask_bytes = - GetCudaAlignedSize(device_count->shape().elem_cnt() * sizeof(int32_t)); + GetCudaAlignedSize(device_count->shape_view().elem_cnt() * sizeof(int32_t)); int32_t* global_count = reinterpret_cast(tmp_buffer->mut_dptr() + device_count_with_mask_bytes); const size_t global_count_bytes = - GetCudaAlignedSize(out_diff->shape().elem_cnt() * sizeof(int32_t)); + GetCudaAlignedSize(out_diff->shape_view().elem_cnt() * sizeof(int32_t)); int32_t* reduce_sum_tmp_buf = reinterpret_cast( tmp_buffer->mut_dptr() + device_count_with_mask_bytes + global_count_bytes); const size_t reduce_sum_tmp_bytes = - GetCudaAlignedSize(device_count->shape().elem_cnt() * sizeof(int32_t)); + GetCudaAlignedSize(device_count->shape_view().elem_cnt() * sizeof(int32_t)); T* divided_buf_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + 
device_count_with_mask_bytes + global_count_bytes + reduce_sum_tmp_bytes); - const size_t divided_buf_bytes = GetCudaAlignedSize(out_diff->shape().elem_cnt() * sizeof(T)); + const size_t divided_buf_bytes = + GetCudaAlignedSize(out_diff->shape_view().elem_cnt() * sizeof(T)); T* broadcasted_divided_buf_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + device_count_with_mask_bytes + global_count_bytes + reduce_sum_tmp_bytes + divided_buf_bytes); TwoStageReduceKernelUtil::Mask( - ctx->stream(), device_count->shape().elem_cnt(), device_count->dptr(), + ctx->stream(), device_count->shape_view().elem_cnt(), device_count->dptr(), mask->dptr(), device_count_with_mask); const auto& axis = ctx->Attr>("axis"); const Shape& reduced_shape = - CreateReducedShape(device_count->shape(), {axis.begin(), axis.end()}); + CreateReducedShape(device_count->shape_view(), {axis.begin(), axis.end()}); NdarrayUtil::ReduceSum( ctx->stream(), XpuVarNdarray(reduced_shape, global_count), - XpuVarNdarray(device_count->shape(), device_count_with_mask), - XpuVarNdarray(device_count->shape(), reduce_sum_tmp_buf)); + XpuVarNdarray(device_count->shape_view(), device_count_with_mask), + XpuVarNdarray(device_count->shape_view(), reduce_sum_tmp_buf)); TwoStageReduceKernelUtil::Divide( - ctx->stream(), out_diff->shape().elem_cnt(), out_diff->dptr(), global_count, + ctx->stream(), out_diff->shape_view().elem_cnt(), out_diff->dptr(), global_count, divided_buf_ptr); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(in_diff->shape(), broadcasted_divided_buf_ptr), - XpuVarNdarray(out_diff->shape(), divided_buf_ptr)); + ctx->stream(), XpuVarNdarray(in_diff->shape_view(), broadcasted_divided_buf_ptr), + XpuVarNdarray(out_diff->shape_view(), divided_buf_ptr)); TwoStageReduceKernelUtil::Scale( - ctx->stream(), in_diff->shape().elem_cnt(), broadcasted_divided_buf_ptr, + ctx->stream(), in_diff->shape_view().elem_cnt(), broadcasted_divided_buf_ptr, device_count_with_mask, in_diff->mut_dptr()); } diff --git a/oneflow/user/kernels/unfold_kernel.cpp b/oneflow/user/kernels/unfold_kernel.cpp index b883d111277..b84f146cfea 100644 --- a/oneflow/user/kernels/unfold_kernel.cpp +++ b/oneflow/user/kernels/unfold_kernel.cpp @@ -69,7 +69,7 @@ class UnfoldKernel final : public OpKernel { const std::vector dilation = ctx->Attr>("dilation_rate"); const auto& state_ptr = CreateUnfoldOpKernelState( - input->shape(), kernel_size, padding, stride, dilation); + input->shape_view(), kernel_size, padding, stride, dilation); const UnfoldParams params = state_ptr->params(); UnfoldKernelUtil::Forward( diff --git a/oneflow/user/kernels/unfold_tensor_kernel.cpp b/oneflow/user/kernels/unfold_tensor_kernel.cpp index 7b004413215..3727cd6d422 100644 --- a/oneflow/user/kernels/unfold_tensor_kernel.cpp +++ b/oneflow/user/kernels/unfold_tensor_kernel.cpp @@ -31,10 +31,12 @@ class UnfoldTensorKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("y", 0); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); std::vector out_shape; - out_shape.resize(out->shape().NumAxes()); - for (int i = 0; i < out->shape().NumAxes(); ++i) { out_shape[i] = out->shape().At(i); } + out_shape.resize(out->shape_view().NumAxes()); + for (int i = 0; i < out->shape_view().NumAxes(); ++i) { + out_shape[i] = out->shape_view().At(i); + } const int32_t in_dims = in_shape.NumAxes(); const int32_t out_dims = out_shape.size(); @@ -58,7 +60,7 @@ class 
UnfoldTensorKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int32_t out_size = out->shape().elem_cnt(); + const int32_t out_size = out->shape_view().elem_cnt(); for (int32_t i = 0; i < out_size; ++i) { int offset = Offset(i, out_stride.data(), out_shape.data(), out_dims - 1); out_ptr[i] = in_ptr[offset]; @@ -91,7 +93,7 @@ class UnfoldTensorGradKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* din = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); const int32_t in_dims = in_shape.NumAxes(); std::vector din_stride(in_dims, 1); for (int32_t i = in_dims - 2; i >= 0; --i) { @@ -99,8 +101,10 @@ class UnfoldTensorGradKernel final : public user_op::OpKernel { } std::vector dout_shape; - dout_shape.resize(dout->shape().NumAxes()); - for (int i = 0; i < dout->shape().NumAxes(); ++i) { dout_shape[i] = dout->shape().At(i); } + dout_shape.resize(dout->shape_view().NumAxes()); + for (int i = 0; i < dout->shape_view().NumAxes(); ++i) { + dout_shape[i] = dout->shape_view().At(i); + } const int32_t dout_dims = dout_shape.size(); const int32_t dimension = ctx->Attr("dimension"); @@ -119,8 +123,8 @@ class UnfoldTensorGradKernel final : public user_op::OpKernel { const T* dout_ptr = dout->dptr(); T* din_ptr = din->mut_dptr(); - std::fill(din_ptr, din_ptr + din->shape().elem_cnt(), static_cast(0)); - const int32_t dout_size = dout->shape().elem_cnt(); + std::fill(din_ptr, din_ptr + din->shape_view().elem_cnt(), static_cast(0)); + const int32_t dout_size = dout->shape_view().elem_cnt(); for (int32_t i = 0; i < dout_size; ++i) { int offset = Offset(i, dout_stride.data(), dout_shape.data(), dout_dims - 1); din_ptr[offset] += dout_ptr[i]; diff --git a/oneflow/user/kernels/unfold_tensor_kernel.cu b/oneflow/user/kernels/unfold_tensor_kernel.cu index e9ec173c0e7..7b7b9c19d63 100644 --- a/oneflow/user/kernels/unfold_tensor_kernel.cu +++ b/oneflow/user/kernels/unfold_tensor_kernel.cu @@ -97,10 +97,12 @@ class GpuUnfoldTensorKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("y", 0); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); std::vector out_shape; - out_shape.resize(out->shape().NumAxes()); - for (int i = 0; i < out->shape().NumAxes(); ++i) { out_shape[i] = out->shape().At(i); } + out_shape.resize(out->shape_view().NumAxes()); + for (int i = 0; i < out->shape_view().NumAxes(); ++i) { + out_shape[i] = out->shape_view().At(i); + } const int32_t in_dims = in_shape.NumAxes(); const int32_t out_dims = out_shape.size(); const int32_t dimension = ctx->Attr("dimension"); @@ -123,7 +125,7 @@ class GpuUnfoldTensorKernel final : public user_op::OpKernel { const T* in_ptr = in->dptr(); T* out_ptr = out->mut_dptr(); - const int32_t out_size = out->shape().elem_cnt(); + const int32_t out_size = out->shape_view().elem_cnt(); STRIDES out_stride_cuda; for (int i = 0; i < out_dims; ++i) { out_stride_cuda.val[i] = out_stride[i]; } @@ -161,7 +163,7 @@ class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* din = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& in_shape = in->shape(); + const ShapeView& in_shape = in->shape_view(); const int32_t in_dims = in_shape.NumAxes(); 
std::vector din_stride(in_dims, 1); for (int32_t i = in_dims - 2; i >= 0; --i) { @@ -169,8 +171,10 @@ class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { } std::vector dout_shape; - dout_shape.resize(dout->shape().NumAxes()); - for (int i = 0; i < dout->shape().NumAxes(); ++i) { dout_shape[i] = dout->shape().At(i); } + dout_shape.resize(dout->shape_view().NumAxes()); + for (int i = 0; i < dout->shape_view().NumAxes(); ++i) { + dout_shape[i] = dout->shape_view().At(i); + } const int32_t dout_dims = dout_shape.size(); const int32_t dimension = ctx->Attr("dimension"); @@ -193,8 +197,8 @@ class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { const T* dout_ptr = dout->dptr(); T* din_ptr = din->mut_dptr(); - const int32_t dout_size = dout->shape().elem_cnt(); - const int32_t din_size = din->shape().elem_cnt(); + const int32_t dout_size = dout->shape_view().elem_cnt(); + const int32_t din_size = din->shape_view().elem_cnt(); GpuUnfoldTensorGradFunctor()(ctx->stream(), dout_ptr, dout_stride_cuda, dout_shape_cuda, dout_dims, dout_size, din_size, din_ptr); diff --git a/oneflow/user/kernels/unique_with_counts_kernel.cpp b/oneflow/user/kernels/unique_with_counts_kernel.cpp index f41bba322c3..d15cb7a66b1 100644 --- a/oneflow/user/kernels/unique_with_counts_kernel.cpp +++ b/oneflow/user/kernels/unique_with_counts_kernel.cpp @@ -35,9 +35,9 @@ class UniqueWithCountsKernel final : public user_op::OpKernel { user_op::Tensor* num_unique = ctx->Tensor4ArgNameAndIndex("num_unique", 0); user_op::Tensor* tmp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); void* tmp_ptr = tmp ? tmp->mut_dptr() : nullptr; - int64_t tmp_size = tmp ? tmp->shape().elem_cnt() * GetSizeOfDataType(tmp->data_type()) : 0; + int64_t tmp_size = tmp ? tmp->shape_view().elem_cnt() * GetSizeOfDataType(tmp->data_type()) : 0; UniqueKernelUtil::UniqueWithCounts( - ctx->stream(), x->shape().elem_cnt(), x->dptr(), num_unique->mut_dptr(), + ctx->stream(), x->shape_view().elem_cnt(), x->dptr(), num_unique->mut_dptr(), y->mut_dptr(), idx->mut_dptr(), count->mut_dptr(), tmp_ptr, tmp_size); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/unpack_kernel.cpp b/oneflow/user/kernels/unpack_kernel.cpp index 82b85f4acf3..35b18165a44 100644 --- a/oneflow/user/kernels/unpack_kernel.cpp +++ b/oneflow/user/kernels/unpack_kernel.cpp @@ -37,16 +37,16 @@ class UnpackKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, const user_op::OpKernelCache*) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - CHECK_GT(in->shape().NumAxes(), 0); + CHECK_GT(in->shape_view().NumAxes(), 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK_EQ(in->data_type(), out->data_type()); - CHECK_EQ(in->shape().NumAxes(), out->shape().NumAxes()); + CHECK_EQ(in->shape_view().NumAxes(), out->shape_view().NumAxes()); const auto unpack_num = ctx->Attr("unpack_num"); - CHECK_EQ(out->shape().At(0) * unpack_num, in->shape().At(0)); - for (int64_t i = 1; i < in->shape().NumAxes(); ++i) { - CHECK_EQ(out->shape().At(i), in->shape().At(i)); + CHECK_EQ(out->shape_view().At(0) * unpack_num, in->shape_view().At(0)); + for (int64_t i = 1; i < in->shape_view().NumAxes(); ++i) { + CHECK_EQ(out->shape_view().At(i), in->shape_view().At(i)); } - const int64_t copy_size = out->shape().elem_cnt() * GetSizeOfDataType(out->data_type()); + const int64_t copy_size = out->shape_view().elem_cnt() * 
GetSizeOfDataType(out->data_type()); auto* state_wrapper = dynamic_cast>*>(state); CHECK_NOTNULL(state_wrapper); const size_t index = state_wrapper->Get().first; diff --git a/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp b/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp index ec5dde5c346..4fb6f6e9521 100644 --- a/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp +++ b/oneflow/user/kernels/unsorted_batch_segment_sum_kernel.cpp @@ -44,13 +44,14 @@ class UnsortedBatchSegmentSumKernel final : public user_op::OpKernel, const user_op::Tensor* data = ctx->Tensor4ArgNameAndIndex("data", 0); const user_op::Tensor* segment_ids = ctx->Tensor4ArgNameAndIndex("segment_ids", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t axis = segment_ids->shape().NumAxes() - 1; - const Shape& flat_data_shape = GetFlatShape(data->shape(), axis); + const int64_t axis = segment_ids->shape_view().NumAxes() - 1; + const Shape& flat_data_shape = GetFlatShape(data->shape_view(), axis); - Memset(ctx->stream(), out->mut_dptr(), 0, out->shape().elem_cnt() * sizeof(T)); + Memset(ctx->stream(), out->mut_dptr(), 0, + out->shape_view().elem_cnt() * sizeof(T)); BatchGatherKernelUtilImpl::Backward( ctx->stream(), data->dptr(), segment_ids->dptr(), flat_data_shape, - out->shape().At(axis), out->mut_dptr()); + out->shape_view().At(axis), out->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; diff --git a/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp b/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp index 9ed411edb39..bcd7b1c5364 100644 --- a/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp +++ b/oneflow/user/kernels/unsorted_segment_sum_kernel.cpp @@ -91,17 +91,18 @@ class UnsortedSegmentSumKernel final : public user_op::OpKernel, public user_op: const user_op::Tensor* segment_ids = ctx->Tensor4ArgNameAndIndex("segment_ids", 0); int64_t axis = ctx->Attr("axis"); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t outer_dim_size = out->shape().Count(0, axis); - int64_t num_segments = out->shape().At(axis); - int64_t inner_dim_size = out->shape().Count(axis + 1); - int64_t num_segment_ids = segment_ids->shape().elem_cnt(); - Memset(ctx->stream(), out->mut_dptr(), 0, out->shape().elem_cnt() * sizeof(T)); + int64_t outer_dim_size = out->shape_view().Count(0, axis); + int64_t num_segments = out->shape_view().At(axis); + int64_t inner_dim_size = out->shape_view().Count(axis + 1); + int64_t num_segment_ids = segment_ids->shape_view().elem_cnt(); + Memset(ctx->stream(), out->mut_dptr(), 0, + out->shape_view().elem_cnt() * sizeof(T)); int64_t offset = 0; if (cache != nullptr) { auto* sum_cache = dynamic_cast(cache); CHECK_NOTNULL(sum_cache); - CHECK_EQ(out->shape().At(axis), sum_cache->upper() - sum_cache->lower()); + CHECK_EQ(out->shape_view().At(axis), sum_cache->upper() - sum_cache->lower()); offset = sum_cache->lower(); } @@ -157,17 +158,17 @@ class UnsortedSegmentSumHalfKernel final : public user_op::OpKernel { int64_t axis = ctx->Attr("axis"); user_op::Tensor* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t outer_dim_size = out->shape().Count(0, axis); - int64_t num_segments = out->shape().At(axis); - int64_t inner_dim_size = out->shape().Count(axis + 1); - int64_t num_segment_ids = segment_ids->shape().elem_cnt(); + int64_t outer_dim_size = out->shape_view().Count(0, axis); + int64_t num_segments = 
out->shape_view().At(axis); + int64_t inner_dim_size = out->shape_view().Count(axis + 1); + int64_t num_segment_ids = segment_ids->shape_view().elem_cnt(); Memset(ctx->stream(), tmp_buf->mut_dptr(), 0, - out->shape().elem_cnt() * sizeof(float)); + out->shape_view().elem_cnt() * sizeof(float)); int64_t offset = 0; if (cache != nullptr) { auto* sum_cache = dynamic_cast(cache); CHECK_NOTNULL(sum_cache); - CHECK_EQ(out->shape().At(axis), sum_cache->upper() - sum_cache->lower()); + CHECK_EQ(out->shape_view().At(axis), sum_cache->upper() - sum_cache->lower()); offset = sum_cache->lower(); } @@ -179,7 +180,7 @@ class UnsortedSegmentSumHalfKernel final : public user_op::OpKernel { ctx->device_type(), DataType::kFloat, DataType::kFloat16); CHECK(f2h); f2h->Launch(ctx->stream(), tmp_buf->dptr(), out->mut_dptr(), - out->shape().elem_cnt()); + out->shape_view().elem_cnt()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; diff --git a/oneflow/user/kernels/upsample_bicubic_2d_kernel.cpp b/oneflow/user/kernels/upsample_bicubic_2d_kernel.cpp index e174f9d2f94..e3018166d18 100644 --- a/oneflow/user/kernels/upsample_bicubic_2d_kernel.cpp +++ b/oneflow/user/kernels/upsample_bicubic_2d_kernel.cpp @@ -37,13 +37,13 @@ class UpsampleBicubic2dCPUKernel final : public user_op::OpKernel { const T* in_ptr = x_tensor->dptr(); T* out_ptr = y_tensor->mut_dptr(); const bool align_corners = ctx->Attr("align_corners"); - const int nbatch = x_tensor->shape().At(0); - const int channels = x_tensor->shape().At(1); + const int nbatch = x_tensor->shape_view().At(0); + const int channels = x_tensor->shape_view().At(1); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t in_width = x_tensor->shape().At(3); - const int64_t out_height = y_tensor->shape().At(2); - const int64_t out_width = y_tensor->shape().At(3); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -110,19 +110,19 @@ class UpsampleBicubic2dGradCPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); T* in_ptr = dx_tensor->mut_dptr(); const T* out_ptr = dy_tensor->dptr(); const bool align_corners = ctx->Attr("align_corners"); - const int nbatch = dx_tensor->shape().At(0); - int channels = dx_tensor->shape().At(1); + const int nbatch = dx_tensor->shape_view().At(0); + int channels = dx_tensor->shape_view().At(1); channels = channels * nbatch; - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t in_width = dx_tensor->shape().At(3); - const int64_t out_height = dy_tensor->shape().At(2); - const int64_t out_width = dy_tensor->shape().At(3); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); const std::vector output_size = ctx->Attr>("output_size"); double height_scale 
= ctx->Attr("height_scale"); diff --git a/oneflow/user/kernels/upsample_bicubic_2d_kernel.cu b/oneflow/user/kernels/upsample_bicubic_2d_kernel.cu index ba810969160..eabdaa4ea7e 100644 --- a/oneflow/user/kernels/upsample_bicubic_2d_kernel.cu +++ b/oneflow/user/kernels/upsample_bicubic_2d_kernel.cu @@ -139,12 +139,12 @@ class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel { T* out_ptr = y_tensor->mut_dptr(); const bool align_corners = ctx->Attr("align_corners"); - const int nbatch = x_tensor->shape().At(0); - const int channels = x_tensor->shape().At(1); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t in_width = x_tensor->shape().At(3); - const int64_t out_height = y_tensor->shape().At(2); - const int64_t out_width = y_tensor->shape().At(3); + const int nbatch = x_tensor->shape_view().At(0); + const int channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); @@ -157,7 +157,7 @@ class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel { if (in_height == out_height && in_width == out_width) { Memcpy( ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); } else { const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); @@ -181,16 +181,16 @@ class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); - const int nbatch = dx_tensor->shape().At(0); - const int channels = dx_tensor->shape().At(1); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t in_width = dx_tensor->shape().At(3); - const int64_t out_height = dy_tensor->shape().At(2); - const int64_t out_width = dy_tensor->shape().At(3); + const int nbatch = dx_tensor->shape_view().At(0); + const int channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); @@ -203,7 +203,7 @@ class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel { if (in_height == out_height && in_width == out_width) { Memcpy( ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); } else { const T scale_height = 
GetAreaPixelScale(in_height, out_height, align_corners, height_scale); const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); diff --git a/oneflow/user/kernels/upsample_bilinear_2d_kernel.cpp b/oneflow/user/kernels/upsample_bilinear_2d_kernel.cpp index ea1d3637f5e..b4ae545ab3c 100644 --- a/oneflow/user/kernels/upsample_bilinear_2d_kernel.cpp +++ b/oneflow/user/kernels/upsample_bilinear_2d_kernel.cpp @@ -88,18 +88,20 @@ class UpsampleBilinear2DCPUKernel final : public user_op::OpKernel { const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3)); - - const int64_t nbatch = x_tensor->shape().At(0); - const int64_t channels = x_tensor->shape().At(1); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t in_width = x_tensor->shape().At(3); - const int64_t out_height = y_tensor->shape().At(2); - const int64_t out_width = y_tensor->shape().At(3); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); + + const int64_t nbatch = x_tensor->shape_view().At(0); + const int64_t channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); @@ -130,24 +132,26 @@ class UpsampleBilinear2DGradCPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3)); - - const int64_t nbatch = dx_tensor->shape().At(0); - const int64_t channels = dx_tensor->shape().At(1); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t in_width = dx_tensor->shape().At(3); - const int64_t out_height = dy_tensor->shape().At(2); - const int64_t out_width = dy_tensor->shape().At(3); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper dy_helper( 
+ dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3)); + + const int64_t nbatch = dx_tensor->shape_view().At(0); + const int64_t channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); diff --git a/oneflow/user/kernels/upsample_bilinear_2d_kernel.cu b/oneflow/user/kernels/upsample_bilinear_2d_kernel.cu index c9f3a9d7fb7..2dc3627d1b5 100644 --- a/oneflow/user/kernels/upsample_bilinear_2d_kernel.cu +++ b/oneflow/user/kernels/upsample_bilinear_2d_kernel.cu @@ -94,16 +94,18 @@ class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel { const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3)); - - const int64_t in_height = x_tensor->shape().At(2); - const int64_t in_width = x_tensor->shape().At(3); - const int64_t out_height = y_tensor->shape().At(2); - const int64_t out_width = y_tensor->shape().At(3); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); + + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -111,7 +113,7 @@ class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel { if (in_height == out_height && in_width == out_width) { Memcpy( ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); } else { const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); @@ -134,22 +136,24 @@ class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * 
sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3)); - - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t in_width = dx_tensor->shape().At(3); - const int64_t out_height = dy_tensor->shape().At(2); - const int64_t out_width = dy_tensor->shape().At(3); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3)); + + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -157,7 +161,7 @@ class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel { if (in_height == out_height && in_width == out_width) { Memcpy( ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); } else { const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); diff --git a/oneflow/user/kernels/upsample_linear_1d_kernel.cpp b/oneflow/user/kernels/upsample_linear_1d_kernel.cpp index 27c7cf41d94..a6515b26b71 100644 --- a/oneflow/user/kernels/upsample_linear_1d_kernel.cpp +++ b/oneflow/user/kernels/upsample_linear_1d_kernel.cpp @@ -72,15 +72,15 @@ class UpsampleLinear1DCPUKernel final : public user_op::OpKernel { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2)); - const int64_t nbatch = x_tensor->shape().At(0); - const int64_t channels = x_tensor->shape().At(1); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t out_height = y_tensor->shape().At(2); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); + const 
int64_t nbatch = x_tensor->shape_view().At(0); + const int64_t channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); if (!output_size.empty()) { @@ -109,20 +109,22 @@ class UpsampleLinearGrad1DCPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2)); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - - const int64_t nbatch = dx_tensor->shape().At(0); - const int64_t channels = dx_tensor->shape().At(1); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(2); + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + + const int64_t nbatch = dx_tensor->shape_view().At(0); + const int64_t channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); if (!output_size.empty()) { diff --git a/oneflow/user/kernels/upsample_linear_1d_kernel.cu b/oneflow/user/kernels/upsample_linear_1d_kernel.cu index 2c44f882baa..1c2867cf696 100644 --- a/oneflow/user/kernels/upsample_linear_1d_kernel.cu +++ b/oneflow/user/kernels/upsample_linear_1d_kernel.cu @@ -77,13 +77,13 @@ class UpsampleLinear1DGPUKernel final : public user_op::OpKernel { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2)); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t out_height = y_tensor->shape().At(2); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); if (!output_size.empty()) { @@ -92,7 
+92,7 @@ class UpsampleLinear1DGPUKernel final : public user_op::OpKernel { if (in_height == out_height) { Memcpy( ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); } else { const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); RUN_CUDA_KERNEL((UpsampleLinear1DForward), ctx->stream(), elem_cnt, elem_cnt, @@ -114,17 +114,19 @@ class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2)); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(2); + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); if (!output_size.empty()) { @@ -133,7 +135,7 @@ class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel { if (in_height == out_height) { Memcpy( ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); } else { const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); RUN_CUDA_KERNEL((UpsampleLinear1DBackward), ctx->stream(), elem_cnt, elem_cnt, diff --git a/oneflow/user/kernels/upsample_nearest_kernel.cpp b/oneflow/user/kernels/upsample_nearest_kernel.cpp index 4db78f85e5d..70d0d3041bd 100644 --- a/oneflow/user/kernels/upsample_nearest_kernel.cpp +++ b/oneflow/user/kernels/upsample_nearest_kernel.cpp @@ -126,13 +126,13 @@ class UpsampleNearest1DCPUKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); - const int64_t nbatch = x_tensor->shape().At(0); - const int64_t channels = x_tensor->shape().At(1); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t out_height = y_tensor->shape().At(2); + const int64_t nbatch = x_tensor->shape_view().At(0); + 
const int64_t channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); } @@ -141,12 +141,12 @@ class UpsampleNearest1DCPUKernel final : public user_op::OpKernel { memcpy(y_tensor->mut_dptr(), x_tensor->dptr(), sizeof(T) * nbatch * channels * in_height); } else { - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2)); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); UpsampleNearest1DForward(elem_cnt, x_tensor->dptr(), in_helper, out_helper, - x_tensor->shape().At(2), 1.f / height_scale, + x_tensor->shape_view().At(2), 1.f / height_scale, y_tensor->mut_dptr()); } } @@ -164,15 +164,15 @@ class UpsampleNearestGrad1DCPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - const int64_t nbatch = dx_tensor->shape().At(0); - const int64_t channels = dx_tensor->shape().At(1); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(2); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t nbatch = dx_tensor->shape_view().At(0); + const int64_t channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); } @@ -180,12 +180,14 @@ class UpsampleNearestGrad1DCPUKernel final : public user_op::OpKernel { memcpy(dx_tensor->mut_dptr(), dy_tensor->dptr(), sizeof(T) * nbatch * channels * in_height); } else { - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2)); + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); UpsampleNearest1DBackward(elem_cnt, dy_tensor->dptr(), dy_helper, dx_helper, - dx_tensor->shape().At(2), 1.f / height_scale, + dx_tensor->shape_view().At(2), 1.f / height_scale, dx_tensor->mut_dptr()); } } @@ -218,13 +220,13 @@ class UpsampleNearest2DCPUKernel final : public user_op::OpKernel { const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t nbatch = x_tensor->shape().At(0); - const int64_t channels = x_tensor->shape().At(1); - const int64_t in_height = 
x_tensor->shape().At(2); - const int64_t in_width = x_tensor->shape().At(3); - const int64_t out_height = y_tensor->shape().At(2); - const int64_t out_width = y_tensor->shape().At(3); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); + const int64_t nbatch = x_tensor->shape_view().At(0); + const int64_t channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -234,12 +236,14 @@ class UpsampleNearest2DCPUKernel final : public user_op::OpKernel { memcpy(y_tensor->mut_dptr(), x_tensor->dptr(), sizeof(T) * nbatch * channels * in_height * in_width); } else { - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3)); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); UpsampleNearest2DForward(elem_cnt, x_tensor->dptr(), in_helper, out_helper, - x_tensor->shape().At(2), x_tensor->shape().At(3), + x_tensor->shape_view().At(2), x_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); } } @@ -257,18 +261,18 @@ class UpsampleNearest2DGradCPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t nbatch = dx_tensor->shape().At(0); - const int64_t channels = dx_tensor->shape().At(1); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t in_width = dx_tensor->shape().At(3); - const int64_t out_height = dy_tensor->shape().At(2); - const int64_t out_width = dy_tensor->shape().At(3); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); + const int64_t nbatch = dx_tensor->shape_view().At(0); + const int64_t channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -278,12 +282,14 @@ class UpsampleNearest2DGradCPUKernel final : public user_op::OpKernel { memcpy(dx_tensor->mut_dptr(), dy_tensor->dptr(), sizeof(T) * nbatch * channels * in_height * in_width); } else { - 
NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3)); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2), dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3)); UpsampleNearest2DBackward(elem_cnt, dy_tensor->dptr(), dy_helper, dx_helper, - dx_tensor->shape().At(2), dx_tensor->shape().At(3), + dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); } } @@ -317,28 +323,28 @@ class UpsampleNearest3DCPUKernel final : public user_op::OpKernel { double depth_scale = ctx->Attr("depth_scale"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = x_blob->shape().At(2); - const int64_t in_height = x_blob->shape().At(3); - const int64_t in_width = x_blob->shape().At(4); - const int64_t out_depth = y_blob->shape().At(2); - const int64_t out_height = y_blob->shape().At(3); - const int64_t out_width = y_blob->shape().At(4); - const int64_t elem_cnt = y_blob->shape().elem_cnt(); + const int64_t in_depth = x_blob->shape_view().At(2); + const int64_t in_height = x_blob->shape_view().At(3); + const int64_t in_width = x_blob->shape_view().At(4); + const int64_t out_depth = y_blob->shape_view().At(2); + const int64_t out_height = y_blob->shape_view().At(3); + const int64_t out_width = y_blob->shape_view().At(4); + const int64_t elem_cnt = y_blob->shape_view().elem_cnt(); if (!output_size.empty()) { depth_scale = static_cast(out_depth) / static_cast(in_depth); height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); } - NdIndexOffsetHelper in_helper(x_blob->shape().At(0), x_blob->shape().At(1), - x_blob->shape().At(2), x_blob->shape().At(3), - x_blob->shape().At(4)); - NdIndexOffsetHelper out_helper(y_blob->shape().At(0), y_blob->shape().At(1), - y_blob->shape().At(2), y_blob->shape().At(3), - y_blob->shape().At(4)); + NdIndexOffsetHelper in_helper( + x_blob->shape_view().At(0), x_blob->shape_view().At(1), x_blob->shape_view().At(2), + x_blob->shape_view().At(3), x_blob->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_blob->shape_view().At(0), y_blob->shape_view().At(1), y_blob->shape_view().At(2), + y_blob->shape_view().At(3), y_blob->shape_view().At(4)); UpsampleNearest3DForward(elem_cnt, x_blob->dptr(), in_helper, out_helper, - x_blob->shape().At(2), x_blob->shape().At(3), x_blob->shape().At(4), - 1.f / depth_scale, 1.f / height_scale, 1.f / width_scale, - y_blob->mut_dptr()); + x_blob->shape_view().At(2), x_blob->shape_view().At(3), + x_blob->shape_view().At(4), 1.f / depth_scale, 1.f / height_scale, + 1.f / width_scale, y_blob->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -354,33 +360,33 @@ class UpsampleNearestGrad3DCPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); if (dx_blob == nullptr) { return; } Memset(ctx->stream(), dx_blob->mut_dptr(), 0, - dx_blob->shape().elem_cnt() * sizeof(T)); + dx_blob->shape_view().elem_cnt() * sizeof(T)); const 
user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); const std::vector output_size = ctx->Attr>("output_size"); double depth_scale = ctx->Attr("depth_scale"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = dx_blob->shape().At(2); - const int64_t in_height = dx_blob->shape().At(3); - const int64_t in_width = dx_blob->shape().At(4); - const int64_t out_depth = dy_blob->shape().At(2); - const int64_t out_height = dy_blob->shape().At(3); - const int64_t out_width = dy_blob->shape().At(4); - const int64_t elem_cnt = dy_blob->shape().elem_cnt(); + const int64_t in_depth = dx_blob->shape_view().At(2); + const int64_t in_height = dx_blob->shape_view().At(3); + const int64_t in_width = dx_blob->shape_view().At(4); + const int64_t out_depth = dy_blob->shape_view().At(2); + const int64_t out_height = dy_blob->shape_view().At(3); + const int64_t out_width = dy_blob->shape_view().At(4); + const int64_t elem_cnt = dy_blob->shape_view().elem_cnt(); if (!output_size.empty()) { depth_scale = static_cast(out_depth) / static_cast(in_depth); height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); } - NdIndexOffsetHelper dy_helper(dy_blob->shape().At(0), dy_blob->shape().At(1), - dy_blob->shape().At(2), dy_blob->shape().At(3), - dy_blob->shape().At(4)); - NdIndexOffsetHelper dx_helper(dx_blob->shape().At(0), dx_blob->shape().At(1), - dx_blob->shape().At(2), dx_blob->shape().At(3), - dx_blob->shape().At(4)); + NdIndexOffsetHelper dy_helper( + dy_blob->shape_view().At(0), dy_blob->shape_view().At(1), dy_blob->shape_view().At(2), + dy_blob->shape_view().At(3), dy_blob->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_blob->shape_view().At(0), dx_blob->shape_view().At(1), dx_blob->shape_view().At(2), + dx_blob->shape_view().At(3), dx_blob->shape_view().At(4)); UpsampleNearest3DBackward(elem_cnt, dy_blob->dptr(), dy_helper, dx_helper, - dx_blob->shape().At(2), dx_blob->shape().At(3), - dx_blob->shape().At(4), 1.f / depth_scale, 1.f / height_scale, + dx_blob->shape_view().At(2), dx_blob->shape_view().At(3), + dx_blob->shape_view().At(4), 1.f / depth_scale, 1.f / height_scale, 1.f / width_scale, dx_blob->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/upsample_nearest_kernel.cu b/oneflow/user/kernels/upsample_nearest_kernel.cu index a9fe4d557b9..d299150adae 100644 --- a/oneflow/user/kernels/upsample_nearest_kernel.cu +++ b/oneflow/user/kernels/upsample_nearest_kernel.cu @@ -130,23 +130,23 @@ class UpsampleNearest1DGPUKernel final : public user_op::OpKernel { user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t out_height = y_tensor->shape().At(2); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); } if (in_height == out_height) { Memcpy( ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + x_tensor->shape_view().elem_cnt() * 
GetSizeOfDataType(x_tensor->data_type())); } else { - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2)); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); RUN_CUDA_KERNEL((UpsampleNearest1DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape().At(2), + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), 1.f / height_scale, y_tensor->mut_dptr()); } } @@ -165,27 +165,29 @@ class UpsampleNearestGrad1DGPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(2); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); } if (in_height == out_height) { Memcpy( ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); } else { - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2)); + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); RUN_CUDA_KERNEL((UpsampleNearest1DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape().At(2), + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), 1.f / height_scale, dx_tensor->mut_dptr()); } } @@ -219,11 +221,11 @@ class UpsampleNearest2DGPUKernel final : public user_op::OpKernel { const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - const int64_t in_height = x_tensor->shape().At(2); - const int64_t in_width = x_tensor->shape().At(3); - const int64_t out_height = y_tensor->shape().At(2); - const int64_t out_width = y_tensor->shape().At(3); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); 
if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -232,15 +234,17 @@ class UpsampleNearest2DGPUKernel final : public user_op::OpKernel { if (in_height == out_height && in_width == out_width) { Memcpy( ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); } else { - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3)); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); RUN_CUDA_KERNEL((UpsampleNearest2DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape().At(2), - x_tensor->shape().At(3), 1.f / height_scale, 1.f / width_scale, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); } } @@ -259,16 +263,16 @@ class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const std::vector output_size = ctx->Attr>("output_size"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - const int64_t in_height = dx_tensor->shape().At(2); - const int64_t in_width = dx_tensor->shape().At(3); - const int64_t out_height = dy_tensor->shape().At(2); - const int64_t out_width = dy_tensor->shape().At(3); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); if (!output_size.empty()) { height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); @@ -276,15 +280,17 @@ class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel { if (in_height == out_height && in_width == out_width) { Memcpy( ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); } else { - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3)); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), + 
dy_tensor->shape_view().At(2), dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3)); RUN_CUDA_KERNEL((UpsampleNearest2DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape().At(2), - dx_tensor->shape().At(3), 1.f / height_scale, 1.f / width_scale, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); } } @@ -319,27 +325,27 @@ class UpsampleNearest3DGPUKernel final : public user_op::OpKernel { double depth_scale = ctx->Attr("depth_scale"); double height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = x_tensor->shape().At(2); - const int64_t in_height = x_tensor->shape().At(3); - const int64_t in_width = x_tensor->shape().At(4); - const int64_t out_depth = y_tensor->shape().At(2); - const int64_t out_height = y_tensor->shape().At(3); - const int64_t out_width = y_tensor->shape().At(4); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); if (!output_size.empty()) { depth_scale = static_cast(out_depth) / static_cast(in_depth); height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); } - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3), - x_tensor->shape().At(4)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3), - y_tensor->shape().At(4)); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); RUN_CUDA_KERNEL((UpsampleNearest3DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape().At(2), - x_tensor->shape().At(3), x_tensor->shape().At(4), 1.f / depth_scale, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), 1.f / depth_scale, 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -357,33 +363,33 @@ class UpsampleNearestGrad3DGPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const std::vector output_size = ctx->Attr>("output_size"); double depth_scale = ctx->Attr("depth_scale"); double 
height_scale = ctx->Attr("height_scale"); double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = dx_tensor->shape().At(2); - const int64_t in_height = dx_tensor->shape().At(3); - const int64_t in_width = dx_tensor->shape().At(4); - const int64_t out_depth = dy_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(3); - const int64_t out_width = dy_tensor->shape().At(4); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const int64_t out_width = dy_tensor->shape_view().At(4); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); if (!output_size.empty()) { depth_scale = static_cast(out_depth) / static_cast(in_depth); height_scale = static_cast(out_height) / static_cast(in_height); width_scale = static_cast(out_width) / static_cast(in_width); } - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3), - dy_tensor->shape().At(4)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3), - dx_tensor->shape().At(4)); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); RUN_CUDA_KERNEL((UpsampleNearest3DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape().At(2), - dx_tensor->shape().At(3), dx_tensor->shape().At(4), 1.f / depth_scale, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), 1.f / depth_scale, 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/upsample_trilinear_3d_kernel.cpp b/oneflow/user/kernels/upsample_trilinear_3d_kernel.cpp index 1872a901802..767aa248655 100644 --- a/oneflow/user/kernels/upsample_trilinear_3d_kernel.cpp +++ b/oneflow/user/kernels/upsample_trilinear_3d_kernel.cpp @@ -125,21 +125,21 @@ class UpsampleTrilinear3DCPUKernel final : public user_op::OpKernel { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3), - x_tensor->shape().At(4)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3), - y_tensor->shape().At(4)); - - const int64_t in_depth = x_tensor->shape().At(2); - const int64_t in_height = x_tensor->shape().At(3); - const int64_t in_width = x_tensor->shape().At(4); - - const int64_t out_depth = y_tensor->shape().At(2); - const int64_t out_height = 
y_tensor->shape().At(3); - const int64_t out_width = y_tensor->shape().At(4); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); + + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); const std::vector output_size = ctx->Attr>("output_size"); double depth_scale = ctx->Attr("depth_scale"); @@ -156,9 +156,9 @@ class UpsampleTrilinear3DCPUKernel final : public user_op::OpKernel { const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); UpsampleTrilinear3DForward(elem_cnt, x_tensor->dptr(), in_helper, out_helper, - x_tensor->shape().At(2), x_tensor->shape().At(3), - x_tensor->shape().At(4), scale_depth, scale_height, scale_width, - align_corners, y_tensor->mut_dptr()); + x_tensor->shape_view().At(2), x_tensor->shape_view().At(3), + x_tensor->shape_view().At(4), scale_depth, scale_height, + scale_width, align_corners, y_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -174,24 +174,24 @@ class UpsampleTrilinearGrad3DCPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3), - dy_tensor->shape().At(4)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3), - dx_tensor->shape().At(4)); - - const int64_t in_depth = dx_tensor->shape().At(2); - const int64_t in_height = dx_tensor->shape().At(3); - const int64_t in_width = dx_tensor->shape().At(4); - - const int64_t out_depth = dy_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(3); - const int64_t out_width = dy_tensor->shape().At(4); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); + + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const 
int64_t out_width = dy_tensor->shape_view().At(4); const std::vector output_size = ctx->Attr>("output_size"); double depth_scale = ctx->Attr("depth_scale"); @@ -208,9 +208,9 @@ class UpsampleTrilinearGrad3DCPUKernel final : public user_op::OpKernel { const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); UpsampleTrilinear3DBackward(elem_cnt, dy_tensor->dptr(), dy_helper, dx_helper, - dx_tensor->shape().At(2), dx_tensor->shape().At(3), - dx_tensor->shape().At(4), scale_depth, scale_height, scale_width, - align_corners, dx_tensor->mut_dptr()); + dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3), + dx_tensor->shape_view().At(4), scale_depth, scale_height, + scale_width, align_corners, dx_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/upsample_trilinear_3d_kernel.cu b/oneflow/user/kernels/upsample_trilinear_3d_kernel.cu index 7ce58e53027..d26eb8084ac 100644 --- a/oneflow/user/kernels/upsample_trilinear_3d_kernel.cu +++ b/oneflow/user/kernels/upsample_trilinear_3d_kernel.cu @@ -129,21 +129,21 @@ class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel { const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape().elem_cnt(); - NdIndexOffsetHelper in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1), - x_tensor->shape().At(2), x_tensor->shape().At(3), - x_tensor->shape().At(4)); - NdIndexOffsetHelper out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1), - y_tensor->shape().At(2), y_tensor->shape().At(3), - y_tensor->shape().At(4)); - - const int64_t in_depth = x_tensor->shape().At(2); - const int64_t in_height = x_tensor->shape().At(3); - const int64_t in_width = x_tensor->shape().At(4); - - const int64_t out_depth = y_tensor->shape().At(2); - const int64_t out_height = y_tensor->shape().At(3); - const int64_t out_width = y_tensor->shape().At(4); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); + + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); const std::vector output_size = ctx->Attr>("output_size"); double depth_scale = ctx->Attr("depth_scale"); @@ -160,9 +160,9 @@ class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel { const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); RUN_CUDA_KERNEL((UpsampleTrilinear3DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape().At(2), - x_tensor->shape().At(3), x_tensor->shape().At(4), scale_depth, scale_height, - scale_width, align_corners, y_tensor->mut_dptr()); + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + 
x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), scale_depth, + scale_height, scale_width, align_corners, y_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -179,24 +179,24 @@ class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel { user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape().elem_cnt() * sizeof(T)); + dx_tensor->shape_view().elem_cnt() * sizeof(T)); const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = dy_tensor->shape().elem_cnt(); - NdIndexOffsetHelper dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1), - dy_tensor->shape().At(2), dy_tensor->shape().At(3), - dy_tensor->shape().At(4)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1), - dx_tensor->shape().At(2), dx_tensor->shape().At(3), - dx_tensor->shape().At(4)); - - const int64_t in_depth = dx_tensor->shape().At(2); - const int64_t in_height = dx_tensor->shape().At(3); - const int64_t in_width = dx_tensor->shape().At(4); - - const int64_t out_depth = dy_tensor->shape().At(2); - const int64_t out_height = dy_tensor->shape().At(3); - const int64_t out_width = dy_tensor->shape().At(4); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); + + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const int64_t out_width = dy_tensor->shape_view().At(4); const std::vector output_size = ctx->Attr>("output_size"); double depth_scale = ctx->Attr("depth_scale"); @@ -213,9 +213,9 @@ class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel { const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); RUN_CUDA_KERNEL((UpsampleTrilinear3DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape().At(2), - dx_tensor->shape().At(3), dx_tensor->shape().At(4), scale_depth, scale_height, - scale_width, align_corners, dx_tensor->mut_dptr()); + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), scale_depth, + scale_height, scale_width, align_corners, dx_tensor->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/variance_kernel.cpp b/oneflow/user/kernels/variance_kernel.cpp index ad133841c04..22b0b039740 100644 --- a/oneflow/user/kernels/variance_kernel.cpp +++ b/oneflow/user/kernels/variance_kernel.cpp @@ -37,10 +37,10 @@ class VarKernel final : public user_op::OpKernel { const std::vector axis = ctx->Attr>("dim"); // only all dims cuda case will use tmp buffer. 
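The comment above refers to the assignment that follows it: the variance kernel asks for a "tmp_buffer" tensor only when the reduction covers every input axis and the kernel runs on CUDA; in every other case the scratch pointer stays null. A minimal restatement of that selection, assuming the "dim" attribute is a std::vector<int32_t> and that shape_view()/Tensor4ArgNameAndIndex behave as in the kernels above (the helper name is hypothetical):

// Sketch of the tmp-buffer selection used by the variance kernel below.
template<DeviceType device_type, typename T>
T* SelectVarTmpBuffer(user_op::KernelComputeContext* ctx, const user_op::Tensor* input) {
  const std::vector<int32_t>& axis = ctx->Attr<std::vector<int32_t>>("dim");
  const bool reduce_all_dims = axis.size() == input->shape_view().NumAxes();
  // Only the all-dims reduction on CUDA needs scratch space.
  if (reduce_all_dims && device_type == DeviceType::kCUDA) {
    return ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr<T>();
  }
  return nullptr;  // partial reductions and non-CUDA devices run without the temporary buffer
}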
T* tmp_buffer_ptr = - (axis.size() == input->shape().NumAxes() && DeviceType::kCUDA == device_type) + (axis.size() == input->shape_view().NumAxes() && DeviceType::kCUDA == device_type) ? ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr() : nullptr; - VarParamHelper param_helper(input->shape(), axis, unbiased); + VarParamHelper param_helper(input->shape_view(), axis, unbiased); VarFunctor()(ctx->stream(), in_ptr, out_ptr, tmp_buffer_ptr, param_helper.param); } diff --git a/oneflow/user/kernels/where_kernel.cpp b/oneflow/user/kernels/where_kernel.cpp index b87fb2131e2..ee9265f6cf5 100644 --- a/oneflow/user/kernels/where_kernel.cpp +++ b/oneflow/user/kernels/where_kernel.cpp @@ -32,28 +32,28 @@ class WhereKernel final : public user_op::OpKernel { const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - if (!(x->shape() == y->shape() && y->shape() == cond->shape())) { - size_t num_axes = out->shape().NumAxes(); - int64_t elem_cnt = out->shape().elem_cnt(); + if (!(x->shape_view() == y->shape_view() && y->shape_view() == cond->shape_view())) { + size_t num_axes = out->shape_view().NumAxes(); + int64_t elem_cnt = out->shape_view().elem_cnt(); const size_t x_bytes = GetCudaAlignedSize(elem_cnt * sizeof(T)); const size_t y_bytes = GetCudaAlignedSize(elem_cnt * sizeof(T)); T* y_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + x_bytes); CondT* cond_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + x_bytes + y_bytes); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), tmp_buffer->mut_dptr()), - XpuVarNdarray(x->shape(), x->dptr(), num_axes)); + ctx->stream(), XpuVarNdarray(out->shape_view(), tmp_buffer->mut_dptr()), + XpuVarNdarray(x->shape_view(), x->dptr(), num_axes)); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), y_tmp_buf), - XpuVarNdarray(y->shape(), y->dptr(), num_axes)); + ctx->stream(), XpuVarNdarray(out->shape_view(), y_tmp_buf), + XpuVarNdarray(y->shape_view(), y->dptr(), num_axes)); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), cond_tmp_buf), - XpuVarNdarray(cond->shape(), cond->dptr(), num_axes)); - WhereKernelUtil::Where(ctx->stream(), out->shape().elem_cnt(), + ctx->stream(), XpuVarNdarray(out->shape_view(), cond_tmp_buf), + XpuVarNdarray(cond->shape_view(), cond->dptr(), num_axes)); + WhereKernelUtil::Where(ctx->stream(), out->shape_view().elem_cnt(), cond_tmp_buf, tmp_buffer->mut_dptr(), y_tmp_buf, out->mut_dptr()); } else { - WhereKernelUtil::Where(ctx->stream(), out->shape().elem_cnt(), + WhereKernelUtil::Where(ctx->stream(), out->shape_view().elem_cnt(), cond->dptr(), x->dptr(), y->dptr(), out->mut_dptr()); } @@ -83,24 +83,24 @@ class WhereScalarXKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED() << "The scalar in Where should be bool, float or int."; } - if (!(y->shape() == cond->shape())) { - size_t num_axes = out->shape().NumAxes(); - int64_t elem_cnt = out->shape().elem_cnt(); + if (!(y->shape_view() == cond->shape_view())) { + size_t num_axes = out->shape_view().NumAxes(); + int64_t elem_cnt = out->shape_view().elem_cnt(); const size_t y_bytes = GetCudaAlignedSize(elem_cnt * sizeof(T)); CondT* cond_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + y_bytes); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), tmp_buffer->mut_dptr()), - XpuVarNdarray(y->shape(), y->dptr(), num_axes)); + ctx->stream(), 
XpuVarNdarray(out->shape_view(), tmp_buffer->mut_dptr()), + XpuVarNdarray(y->shape_view(), y->dptr(), num_axes)); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), cond_tmp_buf), - XpuVarNdarray(cond->shape(), cond->dptr(), num_axes)); + ctx->stream(), XpuVarNdarray(out->shape_view(), cond_tmp_buf), + XpuVarNdarray(cond->shape_view(), cond->dptr(), num_axes)); WhereKernelUtil::WhereXScalar( - ctx->stream(), out->shape().elem_cnt(), cond_tmp_buf, scalar_operand, + ctx->stream(), out->shape_view().elem_cnt(), cond_tmp_buf, scalar_operand, tmp_buffer->mut_dptr(), out->mut_dptr()); } else { - WhereKernelUtil::WhereXScalar(ctx->stream(), out->shape().elem_cnt(), - cond->dptr(), scalar_operand, - y->dptr(), out->mut_dptr()); + WhereKernelUtil::WhereXScalar( + ctx->stream(), out->shape_view().elem_cnt(), cond->dptr(), scalar_operand, + y->dptr(), out->mut_dptr()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -128,24 +128,24 @@ class WhereScalarYKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED() << "The scalar in Where should be bool, float or int"; } - if (!(x->shape() == cond->shape())) { - size_t num_axes = out->shape().NumAxes(); - int64_t elem_cnt = out->shape().elem_cnt(); + if (!(x->shape_view() == cond->shape_view())) { + size_t num_axes = out->shape_view().NumAxes(); + int64_t elem_cnt = out->shape_view().elem_cnt(); const size_t x_bytes = GetCudaAlignedSize(elem_cnt * sizeof(T)); CondT* cond_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + x_bytes); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), tmp_buffer->mut_dptr()), - XpuVarNdarray(x->shape(), x->dptr(), num_axes)); + ctx->stream(), XpuVarNdarray(out->shape_view(), tmp_buffer->mut_dptr()), + XpuVarNdarray(x->shape_view(), x->dptr(), num_axes)); NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(out->shape(), cond_tmp_buf), - XpuVarNdarray(cond->shape(), cond->dptr(), num_axes)); - WhereKernelUtil::WhereYScalar(ctx->stream(), out->shape().elem_cnt(), - cond_tmp_buf, tmp_buffer->mut_dptr(), - scalar_operand, out->mut_dptr()); + ctx->stream(), XpuVarNdarray(out->shape_view(), cond_tmp_buf), + XpuVarNdarray(cond->shape_view(), cond->dptr(), num_axes)); + WhereKernelUtil::WhereYScalar( + ctx->stream(), out->shape_view().elem_cnt(), cond_tmp_buf, tmp_buffer->mut_dptr(), + scalar_operand, out->mut_dptr()); } else { - WhereKernelUtil::WhereYScalar(ctx->stream(), out->shape().elem_cnt(), - cond->dptr(), x->dptr(), - scalar_operand, out->mut_dptr()); + WhereKernelUtil::WhereYScalar( + ctx->stream(), out->shape_view().elem_cnt(), cond->dptr(), x->dptr(), + scalar_operand, out->mut_dptr()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -161,7 +161,7 @@ class WhereScalarXYKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* cond = ctx->Tensor4ArgNameAndIndex("condition", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - if (out->shape().elem_cnt() == 0) { return; } + if (out->shape_view().elem_cnt() == 0) { return; } T x_scalar_operand = static_cast(0); T y_scalar_operand = static_cast(0); if (ctx->Attr("has_x_int_operand") && ctx->Attr("has_y_int_operand")) { @@ -176,9 +176,9 @@ class WhereScalarXYKernel final : public user_op::OpKernel { } else { UNIMPLEMENTED() << "The scalar in Where should be bool, float or int"; } - WhereKernelUtil::WhereXYScalar(ctx->stream(), out->shape().elem_cnt(), - cond->dptr(), 
x_scalar_operand, - y_scalar_operand, out->mut_dptr()); + WhereKernelUtil::WhereXYScalar( + ctx->stream(), out->shape_view().elem_cnt(), cond->dptr(), x_scalar_operand, + y_scalar_operand, out->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/zero_like_kernel.cpp b/oneflow/user/kernels/zero_like_kernel.cpp index e25481a94d0..36033a29a4e 100644 --- a/oneflow/user/kernels/zero_like_kernel.cpp +++ b/oneflow/user/kernels/zero_like_kernel.cpp @@ -28,7 +28,7 @@ class ZeroLikeKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); Memset(ctx->stream(), out->mut_dptr(), 0, - out->shape().elem_cnt() * GetSizeOfDataType(out->data_type())); + out->shape_view().elem_cnt() * GetSizeOfDataType(out->data_type())); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/ops/flatten_op.cpp b/oneflow/user/ops/flatten_op.cpp index ca4b5358821..7ac839b479c 100644 --- a/oneflow/user/ops/flatten_op.cpp +++ b/oneflow/user/ops/flatten_op.cpp @@ -23,7 +23,7 @@ namespace oneflow { const int32_t end_dim = ctx->Attr("end_dim"); const user_op::TensorDesc& in_tensor_desc = ctx->InputTensorDesc("in", 0); user_op::TensorDesc* out_tensor_desc = ctx->OutputTensorDesc("out", 0); - const Shape& in_shape = ZeroDimCompatiableShape(in_tensor_desc.shape()); + const Shape& in_shape = ExpandDimIf0D(in_tensor_desc.shape()); CHECK_GE_OR_RETURN(start_dim, 0); CHECK_LT_OR_RETURN(start_dim, in_shape.NumAxes()); const int32_t true_end_dim = end_dim < 0 ? end_dim + in_shape.NumAxes() : end_dim; diff --git a/oneflow/user/ops/fused_scale_mask_softmax_dropout_op.cpp b/oneflow/user/ops/fused_scale_mask_softmax_dropout_op.cpp index 736006457dd..eabeed57b06 100644 --- a/oneflow/user/ops/fused_scale_mask_softmax_dropout_op.cpp +++ b/oneflow/user/ops/fused_scale_mask_softmax_dropout_op.cpp @@ -22,7 +22,11 @@ namespace oneflow { -> Maybe { const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("x", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_OR_RETURN(x_desc.shape() == mask_desc.shape()); + const auto x_shape = x_desc.shape(); + const auto mask_shape = mask_desc.shape(); + CHECK_EQ_OR_RETURN(x_desc.shape().At(x_shape.NumAxes() - 1), + mask_desc.shape().At(mask_shape.NumAxes() - 1)) + << " last dim of x and mask is not equal."; *ctx->OutputShape("y", 0) = x_desc.shape(); *ctx->OutputIsDynamic("y", 0) = x_desc.is_dynamic(); *ctx->OutputShape("softmax_y", 0) = x_desc.shape(); @@ -37,7 +41,7 @@ namespace oneflow { -> Maybe { const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("x", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_OR_RETURN(mask_desc.data_type() == DataType::kBool); + CHECK_EQ_OR_RETURN(mask_desc.data_type(), DataType::kBool) << " mask dtype only support bool."; *ctx->OutputDType("y", 0) = x_desc.data_type(); *ctx->OutputDType("softmax_y", 0) = x_desc.data_type(); return Maybe::Ok(); @@ -47,23 +51,37 @@ namespace oneflow { -> Maybe { user_op::InputArgModifier* mask_modifier = GetInputArgModifierFn("mask", 0); user_op::InputArgModifier* dropout_mask_modifier = GetInputArgModifierFn("dropout_mask", 0); - CHECK_OR_RETURN(mask_modifier != nullptr); - CHECK_OR_RETURN(dropout_mask_modifier != nullptr); + CHECK_OR_RETURN(mask_modifier != nullptr) << " cannot find mask input."; + CHECK_OR_RETURN(dropout_mask_modifier != nullptr) 
<< " cannot find dropout mask input."; mask_modifier->set_requires_grad(false); dropout_mask_modifier->set_requires_grad(false); return Maybe::Ok(); } /*static*/ auto FusedScaleMaskSoftmaxDropoutOp::GetSbp(user_op::SbpContext* ctx) -> Maybe { const user_op::TensorDesc& x_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - CHECK_GE_OR_RETURN(x_tensor.shape().NumAxes(), 2); + CHECK_GE_OR_RETURN(x_tensor.shape().NumAxes(), 2) << " x num axes at least 2."; + const user_op::TensorDesc& mask_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("mask", 0); + CHECK_EQ_OR_RETURN(x_tensor.shape().NumAxes(), mask_tensor.shape().NumAxes()) + << " x num axes must equal with mask."; FOR_RANGE(int64_t, axis, 0, x_tensor.shape().NumAxes() - 2) { - ctx->NewBuilder() - .Split(user_op::OpArg("x", 0), axis) - .Split(user_op::OpArg("mask", 0), axis) - .Split(user_op::OpArg("dropout_mask", 0), axis) - .Split(user_op::OpArg("y", 0), axis) - .Split(user_op::OpArg("softmax_y", 0), axis) - .Build(); + // NOTE(chengcheng): mask support broadcast, when dim value = 1, sbp = broadcast + if (mask_tensor.shape().At(axis) == 1) { + ctx->NewBuilder() + .Split(user_op::OpArg("x", 0), axis) + .Broadcast(user_op::OpArg("mask", 0)) + .Split(user_op::OpArg("dropout_mask", 0), axis) + .Split(user_op::OpArg("y", 0), axis) + .Split(user_op::OpArg("softmax_y", 0), axis) + .Build(); + } else { + ctx->NewBuilder() + .Split(user_op::OpArg("x", 0), axis) + .Split(user_op::OpArg("mask", 0), axis) + .Split(user_op::OpArg("dropout_mask", 0), axis) + .Split(user_op::OpArg("y", 0), axis) + .Split(user_op::OpArg("softmax_y", 0), axis) + .Build(); + } } return Maybe::Ok(); } @@ -73,8 +91,10 @@ namespace oneflow { const user_op::TensorDesc& softmax_y_desc = ctx->InputTensorDesc("softmax_y", 0); const user_op::TensorDesc& dy_desc = ctx->InputTensorDesc("dy", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_EQ_OR_RETURN(dy_desc.shape(), softmax_y_desc.shape()); - CHECK_OR_RETURN(dy_desc.shape() == mask_desc.shape()); + CHECK_EQ_OR_RETURN(dy_desc.shape(), softmax_y_desc.shape()) << " dy and y shape must equal."; + CHECK_EQ_OR_RETURN(dy_desc.shape().At(dy_desc.shape().NumAxes() - 1), + mask_desc.shape().At(mask_desc.shape().NumAxes() - 1)) + << " last dim of y and mask is not equal."; user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); *dx_desc->mut_shape() = dy_desc.shape(); *dx_desc->mut_is_dynamic() = dy_desc.is_dynamic(); @@ -89,8 +109,9 @@ namespace oneflow { const user_op::TensorDesc& softmax_y_desc = ctx->InputTensorDesc("softmax_y", 0); const user_op::TensorDesc& dy_desc = ctx->InputTensorDesc("dy", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_OR_RETURN(dy_desc.data_type() == softmax_y_desc.data_type()); - CHECK_OR_RETURN(mask_desc.data_type() == DataType::kBool); + CHECK_EQ_OR_RETURN(dy_desc.data_type(), softmax_y_desc.data_type()) + << " dy and softmax_y dtype must equal"; + CHECK_EQ_OR_RETURN(mask_desc.data_type(), DataType::kBool) << " mask dtype only support bool."; user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); *dx_desc->mut_data_type() = dy_desc.data_type(); return Maybe::Ok(); @@ -98,15 +119,28 @@ namespace oneflow { /*static*/ auto FusedScaleMaskSoftmaxDropoutGradOp::GetSbp(user_op::SbpContext* ctx) -> Maybe { const user_op::TensorDesc& dy_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("dy", 0); - CHECK_GE_OR_RETURN(dy_tensor.shape().NumAxes(), 2); + CHECK_GE_OR_RETURN(dy_tensor.shape().NumAxes(), 2) << " dy num 
axes at least 2."; + const user_op::TensorDesc& mask_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("mask", 0); + CHECK_EQ_OR_RETURN(dy_tensor.shape().NumAxes(), mask_tensor.shape().NumAxes()) + << " dy num axes must equal with mask."; FOR_RANGE(int64_t, axis, 0, dy_tensor.shape().NumAxes() - 2) { - ctx->NewBuilder() - .Split(user_op::OpArg("softmax_y", 0), axis) - .Split(user_op::OpArg("dy", 0), axis) - .Split(user_op::OpArg("mask", 0), axis) - .Split(user_op::OpArg("dropout_mask", 0), axis) - .Split(user_op::OpArg("dx", 0), axis) - .Build(); + if (mask_tensor.shape().At(axis) == 1) { + ctx->NewBuilder() + .Split(user_op::OpArg("softmax_y", 0), axis) + .Split(user_op::OpArg("dy", 0), axis) + .Broadcast(user_op::OpArg("mask", 0)) + .Split(user_op::OpArg("dropout_mask", 0), axis) + .Split(user_op::OpArg("dx", 0), axis) + .Build(); + } else { + ctx->NewBuilder() + .Split(user_op::OpArg("softmax_y", 0), axis) + .Split(user_op::OpArg("dy", 0), axis) + .Split(user_op::OpArg("mask", 0), axis) + .Split(user_op::OpArg("dropout_mask", 0), axis) + .Split(user_op::OpArg("dx", 0), axis) + .Build(); + } } return Maybe::Ok(); } diff --git a/oneflow/user/ops/fused_scale_mask_softmax_op.cpp b/oneflow/user/ops/fused_scale_mask_softmax_op.cpp index fd00f053757..235e897db47 100644 --- a/oneflow/user/ops/fused_scale_mask_softmax_op.cpp +++ b/oneflow/user/ops/fused_scale_mask_softmax_op.cpp @@ -22,7 +22,11 @@ namespace oneflow { -> Maybe { const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("x", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_OR_RETURN(x_desc.shape() == mask_desc.shape()); + const auto x_shape = x_desc.shape(); + const auto mask_shape = mask_desc.shape(); + CHECK_EQ_OR_RETURN(x_desc.shape().At(x_shape.NumAxes() - 1), + mask_desc.shape().At(mask_shape.NumAxes() - 1)) + << " last dim of x and mask is not equal."; *ctx->OutputShape("y", 0) = x_desc.shape(); *ctx->OutputIsDynamic("y", 0) = x_desc.is_dynamic(); return Maybe::Ok(); @@ -34,7 +38,7 @@ namespace oneflow { /*static*/ auto FusedScaleMaskSoftmaxOp::InferDataType(user_op::InferContext* ctx) -> Maybe { const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("x", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_OR_RETURN(mask_desc.data_type() == DataType::kBool); + CHECK_EQ_OR_RETURN(mask_desc.data_type(), DataType::kBool) << " mask dtype only support bool."; *ctx->OutputDType("y", 0) = x_desc.data_type(); return Maybe::Ok(); } @@ -42,19 +46,30 @@ namespace oneflow { const user_op::GetInputArgModifier& GetInputArgModifierFn, const user_op::UserOpConfWrapper&) -> Maybe { user_op::InputArgModifier* mask_modifier = GetInputArgModifierFn("mask", 0); - CHECK_OR_RETURN(mask_modifier != nullptr); + CHECK_OR_RETURN(mask_modifier != nullptr) << " cannot find mask input."; mask_modifier->set_requires_grad(false); return Maybe::Ok(); } /*static*/ auto FusedScaleMaskSoftmaxOp::GetSbp(user_op::SbpContext* ctx) -> Maybe { const user_op::TensorDesc& x_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - CHECK_GE_OR_RETURN(x_tensor.shape().NumAxes(), 2); + CHECK_GE_OR_RETURN(x_tensor.shape().NumAxes(), 2) << " x num axes at least 2."; + const user_op::TensorDesc& mask_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("mask", 0); + CHECK_EQ_OR_RETURN(x_tensor.shape().NumAxes(), mask_tensor.shape().NumAxes()) + << " x num axes must equal with mask."; FOR_RANGE(int64_t, axis, 0, x_tensor.shape().NumAxes() - 2) { - ctx->NewBuilder() - .Split(user_op::OpArg("x", 0), 
axis) - .Split(user_op::OpArg("mask", 0), axis) - .Split(user_op::OpArg("y", 0), axis) - .Build(); + if (mask_tensor.shape().At(axis) == 1) { + ctx->NewBuilder() + .Split(user_op::OpArg("x", 0), axis) + .Broadcast(user_op::OpArg("mask", 0)) + .Split(user_op::OpArg("y", 0), axis) + .Build(); + } else { + ctx->NewBuilder() + .Split(user_op::OpArg("x", 0), axis) + .Split(user_op::OpArg("mask", 0), axis) + .Split(user_op::OpArg("y", 0), axis) + .Build(); + } } return Maybe::Ok(); } @@ -64,8 +79,10 @@ namespace oneflow { const user_op::TensorDesc& dy_desc = ctx->InputTensorDesc("dy", 0); const user_op::TensorDesc& y_desc = ctx->InputTensorDesc("y", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_EQ_OR_RETURN(dy_desc.shape(), y_desc.shape()); - CHECK_OR_RETURN(y_desc.shape() == mask_desc.shape()); + CHECK_EQ_OR_RETURN(dy_desc.shape(), y_desc.shape()) << " dy and y shape must equal."; + CHECK_EQ_OR_RETURN(y_desc.shape().At(y_desc.shape().NumAxes() - 1), + mask_desc.shape().At(mask_desc.shape().NumAxes() - 1)) + << " last dim of y and mask is not equal."; user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); *dx_desc->mut_shape() = dy_desc.shape(); *dx_desc->mut_is_dynamic() = dy_desc.is_dynamic(); @@ -80,22 +97,34 @@ namespace oneflow { const user_op::TensorDesc& dy_desc = ctx->InputTensorDesc("dy", 0); const user_op::TensorDesc& y_desc = ctx->InputTensorDesc("y", 0); const user_op::TensorDesc& mask_desc = ctx->InputTensorDesc("mask", 0); - CHECK_OR_RETURN(dy_desc.data_type() == y_desc.data_type()); - CHECK_OR_RETURN(mask_desc.data_type() == DataType::kBool); + CHECK_EQ_OR_RETURN(dy_desc.data_type(), y_desc.data_type()) << " dy and y dtype must equal"; + CHECK_EQ_OR_RETURN(mask_desc.data_type(), DataType::kBool) << " mask dtype only support bool."; user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); *dx_desc->mut_data_type() = dy_desc.data_type(); return Maybe::Ok(); } /*static*/ auto FusedScaleMaskSoftmaxGradOp::GetSbp(user_op::SbpContext* ctx) -> Maybe { const user_op::TensorDesc& dy_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("dy", 0); - CHECK_GE_OR_RETURN(dy_tensor.shape().NumAxes(), 2); + CHECK_GE_OR_RETURN(dy_tensor.shape().NumAxes(), 2) << " dy num axes at least 2."; + const user_op::TensorDesc& mask_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("mask", 0); + CHECK_EQ_OR_RETURN(dy_tensor.shape().NumAxes(), mask_tensor.shape().NumAxes()) + << " dy num axes must equal with mask."; FOR_RANGE(int64_t, axis, 0, dy_tensor.shape().NumAxes() - 2) { - ctx->NewBuilder() - .Split(user_op::OpArg("y", 0), axis) - .Split(user_op::OpArg("dy", 0), axis) - .Split(user_op::OpArg("mask", 0), axis) - .Split(user_op::OpArg("dx", 0), axis) - .Build(); + if (mask_tensor.shape().At(axis) == 1) { + ctx->NewBuilder() + .Split(user_op::OpArg("y", 0), axis) + .Split(user_op::OpArg("dy", 0), axis) + .Broadcast(user_op::OpArg("mask", 0)) + .Split(user_op::OpArg("dx", 0), axis) + .Build(); + } else { + ctx->NewBuilder() + .Split(user_op::OpArg("y", 0), axis) + .Split(user_op::OpArg("dy", 0), axis) + .Split(user_op::OpArg("mask", 0), axis) + .Split(user_op::OpArg("dx", 0), axis) + .Build(); + } } return Maybe::Ok(); } diff --git a/oneflow/user/ops/math_binary_broadcast_ops.cpp b/oneflow/user/ops/math_binary_broadcast_ops.cpp index 54697d29a9d..0c4ef770ac3 100644 --- a/oneflow/user/ops/math_binary_broadcast_ops.cpp +++ b/oneflow/user/ops/math_binary_broadcast_ops.cpp @@ -26,9 +26,7 @@ bool IsScalarTensor(const user_op::TensorDesc* tensor) { return 
tensor->shape().NumAxes() == 1 && tensor->shape().At(0) == 1; } -bool IsZeroDimTensor(const user_op::TensorDesc* tensor) { - return tensor->shape().NumAxes() == 0 && tensor->shape().elem_cnt() == 1; -} +bool IsZeroDimTensor(const user_op::TensorDesc* tensor) { return tensor->shape().NumAxes() == 0; } Maybe InferTensorDescBinaryBroadcastNormal(user_op::InferContext* ctx) { const user_op::TensorDesc& tensor_x = ctx->InputTensorDesc("x", 0); diff --git a/oneflow/user/ops/nll_op.cpp b/oneflow/user/ops/nll_op.cpp index b170194aff4..1afffc2c16b 100644 --- a/oneflow/user/ops/nll_op.cpp +++ b/oneflow/user/ops/nll_op.cpp @@ -14,125 +14,183 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/framework/framework.h" -#include "oneflow/user/ops/loss_op_util.h" #include "oneflow/core/framework/op_generated.h" namespace oneflow { -namespace { +/* static */ Maybe NLLOp::InferDataType(user_op::InferContext* ctx) { + CHECK_OR_RETURN(IsIndexDataType(ctx->InputDType("target", 0))) + << ctx->op_name() << ": expected target being integer type"; -Maybe InferTensorDescFn(user_op::InferContext* ctx) { - const auto& input_desc = ctx->InputTensorDesc("input", 0); - const auto& target_desc = ctx->InputTensorDesc("target", 0); - CHECK_EQ_OR_RETURN(input_desc.is_dynamic(), target_desc.is_dynamic()); - CHECK_GE_OR_RETURN(input_desc.shape().NumAxes(), 2); - CHECK_EQ_OR_RETURN(target_desc.shape().NumAxes(), 1); - CHECK_EQ_OR_RETURN(input_desc.shape().At(0), target_desc.shape().At(0)); + auto input_dtype = ctx->InputDType("input", 0); if (ctx->has_input("weight", 0)) { - const auto& weight_desc = ctx->InputTensorDesc("weight", 0); - CHECK_EQ_OR_RETURN(weight_desc.is_dynamic(), input_desc.is_dynamic()); - CHECK_EQ_OR_RETURN(weight_desc.shape(), Shape({input_desc.shape().At(1)})); + auto weight_dtype = ctx->InputDType("weight", 0); + CHECK_EQ_OR_RETURN(weight_dtype, input_dtype) << ctx->op_name() << ": expected weight dtype " + << input_dtype << ", but got " << weight_dtype; } - user_op::TensorDesc* out_desc = ctx->OutputTensorDesc("out", 0); - *out_desc->mut_is_dynamic() = input_desc.is_dynamic(); - *out_desc->mut_shape() = target_desc.shape(); - - user_op::TensorDesc* total_weight_desc = ctx->OutputTensorDesc("total_weight", 0); - *total_weight_desc->mut_is_dynamic() = input_desc.is_dynamic(); - *total_weight_desc->mut_shape() = Shape({}); - - return Maybe::Ok(); -} - -Maybe NllInferDataType(user_op::InferContext* ctx) { - const user_op::TensorDesc& target_desc = ctx->InputTensorDesc("target", 0); - CHECK_OR_RETURN(IsIndexDataType(target_desc.data_type())); - - *ctx->OutputDType("out", 0) = ctx->InputDType("input", 0); - *ctx->OutputDType("total_weight", 0) = ctx->InputDType("input", 0); + *ctx->OutputDType("output", 0) = input_dtype; + *ctx->OutputDType("out_weight", 0) = input_dtype; return Maybe::Ok(); } -Maybe InferGradTensorDescFn(user_op::InferContext* ctx) { +/* static */ Maybe NLLOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const auto& input_desc = ctx->InputTensorDesc("input", 0); const auto& target_desc = ctx->InputTensorDesc("target", 0); - const auto& total_weight_desc = ctx->InputTensorDesc("total_weight", 0); - const auto& dy_desc = ctx->InputTensorDesc("dy", 0); - CHECK_EQ_OR_RETURN(input_desc.is_dynamic(), target_desc.is_dynamic()); - CHECK_GE_OR_RETURN(input_desc.shape().NumAxes(), 2); - CHECK_EQ_OR_RETURN(target_desc.shape().NumAxes(), 1); - CHECK_EQ_OR_RETURN(input_desc.shape().At(0), target_desc.shape().At(0)); - 
CHECK_EQ_OR_RETURN(dy_desc.shape(), target_desc.shape()); - CHECK_EQ_OR_RETURN(total_weight_desc.shape(), Shape({})); + + const bool is_dynamic = input_desc.is_dynamic(); + CHECK_EQ_OR_RETURN(target_desc.is_dynamic(), is_dynamic) + << ctx->op_name() << ": expected the same dynamic with input and target"; + const int64_t K = input_desc.shape().NumAxes(); + CHECK_GE_OR_RETURN(K, 2) << ctx->op_name() << ": expected 2 or more dimensions for input"; + CHECK_EQ_OR_RETURN(target_desc.shape().NumAxes(), K - 1) + << ctx->op_name() << ": expected 1 less diemensions than input for target"; + const int64_t N = target_desc.shape().elem_cnt(); + const int64_t C = input_desc.shape().At(input_desc.shape().NumAxes() - 1); + CHECK_EQ_OR_RETURN(input_desc.shape().elem_cnt(), N * C) + << ctx->op_name() << ": expected input size " << input_desc.shape().ToString() + << " to match target size " << target_desc.shape().ToString(); + if (ctx->has_input("weight", 0)) { const auto& weight_desc = ctx->InputTensorDesc("weight", 0); - CHECK_EQ_OR_RETURN(weight_desc.is_dynamic(), input_desc.is_dynamic()); - CHECK_EQ_OR_RETURN(weight_desc.shape(), Shape({input_desc.shape().At(1)})); + CHECK_EQ_OR_RETURN(weight_desc.is_dynamic(), is_dynamic) + << ctx->op_name() << ": expected the same dynamic with input and weight"; + CHECK_EQ_OR_RETURN(weight_desc.shape().elem_cnt(), C) + << ctx->op_name() << ": expected weight size " << C << ", got " + << weight_desc.shape().ToString(); } - user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); - *dx_desc->mut_is_dynamic() = input_desc.is_dynamic(); - *dx_desc->mut_shape() = input_desc.shape(); + user_op::TensorDesc* output_desc = ctx->OutputTensorDesc("output", 0); + *output_desc->mut_is_dynamic() = is_dynamic; + *output_desc->mut_shape() = Shape({N}); - return Maybe::Ok(); -} - -Maybe InferGradDataType(user_op::InferContext* ctx) { - const user_op::TensorDesc& target_desc = ctx->InputTensorDesc("target", 0); - CHECK_OR_RETURN(IsIndexDataType(target_desc.data_type())); - - *ctx->OutputDType("dx", 0) = ctx->InputDType("dy", 0); + user_op::TensorDesc* out_weight_desc = ctx->OutputTensorDesc("out_weight", 0); + *out_weight_desc->mut_is_dynamic() = is_dynamic; + *out_weight_desc->mut_shape() = Shape({N}); return Maybe::Ok(); } -} // namespace -/* static */ Maybe NllOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - return InferTensorDescFn(ctx); -} - -/*static*/ Maybe NllOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} +/* static */ Maybe NLLOp::GetSbp(user_op::SbpContext* ctx) { + // split batch dim + auto builder1 = ctx->NewBuilder() + .Split(user_op::OpArg("input", 0), 0) + .Split(user_op::OpArg("target", 0), 0) + .Split(user_op::OpArg("output", 0), 0) + .Split(user_op::OpArg("out_weight", 0), 0); + if (ctx->user_op_conf().has_input("weight", 0)) { + builder1.Broadcast(user_op::OpArg("weight", 0)); + } + builder1.Build(); + + // split class dim + const auto& shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("input", 0).shape(); + auto builder2 = ctx->NewBuilder() + .Split(user_op::OpArg("input", 0), shape.NumAxes() - 1) + .Broadcast(user_op::OpArg("target", 0)) + .PartialSum(user_op::OpArg("output", 0)) + .PartialSum(user_op::OpArg("out_weight", 0)); + if (ctx->user_op_conf().has_input("weight", 0)) { + builder2.Split(user_op::OpArg("weight", 0), 0); + } + builder2.Build(); -/* static */ Maybe NllOp::GetSbp(user_op::SbpContext* ctx) { - return GenLossForwardDefaultGetSbpFn( - [](user_op::UserOpSbpSignatureBuilder& 
builder, user_op::SbpContext* ctx) { - builder.PartialSum(user_op::OpArg("total_weight", 0)); - })(ctx); + return Maybe::Ok(); } -/* static */ Maybe NllOp::ModifyInputArg(const GetInputArgModifier& GetInputArgModifierFn, +/* static */ Maybe NLLOp::ModifyInputArg(const GetInputArgModifier& GetInputArgModifierFn, const user_op::UserOpConfWrapper& conf) { user_op::InputArgModifier* target_modifier = GetInputArgModifierFn("target", 0); CHECK_OR_RETURN(target_modifier != nullptr); target_modifier->set_requires_grad(false); + if (conf.has_input("weight", 0)) { + auto* weight_modifier = GetInputArgModifierFn("weight", 0); + if (weight_modifier) { weight_modifier->set_requires_grad(false); } + } return Maybe::Ok(); } -/* static */ Maybe NllOp::InferDataType(user_op::InferContext* ctx) { - return NllInferDataType(ctx); -} +/* static */ Maybe NLLGradOp::InferDataType(user_op::InferContext* ctx) { + CHECK_OR_RETURN(IsIndexDataType(ctx->InputDType("target", 0))) + << ctx->op_name() << ": expected target being integer type"; -/* static */ Maybe NllGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - return InferGradTensorDescFn(ctx); -} + auto input_dtype = ctx->InputDType("input", 0); + CHECK_EQ_OR_RETURN(ctx->InputDType("out_grad", 0), input_dtype) + << ctx->op_name() << ": expected out_grad dtype " << input_dtype << ", got " + << ctx->InputDType("out_grad", 0); + + if (ctx->has_input("weight", 0)) { + CHECK_EQ_OR_RETURN(ctx->InputDType("weight", 0), input_dtype) + << ctx->op_name() << ": expected weight dtype " << input_dtype << ", got " + << ctx->InputDType("weight", 0); + } + + *ctx->OutputDType("in_grad", 0) = input_dtype; -/*static*/ Maybe NllGradOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); + return Maybe::Ok(); } -/* static */ Maybe NllGradOp::GetSbp(user_op::SbpContext* ctx) { - return GenLossBackwardDefaultGetSbpFn( - [](user_op::UserOpSbpSignatureBuilder& builder, user_op::SbpContext* ctx) { - builder.PartialSum(user_op::OpArg("total_weight", 0)); - })(ctx); +/* static */ Maybe NLLGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { + const auto& input_desc = ctx->InputTensorDesc("input", 0); + const auto& target_desc = ctx->InputTensorDesc("target", 0); + const auto& out_grad_desc = ctx->InputTensorDesc("out_grad", 0); + + bool is_dynamic = input_desc.is_dynamic(); + CHECK_EQ_OR_RETURN(target_desc.is_dynamic(), is_dynamic) + << ctx->op_name() << ": expected target dynamic " << is_dynamic; + CHECK_EQ_OR_RETURN(out_grad_desc.is_dynamic(), is_dynamic) + << ctx->op_name() << ": expected out_grad dynamic " << is_dynamic; + + const int64_t N = target_desc.shape().elem_cnt(); + CHECK_EQ_OR_RETURN(out_grad_desc.shape().elem_cnt(), N) + << ctx->op_name() << ": expected out_grad size " << N << ", got " + << out_grad_desc.shape().ToString(); + + const int64_t C = input_desc.shape().At(input_desc.shape().NumAxes() - 1); + CHECK_EQ_OR_RETURN(input_desc.shape().elem_cnt(), N * C) + << ctx->op_name() << ": expected input size " << N << ", got " + << input_desc.shape().ToString(); + + if (ctx->has_input("weight", 0)) { + const auto& weight_desc = ctx->InputTensorDesc("weight", 0); + CHECK_EQ_OR_RETURN(weight_desc.shape().elem_cnt(), C) + << ctx->op_name() << ": expected weight size " << C << ", got " + << weight_desc.shape().ToString(); + } + + user_op::TensorDesc* in_grad_desc = ctx->OutputTensorDesc("in_grad", 0); + *in_grad_desc->mut_is_dynamic() = is_dynamic; + *in_grad_desc->mut_shape() = input_desc.shape(); + + return Maybe::Ok(); } 
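The NLL shape checks above (forward and backward) all enforce one convention: the class dimension C is the last axis of input, target carries the remaining axes, and output/out_weight are flattened to one element per target entry. A small standalone check of that contract, with hypothetical shapes:

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: the shape contract enforced by the NLL tensor-desc inference above.
int main() {
  const std::vector<int64_t> input_shape{8, 16, 16, 10};  // (N, d1, d2, C), hypothetical
  const std::vector<int64_t> target_shape{8, 16, 16};     // (N, d1, d2)
  const int64_t C = input_shape.back();
  int64_t n = 1;
  for (int64_t d : target_shape) { n *= d; }               // N = number of target elements
  int64_t input_cnt = 1;
  for (int64_t d : input_shape) { input_cnt *= d; }
  assert(target_shape.size() + 1 == input_shape.size());   // target has one fewer axis than input
  assert(input_cnt == n * C);                              // input elem_cnt == N * C
  // "output" and "out_weight" are then both inferred with shape {n}.
  return 0;
}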
-/* static */ Maybe NllGradOp::InferDataType(user_op::InferContext* ctx) { - return InferGradDataType(ctx); +/* static */ Maybe NLLGradOp::GetSbp(user_op::SbpContext* ctx) { + // split batch dim + auto builder1 = ctx->NewBuilder() + .Split(user_op::OpArg("input", 0), 0) + .Split(user_op::OpArg("target", 0), 0) + .Split(user_op::OpArg("out_grad", 0), 0) + .Split(user_op::OpArg("in_grad", 0), 0); + if (ctx->user_op_conf().has_input("weight", 0)) { + builder1.Broadcast(user_op::OpArg("weight", 0)); + } + builder1.Build(); + + // split class dim + const auto& shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("input", 0).shape(); + auto builder2 = ctx->NewBuilder() + .Split(user_op::OpArg("input", 0), shape.NumAxes() - 1) + .Broadcast(user_op::OpArg("target", 0)) + .Broadcast(user_op::OpArg("out_grad", 0)) + .Split(user_op::OpArg("in_grad", 0), shape.NumAxes() - 1); + if (ctx->user_op_conf().has_input("weight", 0)) { + builder2.Split(user_op::OpArg("weight", 0), 0); + } + builder2.Build(); + + return Maybe::Ok(); } REGISTER_USER_OP_GRAD("nll").SetGenBackwardOpConfFn( @@ -142,15 +200,14 @@ REGISTER_USER_OP_GRAD("nll").SetGenBackwardOpConfFn( builder.Op("nll_grad") .Input("input", op.input("input", 0)) .Input("target", op.input("target", 0)) - .Input("total_weight", op.output("total_weight", 0)) - .Input("dy", op.GetGradTensorWithOpOutput("out", 0)) - .Output("dx") + .Input("out_grad", op.GetGradTensorWithOpOutput("output", 0)) + .Output("in_grad") .Attr("ignore_index", op.attr("ignore_index")); if (op.user_op_conf().has_input("weight", 0)) { builder.Input("weight", op.input("weight", 0)); } - user_op::UserOpConfWrapper grad_op = builder.Build(); - op.BindGradTensorWithOpInput(grad_op.output("dx", 0), "input", 0); + auto grad_op = builder.Build(); + op.BindGradTensorWithOpInput(grad_op.output("in_grad", 0), "input", 0); AddOp(grad_op); } return Maybe::Ok(); diff --git a/oneflow/user/ops/slice_op.cpp b/oneflow/user/ops/slice_op.cpp index 482118b253d..71e2aa66d92 100644 --- a/oneflow/user/ops/slice_op.cpp +++ b/oneflow/user/ops/slice_op.cpp @@ -29,136 +29,19 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { } } // namespace -/*static*/ Maybe SliceOp::GetSbp(user_op::SbpContext* ctx) { - const Shape& x_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape(); - const int64_t ndim = x_shape.NumAxes(); - const auto& start_vec = ctx->Attr>("start"); - const auto& stop_vec = ctx->Attr>("stop"); - const auto& step_vec = ctx->Attr>("step"); - CHECK_EQ_OR_RETURN(start_vec.size(), ndim); - CHECK_EQ_OR_RETURN(stop_vec.size(), ndim); - CHECK_EQ_OR_RETURN(step_vec.size(), ndim); - - FOR_RANGE(int, i, 0, ndim) { - if (IsFullSlice(start_vec.at(i), stop_vec.at(i), step_vec.at(i), x_shape.At(i))) { - ctx->NewBuilder().Split(ctx->inputs(), i).Split(ctx->outputs(), i).Build(); - } - } - ctx->NewBuilder().PartialSum(ctx->inputs()).PartialSum(ctx->outputs()).Build(); - return Maybe::Ok(); -} -/*static*/ Maybe SliceOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - const Shape& x_shape = ZeroDimCompatiableShape(ctx->InputShape("x", 0)); - const int64_t ndim = x_shape.NumAxes(); - const auto& start_vec = ctx->Attr>("start"); - const auto& stop_vec = ctx->Attr>("stop"); - const auto& step_vec = ctx->Attr>("step"); - CHECK_EQ_OR_RETURN(start_vec.size(), ndim); - CHECK_EQ_OR_RETURN(stop_vec.size(), ndim); - CHECK_EQ_OR_RETURN(step_vec.size(), ndim); - - DimVector dim_vec(ndim); - FOR_RANGE(size_t, i, 0, dim_vec.size()) { - const int64_t dim_size = x_shape.At(i); - const 
int64_t step = step_vec.at(i); - int64_t start = start_vec.at(i); - int64_t stop = stop_vec.at(i); - if (dim_size == 0 || start == stop) { - dim_vec[i] = 0; - continue; - } - CHECK_NE_OR_RETURN(step, 0) << "slice step cannot be 0"; - start = RegulateSliceStart(start, dim_size); - stop = RegulateSliceStop(stop, dim_size); - if (step > 0) { - CHECK_LE_OR_RETURN(start, stop) << "slice start must be less than stop when step > 0" - ", otherwise empty result will be outputted."; - } else { - CHECK_GT_OR_RETURN(start, stop) << "slice start must be more than stop when step < 0" - ", otherwise empty result will be outputted."; - } - const int64_t diff = (step > 0) ? (stop - start - 1) : (stop - start + 1); - dim_vec[i] = diff / step + 1; - } - *ctx->OutputShape("y", 0) = Shape(dim_vec); - return Maybe::Ok(); -} -/*static*/ Maybe SliceOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} -/*static*/ Maybe SliceOp::InferDataType(user_op::InferContext* ctx) { - *ctx->OutputDType("y", 0) = ctx->InputDType("x", 0); - return Maybe::Ok(); -} - -/*static*/ Maybe SliceGradOp::GetSbp(user_op::SbpContext* ctx) { - const Shape& like_shape = ctx->Attr("like_shape"); - const int64_t ndim = like_shape.NumAxes(); - const auto& start_vec = ctx->Attr>("start"); - const auto& stop_vec = ctx->Attr>("stop"); - const auto& step_vec = ctx->Attr>("step"); - CHECK_EQ_OR_RETURN(start_vec.size(), ndim); - CHECK_EQ_OR_RETURN(stop_vec.size(), ndim); - CHECK_EQ_OR_RETURN(step_vec.size(), ndim); - - FOR_RANGE(int, i, 0, ndim) { - if (IsFullSlice(start_vec.at(i), stop_vec.at(i), step_vec.at(i), like_shape.At(i))) { - ctx->NewBuilder().Split(ctx->inputs(), i).Split(ctx->outputs(), i).Build(); - } - } - ctx->NewBuilder().PartialSum(user_op::OpArg("dy", 0)).PartialSum(user_op::OpArg("dx", 0)).Build(); - ctx->NewBuilder().Broadcast(user_op::OpArg("dy", 0)).Broadcast(user_op::OpArg("dx", 0)).Build(); - return Maybe::Ok(); -} -/*static*/ Maybe SliceGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - const Shape& like_shape = ctx->Attr("like_shape"); - const Shape& dy_shape = ctx->InputShape("dy", 0); - const auto& start_vec = ctx->Attr>("start"); - const auto& stop_vec = ctx->Attr>("stop"); - const auto& step_vec = ctx->Attr>("step"); - - const int64_t ndim = dy_shape.NumAxes(); - CHECK_EQ_OR_RETURN(like_shape.NumAxes(), ndim); - CHECK_EQ_OR_RETURN(start_vec.size(), ndim); - CHECK_EQ_OR_RETURN(stop_vec.size(), ndim); - CHECK_EQ_OR_RETURN(step_vec.size(), ndim); - *ctx->OutputShape("dx", 0) = like_shape; - return Maybe::Ok(); -} -/*static*/ Maybe SliceGradOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - Shape logical_shape = ctx->Attr("like_shape"); - const user_op::TensorDesc& dy_desc = ctx->InputTensorDesc("dy", 0); - user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); - *dx_desc->mut_is_dynamic() = dy_desc.is_dynamic(); - - const auto& nd_sbp = ctx->NdSbp4ArgNameAndIndex("dx", 0); - *(dx_desc->mut_shape()) = - *JUST(GetPhysicalShape(logical_shape, nd_sbp, ctx->parallel_desc(), ctx->parallel_ctx())); - int dx_ndim = dx_desc->shape().NumAxes(); - int dy_ndim = dy_desc.shape().NumAxes(); - CHECK_EQ_OR_RETURN(dx_ndim, dy_ndim) - << "Output dimension (" << dx_ndim << ") should equal to the input dimension (" << dy_ndim - << ") for slice backward."; - return Maybe::Ok(); -} -/*static*/ Maybe SliceGradOp::InferDataType(user_op::InferContext* ctx) { - *ctx->OutputDType("dx", 0) = ctx->InputDType("dy", 0); - return Maybe::Ok(); -} -/*static*/ Maybe 
SliceGradOp::ModifyInputArg(const GetInputArgModifier& GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* dy_modifier = GetInputArgModifierFn("dy", 0); - CHECK_NOTNULL_OR_RETURN(dy_modifier); - dy_modifier->set_requires_grad(false); - return Maybe::Ok(); -} - -/*static*/ Maybe LogicalSliceAssignOp::GetSbp(user_op::SbpContext* ctx) { +/*static*/ Maybe SliceUpdateOp::GetSbp(user_op::SbpContext* ctx) { const Shape& x_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("ref", 0).shape(); const int64_t ndim = x_shape.NumAxes(); const auto& start_vec = ctx->Attr>("start"); const auto& stop_vec = ctx->Attr>("stop"); const auto& step_vec = ctx->Attr>("step"); + CHECK_EQ_OR_RETURN(start_vec.size(), ndim) + << "start_vec's dim not equal to ref shape's dim: " << start_vec.size() << " vs " << ndim; + CHECK_EQ_OR_RETURN(stop_vec.size(), ndim) + << "stop_vec's dim not equal to ref shape's dim: " << start_vec.size() << " vs " << ndim; + CHECK_EQ_OR_RETURN(step_vec.size(), ndim) + << "step_vec's dim not equal to ref shape's dim: " << start_vec.size() << " vs " << ndim; + FOR_RANGE(int64_t, axis, 0, ndim) { ctx->NewBuilder() .Split(user_op::OpArg("ref", 0), axis) @@ -177,8 +60,9 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { .Build(); return Maybe::Ok(); } -/*static*/ Maybe LogicalSliceAssignOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { +/*static*/ Maybe SliceUpdateOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const user_op::TensorDesc& ref_desc = ctx->InputTensorDesc("ref", 0); + const Shape& value_shape = ctx->InputTensorDesc("value", 0).shape(); const auto& start_vec = ctx->Attr>("start"); const auto& stop_vec = ctx->Attr>("stop"); const auto& step_vec = ctx->Attr>("step"); @@ -187,20 +71,24 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { const int64_t step = step_vec.at(i); const int64_t start = start_vec.at(i); const int64_t stop = stop_vec.at(i); - CHECK_GT_OR_RETURN(step, 0) << "logical_slice_assign step must be greater than 0"; - CHECK_GE_OR_RETURN(start, 0) << "logical_slice_assign start must be greater or equal to 0"; - CHECK_GT_OR_RETURN(stop, 0) << "logical_slice_assign stop must be greater than 0"; - CHECK_LT_OR_RETURN(start, stop) << "logical_slice_assign start must be less than stop"; + CHECK_GT_OR_RETURN(step, 0) << "slice_update step must be greater than 0"; + CHECK_GE_OR_RETURN(start, 0) << "slice_update start must be greater or equal to 0"; + CHECK_GE_OR_RETURN(stop, 0) << "slice_update stop must be greater or equal than 0"; + CHECK_LE_OR_RETURN(start, stop) << "slice_update start must be less or equal than stop"; + CHECK_EQ_OR_RETURN((stop - start + step - 1) / step, value_shape.At(i)) + << "slice_update slice tuple size must equal to value tensor shape, but got " << start + << ":" << stop << ":" << step << " vs " << value_shape.At(i) << " at dim " + << "i"; } auto* y_desc = ctx->OutputTensorDesc("y", 0); *y_desc->mut_shape() = ref_desc.shape(); *y_desc->mut_is_dynamic() = ref_desc.is_dynamic(); return Maybe::Ok(); } -/*static*/ Maybe LogicalSliceAssignOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { +/*static*/ Maybe SliceUpdateOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { return InferLogicalTensorDesc(ctx); } -/*static*/ Maybe LogicalSliceAssignOp::InferDataType(user_op::InferContext* ctx) { +/*static*/ Maybe SliceUpdateOp::InferDataType(user_op::InferContext* ctx) { const user_op::TensorDesc& ref_desc = ctx->InputTensorDesc("ref", 0); const 
user_op::TensorDesc& value_desc = ctx->InputTensorDesc("value", 0); CHECK_OR_RETURN(ref_desc.data_type() == value_desc.data_type()); @@ -209,7 +97,7 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { return Maybe::Ok(); } -/*static*/ Maybe LogicalSliceOp::GetSbp(user_op::SbpContext* ctx) { +/*static*/ Maybe SliceOp::GetSbp(user_op::SbpContext* ctx) { const user_op::TensorDesc& input_desc = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); FOR_RANGE(int64_t, axis, 0, input_desc.shape().NumAxes()) { ctx->NewBuilder() @@ -221,7 +109,7 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { ctx->NewBuilder().PartialSum(user_op::OpArg("x", 0)).PartialSum(user_op::OpArg("y", 0)).Build(); return Maybe::Ok(); } -/*static*/ Maybe LogicalSliceOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { +/*static*/ Maybe SliceOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const Shape& x_shape = ctx->InputShape("x", 0); const int64_t ndim = x_shape.NumAxes(); const auto& start_vec = ctx->Attr>("start"); @@ -232,154 +120,97 @@ bool IsFullSlice(int64_t start, int64_t stop, int64_t step, int64_t size) { const int64_t step = step_vec.at(i); const int64_t start = start_vec.at(i); const int64_t stop = stop_vec.at(i); - CHECK_GT_OR_RETURN(step, 0) << "LogicalSlice step must be greater than 0"; - CHECK_GE_OR_RETURN(start, 0) << "LogicalSlice start must be greater or equal to 0"; - CHECK_GT_OR_RETURN(stop, 0) << "LogicalSlice stop must be greater than 0"; - CHECK_LT_OR_RETURN(start, stop) << "LogicalSlice start must be less than stop"; + CHECK_GT_OR_RETURN(step, 0) << "Slice step must be greater than 0"; + CHECK_GE_OR_RETURN(start, 0) << "Slice start must be greater than or equal to 0"; + CHECK_GE_OR_RETURN(stop, 0) << "Slice stop must be greater than or equal to 0"; + CHECK_LE_OR_RETURN(start, stop) << "Slice start must be less than or equal to stop"; const int64_t diff = stop - start - 1; dim_vec[i] = diff / step + 1; } *ctx->OutputShape("y", 0) = Shape(dim_vec); return Maybe::Ok(); } -/*static*/ Maybe LogicalSliceOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { +/*static*/ Maybe SliceOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { return InferLogicalTensorDesc(ctx); } -/*static*/ Maybe LogicalSliceOp::InferDataType(user_op::InferContext* ctx) { +/*static*/ Maybe SliceOp::InferDataType(user_op::InferContext* ctx) { *ctx->OutputDType("y", 0) = ctx->InputDType("x", 0); return Maybe::Ok(); } -/*static*/ Maybe SliceUpdateOp::GetSbp(user_op::SbpContext* ctx) { - const Shape& x_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape(); - const int64_t ndim = x_shape.NumAxes(); +/*static*/ Maybe SliceGradOp::GetSbp(user_op::SbpContext* ctx) { + const Shape& like_shape = ctx->Attr("like_shape"); + const int64_t ndim = like_shape.NumAxes(); const auto& start_vec = ctx->Attr>("start"); const auto& stop_vec = ctx->Attr>("stop"); const auto& step_vec = ctx->Attr>("step"); - CHECK_EQ_OR_RETURN(start_vec.size(), ndim); - CHECK_EQ_OR_RETURN(stop_vec.size(), ndim); - CHECK_EQ_OR_RETURN(step_vec.size(), ndim); + CHECK_EQ_OR_RETURN(start_vec.size(), ndim) + << "start_vec's dim not equal to like_shape's dim: " << start_vec.size() << " vs " << ndim; + CHECK_EQ_OR_RETURN(stop_vec.size(), ndim) + << "stop_vec's dim not equal to like_shape's dim: " << stop_vec.size() << " vs " << ndim; + CHECK_EQ_OR_RETURN(step_vec.size(), ndim) + << "step_vec's dim not equal to like_shape's dim: " << step_vec.size() << " vs " << ndim; FOR_RANGE(int, i, 0, ndim) { - if
(IsFullSlice(start_vec.at(i), stop_vec.at(i), step_vec.at(i), x_shape.At(i))) { + if (IsFullSlice(start_vec[i], stop_vec[i], step_vec[i], like_shape.At(i))) { ctx->NewBuilder().Split(ctx->inputs(), i).Split(ctx->outputs(), i).Build(); } } - ctx->NewBuilder().PartialSum(ctx->inputs()).PartialSum(ctx->outputs()).Build(); + ctx->NewBuilder().PartialSum(user_op::OpArg("dy", 0)).PartialSum(user_op::OpArg("dx", 0)).Build(); + ctx->NewBuilder().Broadcast(user_op::OpArg("dy", 0)).Broadcast(user_op::OpArg("dx", 0)).Build(); return Maybe::Ok(); } - -/*static*/ Maybe SliceUpdateOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - const auto& x_desc = ctx->InputTensorDesc("x", 0); - const int64_t ndim = x_desc.shape().NumAxes(); - const auto& update_desc = ctx->InputTensorDesc("update", 0); - CHECK_EQ_OR_RETURN(update_desc.shape().NumAxes(), ndim); +/*static*/ Maybe SliceGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { + const Shape& like_shape = ctx->Attr("like_shape"); + const Shape& dy_shape = ctx->InputShape("dy", 0); const auto& start_vec = ctx->Attr>("start"); const auto& stop_vec = ctx->Attr>("stop"); const auto& step_vec = ctx->Attr>("step"); - CHECK_EQ_OR_RETURN(start_vec.size(), ndim); - CHECK_EQ_OR_RETURN(stop_vec.size(), ndim); - CHECK_EQ_OR_RETURN(step_vec.size(), ndim); - // validate update shape and start, stop, step attributes - FOR_RANGE(int, i, 0, ndim) { - const int64_t dim_size = x_desc.shape().At(i); - const int64_t step = step_vec.at(i); - CHECK_NE_OR_RETURN(step, 0) << "slice step cannot be 0"; - int64_t start = RegulateSliceStart(start_vec.at(i), dim_size); - int64_t stop = RegulateSliceStop(stop_vec.at(i), dim_size); - if (step > 0) { - CHECK_LT_OR_RETURN(start, stop) << "slice start must be less than stop when step > 0" - ", otherwise empty result will be outputted."; - } else { - CHECK_GT_OR_RETURN(start, stop) << "slice start must be more than stop when step < 0" - ", otherwise empty result will be outputted."; - } - const int64_t diff = (step > 0) ? 
(stop - start - 1) : (stop - start + 1); - const int64_t sliced_dim_size = diff / step + 1; - CHECK_EQ_OR_RETURN(sliced_dim_size, update_desc.shape().At(i)) - << "sliced dim size " << sliced_dim_size << " at axis " << i - << " not equal to the update shape " << update_desc.shape().ToString(); - } - auto* y_desc = ctx->OutputTensorDesc("y", 0); - *y_desc->mut_shape() = x_desc.shape(); - *y_desc->mut_is_dynamic() = x_desc.is_dynamic(); + + const int64_t ndim = dy_shape.NumAxes(); + CHECK_EQ_OR_RETURN(start_vec.size(), ndim) + << "start_vec's dim not equal to dy shape's dim: " << start_vec.size() << " vs " << ndim; + CHECK_EQ_OR_RETURN(stop_vec.size(), ndim) + << "stop_vec's dim not equal to dy shape's dim: " << stop_vec.size() << " vs " << ndim; + CHECK_EQ_OR_RETURN(step_vec.size(), ndim) + << "step_vec's dim not equal to dy shape's dim: " << step_vec.size() << " vs " << ndim; + *ctx->OutputShape("dx", 0) = like_shape; return Maybe::Ok(); } -/*static*/ Maybe SliceUpdateOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} -/*static*/ Maybe SliceUpdateOp::InferDataType(user_op::InferContext* ctx) { - const auto& x_desc = ctx->InputTensorDesc("x", 0); - const auto& update_desc = ctx->InputTensorDesc("update", 0); - CHECK_EQ_OR_RETURN(update_desc.data_type(), x_desc.data_type()); - auto* y_desc = ctx->OutputTensorDesc("y", 0); - *y_desc->mut_data_type() = x_desc.data_type(); +/*static*/ Maybe SliceGradOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { + Shape logical_shape = ctx->Attr("like_shape"); + const user_op::TensorDesc& dy_desc = ctx->InputTensorDesc("dy", 0); + user_op::TensorDesc* dx_desc = ctx->OutputTensorDesc("dx", 0); + *dx_desc->mut_is_dynamic() = dy_desc.is_dynamic(); + + const auto& nd_sbp = ctx->NdSbp4ArgNameAndIndex("dx", 0); + *(dx_desc->mut_shape()) = + *JUST(GetPhysicalShape(logical_shape, nd_sbp, ctx->parallel_desc(), ctx->parallel_ctx())); + int dx_ndim = dx_desc->shape().NumAxes(); + int dy_ndim = dy_desc.shape().NumAxes(); + CHECK_EQ_OR_RETURN(dx_ndim, dy_ndim) + << "Output dimension (" << dx_ndim << ") should be equal to the input dimension (" << dy_ndim + << ") for slice backward."; return Maybe::Ok(); } - -namespace { - -Maybe GenSliceGradOp(const user_op::UserOpWrapper& op, user_op::AddOpFn AddOp) { - if (op.NeedGenGradTensor4OpInput("x", 0)) { - const auto& x_desc = op.TensorDesc4ArgNameAndIndex("x", 0); - user_op::UserOpConfWrapperBuilder builder(op.op_name() + "_grad"); - user_op::UserOpConfWrapper grad_op = builder.Op("slice_grad") - .Input("dy", op.GetGradTensorWithOpOutput("y", 0)) - .Attr("like_shape", x_desc.shape()) - .Attr("start", op.attr>("start")) - .Attr("stop", op.attr>("stop")) - .Attr("step", op.attr>("step")) - .Output("dx") - .Build(); - op.BindGradTensorWithOpInput(grad_op.output("dx", 0), "x", 0); - AddOp(grad_op); - } +/*static*/ Maybe SliceGradOp::InferDataType(user_op::InferContext* ctx) { + *ctx->OutputDType("dx", 0) = ctx->InputDType("dy", 0); return Maybe::Ok(); } - -Maybe GenSliceUpdateGradOp(user_op::BackwardOpConfContext* ctx) { - const std::string update_grad_op_name = ctx->FwOp().op_name() + "_update_grad"; - ctx->DefineOp(update_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("slice") - .InputBind("x", ctx->FwOp().output_grad("y", 0)) - .Attr("start", ctx->FwOp().attr>("start")) - .Attr("stop", ctx->FwOp().attr>("stop")) - .Attr("step", ctx->FwOp().attr>("step")) - .Output("y") - .Build(); - }); -
ctx->FwOp().InputGradBind(user_op::OpArg("update", 0), [&]() -> const std::string& { - return ctx->GetOp(update_grad_op_name).output("y", 0); - }); - - const std::string zero_grad_op_name = ctx->FwOp().op_name() + "_zero_grad"; - ctx->DefineOp(zero_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("zero_like") - .InputBind("like", ctx->FwOp().input("update", 0)) - .Output("out") - .Build(); - }); - const std::string x_grad_op_name = ctx->FwOp().op_name() + "_x_grad"; - ctx->DefineOp(x_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("slice_update") - .InputBind("x", ctx->FwOp().output_grad("y", 0)) - .InputBind("update", ctx->GetOp(zero_grad_op_name).output("out", 0)) - .Attr("start", ctx->FwOp().attr>("start")) - .Attr("stop", ctx->FwOp().attr>("stop")) - .Attr("step", ctx->FwOp().attr>("step")) - .Output("y") - .Build(); - }); - ctx->FwOp().InputGradBind(user_op::OpArg("x", 0), [&]() -> const std::string& { - return ctx->GetOp(x_grad_op_name).output("y", 0); - }); +/*static*/ Maybe SliceGradOp::ModifyInputArg(const GetInputArgModifier& GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { + user_op::InputArgModifier* dy_modifier = GetInputArgModifierFn("dy", 0); + dy_modifier->set_requires_grad(false); return Maybe::Ok(); } -Maybe GenLogicalSliceAssignGradOp(user_op::BackwardOpConfContext* ctx) { +namespace { + +Maybe GenSliceUpdateGradOp(user_op::BackwardOpConfContext* ctx) { + // value grad const std::string update_grad_op_name = ctx->FwOp().op_name() + "_value_grad"; ctx->DefineOp(update_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("logical_slice") + return builder.OpTypeName("slice") .InputBind("x", ctx->FwOp().output_grad("y", 0)) .Attr("start", ctx->FwOp().attr>("start")) .Attr("stop", ctx->FwOp().attr>("stop")) @@ -391,6 +222,7 @@ Maybe GenLogicalSliceAssignGradOp(user_op::BackwardOpConfContext* ctx) { return ctx->GetOp(update_grad_op_name).output("y", 0); }); + // ref grad const std::string zero_grad_op_name = ctx->FwOp().op_name() + "_zero_grad"; ctx->DefineOp(zero_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { return builder.OpTypeName("zero_like") @@ -400,7 +232,7 @@ Maybe GenLogicalSliceAssignGradOp(user_op::BackwardOpConfContext* ctx) { }); const std::string x_grad_op_name = ctx->FwOp().op_name() + "_x_grad"; ctx->DefineOp(x_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("logical_slice_assign") + return builder.OpTypeName("slice_update") .InputBind("ref", ctx->FwOp().output_grad("y", 0)) .InputBind("value", ctx->GetOp(zero_grad_op_name).output("out", 0)) .Attr("start", ctx->FwOp().attr>("start")) @@ -415,37 +247,27 @@ Maybe GenLogicalSliceAssignGradOp(user_op::BackwardOpConfContext* ctx) { return Maybe::Ok(); } -Maybe GenLogicalSliceGradOp(user_op::BackwardOpConfContext* ctx) { - const std::string zero_grad_op_name = ctx->FwOp().op_name() + "_zero_grad"; - ctx->DefineOp(zero_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("zero_like") - .InputBind("like", ctx->FwOp().input("x", 0)) - .Output("out") - .Build(); - }); - const std::string x_grad_op_name = ctx->FwOp().op_name() + "_x_grad"; - ctx->DefineOp(x_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("logical_slice_assign") - .InputBind("ref", ctx->GetOp(zero_grad_op_name).output("out", 0)) - .InputBind("value", ctx->FwOp().output_grad("y", 0)) +Maybe 
GenSliceGradOp(user_op::BackwardOpConfContext* ctx) { + const std::string ref_grad_op_name = ctx->FwOp().op_name() + "_x_grad"; + ctx->DefineOp(ref_grad_op_name, [&](user_op::BackwardOpBuilder& builder) { + return builder.OpTypeName("slice_grad") + .InputBind("dy", ctx->FwOp().output_grad("y", 0)) + .Attr("like_shape", ctx->FwOp().arg_tensor_desc("x", 0).shape()) .Attr("start", ctx->FwOp().attr>("start")) .Attr("stop", ctx->FwOp().attr>("stop")) .Attr("step", ctx->FwOp().attr>("step")) - .Output("y") + .Output("dx") .Build(); }); ctx->FwOp().InputGradBind(user_op::OpArg("x", 0), [&]() -> const std::string& { - return ctx->GetOp(x_grad_op_name).output("y", 0); + return ctx->GetOp(ref_grad_op_name).output("dx", 0); }); - return Maybe::Ok(); } } // namespace -REGISTER_USER_OP_GRAD("slice").SetGenBackwardOpConfFn(GenSliceGradOp); REGISTER_USER_OP_GRAD("slice_update").SetBackwardOpConfGenFn(GenSliceUpdateGradOp); -REGISTER_USER_OP_GRAD("logical_slice_assign").SetBackwardOpConfGenFn(GenLogicalSliceAssignGradOp); -REGISTER_USER_OP_GRAD("logical_slice").SetBackwardOpConfGenFn(GenLogicalSliceGradOp); +REGISTER_USER_OP_GRAD("slice").SetBackwardOpConfGenFn(GenSliceGradOp); } // namespace oneflow diff --git a/oneflow/user/summary/event_writer_helper.cpp b/oneflow/user/summary/event_writer_helper.cpp index 02f6c223e65..7e1d92b4cb8 100644 --- a/oneflow/user/summary/event_writer_helper.cpp +++ b/oneflow/user/summary/event_writer_helper.cpp @@ -60,7 +60,7 @@ Maybe FillHistogramInSummary(const user_op::Tensor& value, const std::stri v->set_tag(tag); *v->mutable_metadata() = metadata; summary::Histogram histo; - for (int64_t i = 0; i < value.shape().elem_cnt(); i++) { + for (int64_t i = 0; i < value.shape_view().elem_cnt(); i++) { double double_val = value.dptr()[i]; histo.AppendValue(double_val); } @@ -117,20 +117,21 @@ bool WriteImageToBuffer(const uint8_t* image, int width, int height, int depth, Maybe FillImageInSummary(const user_op::Tensor& tensor, const std::string& tag, Summary* s) { SummaryMetadata metadata; SetPluginData(&metadata, kImagePluginName); - if (!(tensor.shape().NumAxes() == 4 - && (tensor.shape().At(3) == 1 || tensor.shape().At(3) == 3 || tensor.shape().At(3) == 4))) { + if (!(tensor.shape_view().NumAxes() == 4 + && (tensor.shape_view().At(3) == 1 || tensor.shape_view().At(3) == 3 + || tensor.shape_view().At(3) == 4))) { UNIMPLEMENTED(); } - if (!(tensor.shape().At(0) < (1LL << 31) && tensor.shape().At(1) < (1LL << 31) - && tensor.shape().At(2) < (1LL << 31) - && (tensor.shape().At(1) * tensor.shape().At(2)) < (1LL << 29))) { + if (!(tensor.shape_view().At(0) < (1LL << 31) && tensor.shape_view().At(1) < (1LL << 31) + && tensor.shape_view().At(2) < (1LL << 31) + && (tensor.shape_view().At(1) * tensor.shape_view().At(2)) < (1LL << 29))) { UNIMPLEMENTED(); } - const int64_t batch_size = static_cast(tensor.shape().At(0)); - const int64_t h = static_cast(tensor.shape().At(1)); - const int64_t w = static_cast(tensor.shape().At(2)); + const int64_t batch_size = static_cast(tensor.shape_view().At(0)); + const int64_t h = static_cast(tensor.shape_view().At(1)); + const int64_t w = static_cast(tensor.shape_view().At(2)); const int64_t hw = h * w; - const int64_t depth = static_cast(tensor.shape().At(3)); + const int64_t depth = static_cast(tensor.shape_view().At(3)); if (tensor.data_type() == DataType::kUInt8) { auto ith_image = [&tensor, hw, depth](int i) { auto images = tensor.dptr(); diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py index 949d7b01d45..4a88712bc5f 100755 --- 
a/python/oneflow/__init__.py +++ b/python/oneflow/__init__.py @@ -152,6 +152,7 @@ def is_deprecated(func_or_class): from oneflow._C import sqrt from oneflow._C import square from oneflow._C import matmul +from oneflow._C import mv from oneflow._C import bernoulli from oneflow._C import round from oneflow._C import softplus @@ -358,8 +359,7 @@ def atexit_hook(hook): from oneflow.nn.modules.reshape import reshape_op as reshape from oneflow.nn.modules.reshape import view_op as view from oneflow.nn.modules.slice import slice_op as slice -from oneflow.nn.modules.slice import logical_slice_assign_op as logical_slice_assign -from oneflow.nn.modules.slice import logical_slice_op as logical_slice +from oneflow.nn.modules.slice import slice_update_op as slice_update from oneflow.nn.modules.sort import sort_op as sort from oneflow.nn.modules.tensor_buffer import gen_tensor_buffer from oneflow.nn.modules.tensor_buffer import ( diff --git a/python/oneflow/env.py b/python/oneflow/env.py index acc5b219ea5..9afdcb22cb3 100644 --- a/python/oneflow/env.py +++ b/python/oneflow/env.py @@ -57,3 +57,31 @@ def get_world_size(): """ return oneflow._oneflow_internal.GetWorldSize() + + +def init_rdma(): + """ + Init RDMA in the current environment. If the current environment supports + RDMA, turning on RDMA by calling oneflow.env.init_rdma() can speed up + data transfer. + + Note: + - Make sure to avoid using fork() after oneflow.env.init_rdma() is invoked. + Otherwise, data corruption or segmentation fault may result! + + - Requires all devices to execute oneflow.env.init_rdma() simultaneously. + Otherwise, deadlock may result! + + + """ + oneflow._oneflow_internal.InitRDMA() + + +def rdma_is_initialized(): + """Returns whether RDMA is initialized in the current environment or not. + + Returns: + Whether RDMA is initialized or not. + + """ + return oneflow._oneflow_internal.RDMAIsInitialized() diff --git a/python/oneflow/framework/distribute.py b/python/oneflow/framework/distribute.py index 6d5272e8a01..1f9b9f995a5 100644 --- a/python/oneflow/framework/distribute.py +++ b/python/oneflow/framework/distribute.py @@ -14,16 +14,18 @@ limitations under the License. """ import traceback +import warnings from contextlib import contextmanager import oneflow._oneflow_internal -def split_sbp(axis: int) -> oneflow._oneflow_internal.sbp.sbp: - """Generate a split scheme in which op will be splitted at `axis`. +def split_sbp(dim=None, **kwargs) -> oneflow._oneflow_internal.sbp.sbp: + """ + Generate a split signature which indicates the tensor will be split along `dim`. Args: - axis (int): At `axis` the op will be splitted. + dim (int): The dimension in which the tensor is split. Returns: SbpParallel: Split scheme object, often required by `to_global` method of `Tensor` @@ -34,5 +36,37 @@ def split_sbp(axis: int) -> oneflow._oneflow_internal.sbp.sbp: ct2 = t1.to_global(sbp=flow.sbp.split(0), placement=("cuda", ranks=[0, 1, 2, 3])) """ - assert type(axis) is int - return oneflow._oneflow_internal.sbp.split(axis) + if dim is None: + for key, value in kwargs.items(): + if key == "axis": + if not isinstance(value, int): + raise TypeError( + "split_sbp(): parameter must be int, not {}.".format( + type(value) + ) + ) + warnings.warn( + "The 'axis' parameter of oneflow.sbp.split() has been renamed to 'dim' since OneFlow version 0.8." + ) + dim = value + else: + raise TypeError( + "split_sbp() got an unexpected keyword argument '%s'."
% key + ) + + if dim is None: + raise TypeError("split_sbp() missing 1 required argument: 'dim'.") + + else: + for key, value in kwargs.items(): + if key == "axis": + raise TypeError( + "split_sbp() received an invalid combination of arguments - duplicate argument `axis`" + ) + else: + raise TypeError( + "split_sbp() got an unexpected keyword argument '%s'." % key + ) + + assert isinstance(dim, int) + return oneflow._oneflow_internal.sbp.split(dim) diff --git a/python/oneflow/framework/docstr/math_ops.py b/python/oneflow/framework/docstr/math_ops.py index 97b7281a45e..39b597de6a1 100644 --- a/python/oneflow/framework/docstr/math_ops.py +++ b/python/oneflow/framework/docstr/math_ops.py @@ -1274,6 +1274,39 @@ """, ) +add_docstr( + oneflow.mv, + r""" + mv(input, vec) -> Tensor + + The documentation is referenced from: https://pytorch.org/docs/1.10/generated/torch.mv.html. + + Performs a matrix-vector product of the matrix :attr:`input` and the vector :attr:`vec`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a + 1-D tensor of size `m`, :attr:`out` will be a 1-D tensor of size `n`. + + .. note:: This function does not broadcast. + + Args: + input (oneflow.Tensor): matrix to be matrix multiplied + vec (oneflow.Tensor): vector to be matrix multiplied + Returns: + oneflow.Tensor: the output Tensor + + For example: + + .. code-block:: python + + >>> import oneflow as flow + >>> mat = flow.randn(2, 3) + >>> vec = flow.randn(3) + >>> out = flow.mv(mat, vec) + >>> out.shape + oneflow.Size([2]) + """, +) + add_docstr( oneflow.round, r"""This operator rounds the value of Blob to the nearest integer. @@ -1675,8 +1708,8 @@ >>> import oneflow as flow - >>> input = flow.rand(3,4,5,6) - >>> output = flow.vsplit(input,(1,3)) + >>> input = flow.rand(4, 4, 5, 6) + >>> output = flow.vsplit(input, (1, 3)) >>> output[0].size() oneflow.Size([1, 4, 5, 6]) >>> output[1].size() diff --git a/python/oneflow/framework/docstr/tensor.py b/python/oneflow/framework/docstr/tensor.py index c3eb93ee05e..ba295357946 100644 --- a/python/oneflow/framework/docstr/tensor.py +++ b/python/oneflow/framework/docstr/tensor.py @@ -319,13 +319,13 @@ >>> # results on rank 0 oneflow.Size([4]) - tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) .. code-block:: python >>> # results on rank 1 oneflow.Size([4]) - tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) """, ) @@ -365,13 +365,13 @@ >>> # results on rank 0 oneflow.Size([2]) - tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) .. 
code-block:: python >>> # results on rank 1 oneflow.Size([2]) - tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) """, ) @@ -424,13 +424,13 @@ >>> # results on rank 0 oneflow.Size([4]) - tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) .. code-block:: python >>> # results on rank 1 oneflow.Size([4]) - tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) For global tensor: @@ -447,13 +447,13 @@ >>> # results on rank 0 oneflow.Size([2]) - tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) .. code-block:: python >>> # results on rank 1 oneflow.Size([2]) - tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) + tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32) """, ) @@ -603,6 +603,13 @@ """, ) +add_docstr( + oneflow.Tensor.mv, + """ + See :func:`oneflow.mv` + """, +) + add_docstr( oneflow.Tensor.narrow, """ @@ -2074,6 +2081,15 @@ """, ) +add_docstr( + oneflow.Tensor.is_pinned, + r""" + Tensor.is_pinned() -> bool + + Returns true if this tensor resides in pinned memory. + """, +) + add_docstr( oneflow.Tensor.type, r"""Returns the type if dtype is not provided, else casts this object to the specified type. diff --git a/python/oneflow/framework/docstr/tensor_attributes.py b/python/oneflow/framework/docstr/tensor_attributes.py index 18e486057f0..20c69fce5fd 100644 --- a/python/oneflow/framework/docstr/tensor_attributes.py +++ b/python/oneflow/framework/docstr/tensor_attributes.py @@ -97,9 +97,9 @@ ``oneflow.sbp`` includes three types: - - oneflow.sbp.split(axis) + - oneflow.sbp.split(dim) - Indicates that the global tensor is evenly divided according to the dimension `axis` and distributed on each rank. + Indicates that the global tensor is evenly divided according to the dimension `dim` and distributed on each rank. 
- oneflow.sbp.broadcast() @@ -120,7 +120,7 @@ >>> s = flow.sbp.split(0) >>> s - oneflow.sbp.split(axis=0) + oneflow.sbp.split(dim=0) >>> b = flow.sbp.broadcast() >>> b oneflow.sbp.broadcast diff --git a/python/oneflow/framework/graph_build_util.py b/python/oneflow/framework/graph_build_util.py index 05aa6cef6eb..d6d4ba6a703 100644 --- a/python/oneflow/framework/graph_build_util.py +++ b/python/oneflow/framework/graph_build_util.py @@ -27,7 +27,6 @@ import oneflow.framework.scope_util as scope_util import oneflow.framework.session_context as session_context from oneflow.framework.tensor import Tensor - import oneflow._oneflow_internal._C as _C lazy_mode = oneflow._oneflow_internal.lazy_mode @@ -42,9 +41,11 @@ def graph_build_context(config_proto, session): config_proto_str, oneflow.placement("cpu", [0]), False, # is_mirrored ) + graph_scope = _make_new_graph_scope(new_scope, config_proto.job_name) + with lazy_mode.guard(True): with JobBuildAndInferCtx(config_proto): - with BlockScopeContext(prev_scope, new_scope): + with BlockScopeContext(prev_scope, graph_scope): yield @@ -118,6 +119,36 @@ def __exit__(self, exc_type, exc_val, exc_tb): ) +def _make_new_scope(prev_scope, scope_proto_str_setter): + new_scope = None + + def build_scope(builder): + nonlocal new_scope + new_scope = builder.BuildScopeByProtoStrSetter( + prev_scope, scope_proto_str_setter + ) + assert new_scope is not None + + oneflow._oneflow_internal.deprecated.PhysicalRun(build_scope) + oneflow._oneflow_internal.eager.Sync() + return new_scope + + +def _make_new_graph_scope(prev_scope, graph_name): + assert prev_scope is not None + attr_dict = dict() + name2default = session_context.GetDefaultSession().scope_attr_name2default_val + + def scope_proto_str_setter(serialized_scope_proto: str): + scope_proto = text_format.Parse( + serialized_scope_proto, scope_pb2_util.ScopeProto() + ) + scope_proto.module_name = graph_name + return str(text_format.MessageToString(scope_proto)) + + return _make_new_scope(prev_scope, scope_proto_str_setter) + + def make_new_block_scope(prev_scope, block): assert prev_scope is not None assert block is not None @@ -147,21 +178,9 @@ def scope_proto_str_setter(serialized_scope_proto: str): # set module name if isinstance(block, oneflow.nn.graph.block.ModuleBlock): scope_proto.module_name = block.name_prefix + block.name - return str(text_format.MessageToString(scope_proto)) - new_scope = None - - def build_scope(builder): - nonlocal new_scope - new_scope = builder.BuildScopeByProtoStrSetter( - prev_scope, scope_proto_str_setter - ) - assert new_scope is not None - - oneflow._oneflow_internal.deprecated.PhysicalRun(build_scope) - oneflow._oneflow_internal.eager.Sync() - return new_scope + return _make_new_scope(prev_scope, scope_proto_str_setter) def scope_to_proto(scope): diff --git a/python/oneflow/framework/tensor.py b/python/oneflow/framework/tensor.py index 8c97c973596..6168a16cda2 100755 --- a/python/oneflow/framework/tensor.py +++ b/python/oneflow/framework/tensor.py @@ -71,30 +71,6 @@ def _backward(self, gradient=None, retain_graph=False, create_graph=False): flow._oneflow_internal.nn.graph.AddTensorAsGraphLoss(self) -def _setitem(self, key, value): - if self.is_global: - if isinstance(value, (int, float)): - value = flow._C.global_constant( - [1], - value, - dtype=self.dtype, - placement=self.placement, - sbp=[flow.sbp.broadcast,] * len(self.sbp), - ) - else: - value = value.to_global( - self.placement, sbp=[flow.sbp.broadcast,] * len(self.sbp) - ) - else: - if isinstance(value, (int, float)): - 
value = flow._C.constant([1], value, dtype=self.dtype, device=self.device) - else: - value = value.to(device=self.device) - - flow._C.tensor_setitem(self, key, value) - return self - - def _str(self): return self.__repr__() @@ -625,6 +601,10 @@ def _matmul(self, other): return flow.matmul(self, other) +def _mv(self, vec): + return flow._C.mv(self, vec) + + def _round(self): return flow.round(self) @@ -641,10 +621,6 @@ def _triu(self, diagonal=0): return flow.triu(self, diagonal=diagonal) -def _to_local(self): - return flow.to_local(self) - - def _relu(self): return flow._C.relu(self) @@ -920,24 +896,6 @@ def _to(self, *args, **kwargs): return flow._C.to(self, *new_args, **kwargs) -def _local_to_global(self, placement=None, sbp=None, *, check_meta=True): - return flow.local_to_global(self, placement, sbp, check_meta) - - -def _global_to_global( - self, placement=None, sbp=None, *, grad_sbp=None, check_meta=False -): - return flow.global_to_global(self, placement, sbp, grad_sbp, check_meta) - - -def _to_global(self, placement=None, sbp=None, **kwargs): - return flow.to_global(self, placement, sbp, **kwargs) - - -def _to_local(self): - return flow.to_local(self) - - def _tolist(self): if self.numel() == 1 and self.ndim == 0: return self.item() @@ -1065,9 +1023,14 @@ def _numpy(self): tensors = flow.tensor_buffer_to_list_of_tensors(self, shapes, dtypes) return [t.numpy() for t in tensors] if self.is_global: - self = self.to_global( - placement=flow.env.all_device_placement("cpu"), sbp=flow.sbp.broadcast - ).to_local() + self_cpu_placement = flow.placement("cpu", self.placement.ranks) + self = ( + self.to_global(placement=self_cpu_placement) + .to_global( + placement=flow.env.all_device_placement("cpu"), sbp=flow.sbp.broadcast + ) + .to_local() + ) assert self.is_local if self.device != flow.device("cpu"): self = self.cpu() @@ -1144,7 +1107,6 @@ def RegisterMethods(): Tensor.sub = _sub Tensor.sub_ = _sub_inplace Tensor.backward = _backward - Tensor.__setitem__ = _setitem Tensor.__str__ = _str Tensor.__repr__ = _repr Tensor.__bool__ = is_nonzero @@ -1176,9 +1138,6 @@ def RegisterMethods(): Tensor.new_zeros = _new_zeros Tensor.where = _where Tensor.norm = _norm - Tensor.local_to_global = _local_to_global - Tensor.global_to_global = _global_to_global - Tensor.to_global = _to_global Tensor.repeat = _repeat Tensor.repeat_interleave = _repeat_interleave Tensor.tile = _tile @@ -1189,7 +1148,6 @@ def RegisterMethods(): Tensor.masked_select = _masked_select Tensor.eq = _eq Tensor.item = _item - Tensor.to_local = _to_local Tensor.sort = _sort Tensor.type_as = _type_as Tensor.tolist = _tolist @@ -1203,6 +1161,7 @@ def RegisterMethods(): Tensor.new_tensor = _new_tensor Tensor.cumsum = _cumsum Tensor.cumprod = _cumprod + Tensor.mv = _mv def register_tensor_op(op_name): diff --git a/python/oneflow/framework/tensor_str.py b/python/oneflow/framework/tensor_str.py index 808db4e640e..eaba9a96dbb 100644 --- a/python/oneflow/framework/tensor_str.py +++ b/python/oneflow/framework/tensor_str.py @@ -285,10 +285,6 @@ def _tensor_str(self, indent): if self.dtype is flow.float16: self = self.float() - # TODO: not support nd sbp tensor for now - if self.is_global and len(self.placement.ranks.shape) > 1: - return "[...]" - with flow.no_grad(): formatter = _Formatter(get_summarized_data(self) if summarize else self) return _tensor_str_with_formatter(self, indent, summarize, formatter) diff --git a/python/oneflow/framework/tensor_str_util.py b/python/oneflow/framework/tensor_str_util.py index afbd436167f..742990a9e39 100644 
--- a/python/oneflow/framework/tensor_str_util.py +++ b/python/oneflow/framework/tensor_str_util.py @@ -22,15 +22,12 @@ def slice_wrapper(tensor, slice_tuple: Tuple[int, int, int]): with flow.no_grad(): ndim = tensor.ndim slice_tuple_list = [slice_tuple] + [[None, None, None]] * (ndim - 1) - # TODO(): a kind 'slice op' supports both local and global tensor - if tensor.is_global: - # input is s0, output is p - # input is b, output is b - # input is p, output is p - # so 'to b' is not needed here - tensor = flow.logical_slice(tensor, slice_tuple_list) - else: - tensor = flow.slice(tensor, slice_tuple_list) + # If tensor is global_tensor + # input is s0, output is p + # input is b, output is b + # input is p, output is p + # so 'to b' is not needed here + tensor = flow.slice(tensor, slice_tuple_list) # TODO(): flow.sequeeze will fail in some global tensor case if tensor.shape[0] == 1 and ndim > 1: tensor = tensor.reshape(list(tensor.shape[1:])) diff --git a/python/oneflow/nn/graph/block.py b/python/oneflow/nn/graph/block.py index 1fef925861f..fa38031ebb1 100644 --- a/python/oneflow/nn/graph/block.py +++ b/python/oneflow/nn/graph/block.py @@ -20,7 +20,7 @@ import oneflow._C import oneflow._oneflow_internal -import oneflow.framework.graph_build_util as graph_build_util +from oneflow.framework import graph_build_util from oneflow.env import get_rank from oneflow.framework.tensor import Tensor, TensorTuple from oneflow.nn.module import Module @@ -75,6 +75,7 @@ def __init__( self._origin = None self._scope = None self._prev_scope = None + assert belonged_graph is None or isinstance(belonged_graph, weakref.ProxyTypes) self._belonged_graph = belonged_graph self.config = BlockConfig() @@ -263,10 +264,6 @@ def __block_forward(self, *args, **kwargs): args, kwargs = self.__pre_forward_map(*args, **kwargs) with self.scope_context(): result = self._origin.__class__.forward(self, *args, **kwargs) - # Always pack outputs to remain type of outputs - outputs = (result,) - result = self.__post_forward_map(*outputs) - result = seq_to_func_return(result, True) self._is_executing_forward = False return result @@ -275,22 +272,16 @@ def __pre_forward_map(self, *args, **kwargs): # Identity op outside activation checkpointing scope will be the endpoint of an activation checkpointing segment. # Identity op as the first op of a pipeline stage will make backward op depends on the identity op within the stage, # otherwise the backward op may depends the op in former stage which will make graph creates unnessary buffers. - if self.config.activation_checkpointing or ( - self.config.stage_id is not None and self.config.stage_id >= 0 - ): + if self.config._stage_placement is not None: - def insert_identity(t): + def insert_to_global(t): assert isinstance(t, Tensor) - return oneflow._C.identity(t) + return t.to_global(placement=self.config._stage_placement) args, kwargs = self.__map_io( - "input", insert_identity, "insert_identity", *args, **kwargs + "input", insert_to_global, "insert_to_global", *args, **kwargs ) - return args, kwargs - - def __post_forward_map(self, *args): - # Insert identity op when doing activation checkpointing or pipeline execution. 
if self.config.activation_checkpointing or ( self.config.stage_id is not None and self.config.stage_id >= 0 ): @@ -299,10 +290,11 @@ def insert_identity(t): assert isinstance(t, Tensor) return oneflow._C.identity(t) - args, _ = self.__map_io( - "output", insert_identity, "insert_identity", *args, + args, kwargs = self.__map_io( + "input", insert_identity, "insert_identity", *args, **kwargs ) - return args + + return args, kwargs def add_module(self, name: str, module: Optional[Module]) -> None: self.__setattr__( @@ -563,11 +555,13 @@ def _ops_repr(self): ) if self._belonged_graph.is_compiled: - module_conf = self._belonged_graph._graph_proto.module_name2module_conf[ - self.name_prefix + self.name - ] - - return operators_repr(module_conf.ops) + if self._belonged_graph._compiled_graph_proto is not None: + module_conf = self._belonged_graph._compiled_graph_proto.module_name2module_conf[ + self.name_prefix + self.name + ] + return operators_repr( + module_conf.ops, self._belonged_graph._compiled_graph_proto + ) return [] diff --git a/python/oneflow/nn/graph/block_config.py b/python/oneflow/nn/graph/block_config.py index da00c19f6f7..313f65f9a68 100644 --- a/python/oneflow/nn/graph/block_config.py +++ b/python/oneflow/nn/graph/block_config.py @@ -18,31 +18,23 @@ class BlockConfig(object): r"""Configurations on Module Block in nn.Graph. - When an nn.Module is added into an nn.Graph, it is wrapped into a ModuleBlock. You can set or get optimization configs on an nn.Module with it's `ModuleBlock.config`. + When an nn.Module is added into an nn.Graph, it is wrapped into a ModuleBlock. You can set or get optimization configs on an nn.Module with it's `ModuleBlock.config`. """ def __init__(self): self._is_null = True self._stage_id = None + self._stage_placement = None self._activation_checkpointing = None # NOTE(lixiang): For the normal display of docstr, the API Doc of the get and set methods are written together in the stage_id function. @property def stage_id(self): r"""Set/Get stage id of nn.Module/ModuleBlock in pipeline parallelism. - - When calling stage_id(value: int = None), set different module's stage id to hint the graph preparing right num of buffers in pipeline. - - For example: - - .. code-block:: python - - # m_stage0 and m_stage1 are the two pipeline stages of the network, respectively. - # We can set Stage ID by setting the config.stage_id attribute of Module. - # The Stage ID is numbered starting from 0 and increasing by 1. - self.module_pipeline.m_stage0.config.stage_id = 0 - self.module_pipeline.m_stage1.config.stage_id = 1 + When calling stage_id(value: int = None), set different module's stage id to hint the graph + preparing right num of buffers in pipeline. (Not Recommended, for easy and efficient pipeline + parallelism experience, please use config.set_stage(stage_id, placement)) """ return self._stage_id @@ -51,9 +43,45 @@ def stage_id(self, value: int = None): r"""Set stage id of Block in pipeline parallelism. Set different module's stage id to hint the graph preparing right num of buffers in pipeline. """ + print( + "Warning: `config.stage_id = i` is deprecated, please use \n", + " config.set_stage(i, placement) for easy and efficient Pipeline parallel experience.", + ) + self._is_null = False self._stage_id = value + def set_stage(self, stage_id: int = None, placement=None): + r"""Set stage id and placement of nn.Module/ModuleBlock in pipeline parallelism. + + Args: + stage_id (int): stage id of this module. 
+ placement (flow.placement): the placement of all tensors in this module. + + Note: + tensor.to_global(placement) will be applied automatically to all input tensors of + this module, so there is no need to write to_global() in the module forward when using + Pipeline Parallelism (writing it manually is not recommended). + + For example: + + .. code-block:: python + + # m_stage0 and m_stage1 are the two pipeline stages of the network, respectively. + # We can set Stage ID and Placement by using Module.config.set_stage() + # The Stage ID is numbered starting from 0 and increasing by 1. + # The Placement is the placement of all tensors in this module. + P_0 = flow.placement(type = "cuda", ranks = [0, 1]) + P_1 = flow.placement(type = "cuda", ranks = [2, 3]) + self.module_pipeline.m_stage0.config.set_stage(stage_id = 0, placement = P_0) + self.module_pipeline.m_stage1.config.set_stage(stage_id = 1, placement = P_1) + + """ + + self._is_null = False + self._stage_id = stage_id + self._stage_placement = placement + # NOTE(lixiang): For the normal display of docstr, the API Doc of the get and set methods are written together in the activation_checkpointing function. @property def activation_checkpointing(self): diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py index 119a121058e..1952cea3699 100644 --- a/python/oneflow/nn/graph/graph.py +++ b/python/oneflow/nn/graph/graph.py @@ -19,12 +19,13 @@ import inspect from collections import OrderedDict from functools import partial -from typing import Dict, Optional, Union, List +from typing import Dict, Optional, Union, List, Callable import weakref from google.protobuf import text_format import oneflow import oneflow._oneflow_internal +import oneflow.core.job.job_pb2 as job_pb import oneflow.framework.c_api_util as c_api_util import oneflow.framework.graph_build_util as graph_build_util import oneflow.framework.session_context as session_ctx @@ -125,6 +126,8 @@ def __init__(self): self._forward_job_proto = None # forward, backward and optimized graph job proto self._full_job_proto = None + # completed graph job proto + self._compiled_job_proto = None self._job_id = None self._args_repr = [] self._outs_repr = [] @@ -212,6 +215,9 @@ def __call__(self, *args, **kwargs): if not self._is_compiled: self._compile(*args, **kwargs) + self.__print( + 0, 2, lambda: f"{self.name} with operators:\n" + self.__repr__() + ) return self.__run(*args, **kwargs) @@ -525,23 +531,25 @@ def _shallow_repr(self): return shallow_repr def _ops_repr(self): - r"""Generate this graph's operators' string representation + r"""Generate operators' string representation of this graph """ - if self._is_compiled: - conf = self._graph_proto.module_name2module_conf[ - self._config_proto.job_name - ] - return operators_repr(conf.ops) + if self._is_compiled and self._compiled_graph_proto is not None: + module_conf = self._compiled_graph_proto.module_name2module_conf[self.name] + return operators_repr(module_conf.ops, self._compiled_graph_proto) + return [] - def __print(self, s_level=2, v_level=0, msg: str = ""): + def __print(self, s_level=2, v_level=0, msg=None): r"""Do print according to info level.""" assert isinstance(s_level, int) assert isinstance(v_level, int) - assert isinstance(msg, str) + assert isinstance(msg, str) or isinstance(msg, Callable) if s_level >= self._debug_min_s_level: if (s_level > 0) or (s_level == 0 and v_level <= self._debug_max_v_level): - print(msg, flush=True) + if isinstance(msg, str): + print(msg, flush=True) + elif isinstance(msg, Callable): +
print(msg(), flush=True) @property def _config_proto(self): @@ -581,6 +589,17 @@ def _full_graph_proto(self, full_job_proto): self._full_job_proto = full_job_proto self._c_nn_graph.job = full_job_proto.SerializeToString() + @property + def _compiled_graph_proto(self): + if not self._is_compiled: + self.__print( + 2, + 0, + f"[ERROR]{self._shallow_repr()} has not been compiled, so it's compiled graph proto is None." + " You can call the graph to trigger it's compilation.", + ) + return self._compiled_job_proto + def _generate_name(self): child_name = self.__class__.__name__ if Graph._child_init_cnt.get(child_name) is None: @@ -782,6 +801,11 @@ def finish_complie_and_init_runtime(self): self._debug_max_py_stack_depth, ): self._c_nn_graph.complie_and_init_runtime() + # Get compiled job + compiled_job_str = self._c_nn_graph.get_current_job_str() + self._compiled_job_proto = job_pb.Job() + self._compiled_job_proto.ParseFromString(compiled_job_str) + compile_and_init_end = time.perf_counter() self.__print( 0, @@ -1336,6 +1360,13 @@ def __getattr__(self, name: str): ) def __del__(self): + # Ensure vm has finished running this graph. + if self._session._env.is_shutting_down(): + # After python shutting down, it's not safe to call oneflow._oneflow_internal.eager. + # But shutting down will do sync in SwitchToShuttingDownPhase. + # So it's safe to skip sync here. + return + oneflow._oneflow_internal.eager.Sync() current_env_enable_mlir_inference_opt = os.getenv( "ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION" ) @@ -1345,13 +1376,6 @@ def __del__(self): os.environ[ "ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION" ] = self.env_enable_mlir_inference_opt - # Ensure vm has finished running this graph. - if self._session._env.is_shutting_down(): - # After python shutting down, it's not safe to call oneflow._oneflow_internal.eager. - # But shutting down will do sync in SwitchToShuttingDownPhase. - # So it's safe to skip sync here. - return - oneflow._oneflow_internal.eager.Sync() oneflow._oneflow_internal.ClearVariableTensorMgr() def __ensure_input_tensors_contiguous(self, *args, **kwargs): diff --git a/python/oneflow/nn/graph/graph_config.py b/python/oneflow/nn/graph/graph_config.py index ea48ad8d957..d367ca5c333 100644 --- a/python/oneflow/nn/graph/graph_config.py +++ b/python/oneflow/nn/graph/graph_config.py @@ -278,6 +278,16 @@ def build(self, x): """ self.proto.cudnn_conv_heuristic_search_algo = mode + def disable_straighten_algorithm(self, mode: bool = False): + r""" Whether we disable the straighten algorithm. + + If using nccl compute stream, turning it on might not speed up the training. + If not using nccl compute stream, turning it on might slow down data parallelism by 0.6% and slow down model parallelism by 6%. + + The switch is off by default (i.e. use the straighten algorithm by default). + """ + self.proto.disable_straighten_algorithm_in_task_graph = mode + def _generate_optimizer_and_variable_configs( self, opt_dict: OptDict = None, variables_conf: OrderedDict = None, ): diff --git a/python/oneflow/nn/graph/util.py b/python/oneflow/nn/graph/util.py index 41d631d8894..caa1c905f5f 100644 --- a/python/oneflow/nn/graph/util.py +++ b/python/oneflow/nn/graph/util.py @@ -14,57 +14,206 @@ limitations under the License. 
""" import sys +from string import Template from collections import OrderedDict -import oneflow.core.operator.op_conf_pb2 as op_conf_util -from oneflow.framework.tensor import Tensor from typing import Callable, Dict, Union, List, Tuple -from string import Template + import google.protobuf as protobuf +from google.protobuf import text_format + +import oneflow +import oneflow.core.job.job_pb2 as job_pb +import oneflow.core.operator.op_conf_pb2 as op_conf_util +from oneflow.framework.tensor import Tensor + + +def _nd_sbp2repr(nd_sbp): + dim_len = len(nd_sbp.sbp_parallel) + nd_sbp_str = "sbp=(" + for i in range(dim_len): + if i > 0: + nd_sbp_str += ", " + sbp = nd_sbp.sbp_parallel[i] + if sbp.HasField("broadcast_parallel"): + nd_sbp_str += "B" + elif sbp.HasField("partial_sum_parallel"): + nd_sbp_str += "P" + elif sbp.HasField("split_parallel"): + nd_sbp_str += "S(" + str(sbp.split_parallel.axis) + ")" + nd_sbp_str += ")" + return nd_sbp_str + + +def _blob_desc_repr(blob_desc): + desc_str = "size=(" + for i in range(len(blob_desc.shape.dim)): + if i > 0: + desc_str += ", " + desc_str += str(blob_desc.shape.dim[i]) + desc_str += "), " + desc_str += "dtype=(" + desc_str += str(oneflow.dtype.get(int(blob_desc.data_type))) + desc_str += ")" + return desc_str + + +def _get_args_repr(ordered_bn, bn2lbn, bn2nd_sbp, lbn2blob_desc): + arg_repr_list = [] + for bn in ordered_bn: + lbns = list(bn2lbn[bn].s) + + # sbp repr + sub_bns_sbp = [] + for bn_idx in range(len(lbns)): + sub_bn = bn + "_" + str(bn_idx) + nd_sbp = bn2nd_sbp[sub_bn] + sub_bns_sbp.append(_nd_sbp2repr(nd_sbp)) + + # TODO: placement repr + + # shape repr and dtype + sub_bns_desc = [] + for bn_idx in range(len(lbns)): + sub_bns_desc.append(_blob_desc_repr(lbn2blob_desc[lbns[bn_idx]])) + + # sub arg repr + sub_arg_repr_list = [] + for bn_idx in range(len(lbns)): + sub_arg_repr_list.append( + lbns[bn_idx] + + ":(" + + sub_bns_sbp[bn_idx] + + ", " + + sub_bns_desc[bn_idx] + + ")" + ) + + if len(lbns) > 1: # arg of multiple tensors + arg_repr_list.append("[" + (", ").join(sub_arg_repr_list) + "]") + else: + assert len(lbns) == 1 + arg_repr_list.append(sub_arg_repr_list[0]) + + return arg_repr_list + + +def _get_user_op_io_repr(op_conf, bn2nd_sbp, lbn2blob_desc): + user_op_conf = op_conf.user_conf + input_sig_str = ", ".join( + _get_args_repr( + user_op_conf.input_order, user_op_conf.input, bn2nd_sbp, lbn2blob_desc + ) + ) + output_sig_str = ", ".join( + _get_args_repr( + user_op_conf.output_order, user_op_conf.output, bn2nd_sbp, lbn2blob_desc + ) + ) + return input_sig_str, output_sig_str + + +def _get_var_op_io_repr(op_conf, bn2nd_sbp, lbn2blob_desc): + input_sig_str = "" + var_op_conf = op_conf.variable_conf + output_lbn = op_conf.name + "/" + var_op_conf.out + output_sig_str = var_op_conf.out + nd_sbp = bn2nd_sbp[var_op_conf.out] + output_sig_str += ( + ":" + _nd_sbp2repr(nd_sbp) + ", " + _blob_desc_repr(lbn2blob_desc[output_lbn]) + ) + return input_sig_str, output_sig_str + + +def _get_iden_op_io_repr(op_conf, bn2nd_sbp, lbn2blob_desc): + iden_op_conf = op_conf.identity_conf + input_lbn = getattr(iden_op_conf, "in") + input_sig_str = ( + input_lbn + + ":" + + _nd_sbp2repr(bn2nd_sbp["in"]) + + ", " + + _blob_desc_repr(lbn2blob_desc[input_lbn]) + ) + + output_lbn = op_conf.name + "/" + iden_op_conf.out + output_sig_str = iden_op_conf.out + nd_sbp = bn2nd_sbp[iden_op_conf.out] + output_sig_str += ( + ":" + _nd_sbp2repr(nd_sbp) + ", " + _blob_desc_repr(lbn2blob_desc[output_lbn]) + ) + + return input_sig_str, output_sig_str def operators_repr( 
- ops: protobuf.pyext._message.RepeatedCompositeContainer, + ops: protobuf.pyext._message.RepeatedCompositeContainer, graph_proto: job_pb.Job ) -> List[str]: - r"""Generate operators' string representation + r"""Generate operators' string representation of this module """ + if len(ops) > 0: + op_confs = dict() + for op_conf in graph_proto.net.op: + op_confs[op_conf.name] = op_conf + + op2placement = dict() + for group in graph_proto.placement.placement_group: + parallel_conf = group.parallel_conf + for op_name in group.op_set.op_name: + op2placement[op_name] = str( + oneflow.placement( + proto_str=text_format.MessageToString(parallel_conf) + ) + ) - def _op_signature(op: op_conf_util.OperatorConf) -> str: - - signature_template = Template(op.name + "($input) -> ($output)") + def _op_signature(op: op_conf_util.OperatorConf) -> Tuple[bool, str]: + bn2nd_sbp = graph_proto.job_parallel_view_conf.op_name2nd_sbp_signature_conf[ + op.name + ].bn_in_op2nd_sbp + lbn2blob_desc = graph_proto.helper.lbn2logical_blob_desc + signature_template = Template( + op.name + + "($input) -> ($output)" + + ":placement=(" + + op2placement[op.name] + + ")" + ) input_sig_str = "..." output_sig_str = "..." - # only deal with UserOpConf and VariableOpConf for now + # Only deal with UserOpConf and VariableOpConf for now. if op.HasField("user_conf"): - user_conf = op.user_conf - input_params = [] - for param in user_conf.input_order: - x = user_conf.input[param].s - if len(x) > 1: # param of multiple tensors - input_params.append("[" + (", ").join(list(x)) + "]") - else: - assert len(x) == 1 - input_params.append(x[0]) - input_sig_str = ", ".join(input_params) - - output_params = [] - for param in user_conf.output_order: - x = user_conf.output[param].s - if len(x) > 1: - output_params.append("[" + (", ").join(list(x)) + "]") - else: - assert len(x) == 1 - output_params.append(x[0]) - output_sig_str = ", ".join(output_params) - + input_sig_str, output_sig_str = _get_user_op_io_repr( + op, bn2nd_sbp, lbn2blob_desc + ) elif op.HasField("variable_conf"): - variable_conf = op.variable_conf - input_sig_str = "" - output_sig_str = op.name + "/" + variable_conf.out - - return signature_template.substitute(input=input_sig_str, output=output_sig_str) + input_sig_str, output_sig_str = _get_var_op_io_repr( + op, bn2nd_sbp, lbn2blob_desc + ) + elif op.HasField("identity_conf"): + input_sig_str, output_sig_str = _get_iden_op_io_repr( + op, bn2nd_sbp, lbn2blob_desc + ) + elif op.name.startswith("System-"): + return False, "" - return map(lambda op: "(OPERATOR: " + _op_signature(op) + ")", ops) + op_str = "(OPERATOR: " + op_str += signature_template.substitute( + input=input_sig_str, output=output_sig_str + ) + op_str += ")" + + return True, op_str + + ops_strs = [] + for op in ops: + if op not in op_confs: + continue + op_conf = op_confs[op] + assert isinstance(op_conf, op_conf_util.OperatorConf) + got_repr, op_str = _op_signature(op_conf) + if got_repr: + ops_strs.append(op_str) + return ops_strs def add_indent(in_s, num_spaces): diff --git a/python/oneflow/nn/modules/fused_mlp.py b/python/oneflow/nn/modules/fused_mlp.py index fb117cffcbb..5efa1deda5d 100644 --- a/python/oneflow/nn/modules/fused_mlp.py +++ b/python/oneflow/nn/modules/fused_mlp.py @@ -91,8 +91,6 @@ def __init__( if self.dropout_rate_list[i] != 0.0: self.use_dropout = True break - if not self.training: - self.use_dropout = False def add_parameters(self) -> None: """Register parameter in FusedMLP module. 
@@ -184,7 +182,11 @@ def reset_parameters(self) -> None: flow.nn.init.uniform_(self.bias(layer_idx), -bound, bound) def forward(self, x): - if self.use_dropout: + if not self.training or not self.use_dropout: + return flow._C.fused_mlp( + x, self.weights(), self.biases(), self.skip_final_activation + ) + else: return flow._C.fused_matmul_bias_add_relu_dropout( x, self.weights(), @@ -192,10 +194,6 @@ def forward(self, x): self.skip_final_activation, self.dropout_rate_list, ) - else: - return flow._C.fused_mlp( - x, self.weights(), self.biases(), self.skip_final_activation - ) def extra_repr(self) -> str: return "in_features={}, hidden_features={}, out_features={}, skip_final_activation={}".format( diff --git a/python/oneflow/nn/modules/loss.py b/python/oneflow/nn/modules/loss.py index 1a0310b3f78..a03087cf8fb 100644 --- a/python/oneflow/nn/modules/loss.py +++ b/python/oneflow/nn/modules/loss.py @@ -33,7 +33,7 @@ def __init__( self, weight: Optional[Tensor] = None, reduction: str = "mean" ) -> None: super(_WeightedLoss, self).__init__(reduction=reduction) - self.weight = weight + self.register_buffer("weight", weight) class L1Loss(_Loss): diff --git a/python/oneflow/nn/modules/slice.py b/python/oneflow/nn/modules/slice.py index c17068247f6..c0c36d2cff5 100644 --- a/python/oneflow/nn/modules/slice.py +++ b/python/oneflow/nn/modules/slice.py @@ -44,10 +44,8 @@ def slice_op(input, slice_tup_list: Sequence[Tuple[int, int, int]]): return flow._C.slice(input, start, stop, step) -def logical_slice_assign_op( - input, update, slice_tup_list: Sequence[Tuple[int, int, int]] -): - """Update a slice of tensor `x`(in-place). Like `x[start:stop:step] = update`. +def slice_update_op(input, update, slice_tup_list: Sequence[Tuple[int, int, int]]): + """Update a slice of tensor `x`. Like `x[start:stop:step] = update`. Args: x: A `Tensor`, whose slice will be updated. @@ -63,8 +61,7 @@ def logical_slice_assign_op( >>> input = flow.Tensor(np.array([1, 1, 1, 1, 1]).astype(np.float32)) >>> update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32)) - >>> y = flow.logical_slice_assign(input, update, slice_tup_list=[[1, 4, 1]]) - >>> input + >>> flow.slice_update(input, update, slice_tup_list=[[1, 4, 1]]) tensor([1., 2., 3., 4., 1.], dtype=oneflow.float32) """ @@ -72,34 +69,7 @@ def logical_slice_assign_op( (start, stop, step) = parse_slice_tuple_list(slice_tup_list, input.shape) if update.dtype != input.dtype: update = update.to(dtype=input.dtype) - return flow._C.logical_slice_assign(input, update, start, stop, step) - - -def logical_slice_op(input, slice_tup_list: Sequence[Tuple[int, int, int]]): - """Extracts a slice from a global tensor. - The `slice_tup_list` assigns the slice indices in each dimension, the format is (start, stop, step). - The operator will slice the tensor according to the `slice_tup_list`. - - Args: - input: A `Tensor`. - slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step). - - For example: - - .. 
code-block:: python - - >>> import oneflow as flow - - >>> placement = flow.placement("cpu", ranks=[0]) - >>> x = flow.Tensor([[1, 2], [3, 4]], placement=placement, sbp=flow.sbp.broadcast) - >>> y = flow.logical_slice(x, slice_tup_list=[[0, 1, 1]]) - >>> y.numpy() - array([[1., 2.]], dtype=float32) - - """ - - (start, stop, step) = parse_slice_tuple_list(slice_tup_list, input.shape) - return flow._C.logical_slice(input, start, stop, step) + return flow._C.slice_update(input, update, start, stop, step, inplace=True) if __name__ == "__main__": diff --git a/python/oneflow/nn/optimizer/polynomial_lr.py b/python/oneflow/nn/optimizer/polynomial_lr.py index 8b986203586..a9fa85f8132 100644 --- a/python/oneflow/nn/optimizer/polynomial_lr.py +++ b/python/oneflow/nn/optimizer/polynomial_lr.py @@ -36,13 +36,13 @@ class PolynomialLR(LRScheduler): .. math:: \begin{aligned} - & decay\_batch = min(decay\_batch, current\_batch) \\ + & current\_batch = min(decay\_batch, current\_batch) \\ & learning\_rate = (base\_lr-end\_lr)*(1-\frac{current\_batch}{decay\_batch})^{power}+end\_lr \end{aligned} Args: optimizer (Optimizer): Wrapper optimizer. - steps (int): The decayed steps. + decay_batch (int): The decayed steps. end_learning_rate (float, optional): The final learning rate. Defaults to 0.0001. power (float, optional): The power of polynomial. Defaults to 1.0. cycle (bool, optional): If cycle is True, the scheduler will decay the learning rate every decay steps. Defaults to False. @@ -55,7 +55,7 @@ class PolynomialLR(LRScheduler): ... polynomial_scheduler = flow.optim.lr_scheduler.PolynomialLR( - optimizer, steps=5, end_learning_rate=0.00001, power=2 + optimizer, decay_batch=5, end_learning_rate=0.00001, power=2 ) for epoch in range(num_epoch): @@ -66,15 +66,17 @@ class PolynomialLR(LRScheduler): def __init__( self, optimizer, - steps: int, + decay_batch: int, end_learning_rate: float = 0.0001, power: float = 1.0, cycle: bool = False, last_step: int = -1, verbose: bool = False, ): - assert steps > 0, f"steps must greater than zero, but got {steps}" - self.max_decay_steps = steps + assert ( + decay_batch > 0 + ), f"decay_batch must greater than zero, but got {decay_batch}" + self.max_decay_steps = decay_batch self.end_learning_rate = end_learning_rate self.power = power self.cycle = cycle diff --git a/python/oneflow/test/README.md b/python/oneflow/test/README.md index 1bb55344fd4..7ada2be57d5 100644 --- a/python/oneflow/test/README.md +++ b/python/oneflow/test/README.md @@ -3,7 +3,7 @@ |op name | Doc Test | Compatiable/Completeness Test | Exception | | ------------------------- | ------------- | ----------------------------- | --------- | -| oneflow.Tensor | [oneflow.tensor](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L20) | [tensor_scatter_nd_update](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_scatter_nd_update.py#L91) | | +| oneflow.Tensor | [oneflow.tensor](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L20) | [tensor_init](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_generator.py#L161) | 
[tensordot_neg_dims_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_tensordot.py#L25) | | oneflow.BoolTensor | | | | | oneflow.ByteTensor | | | | | oneflow.CharTensor | | | | @@ -12,107 +12,107 @@ | oneflow.HalfTensor | | | | | oneflow.IntTensor | | | | | oneflow.LongTensor | | | | -| oneflow.Size | [oneflow.Tensor.size](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1127) | | | -| oneflow.abs | [oneflow.Tensor.abs](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L471) | [abs_with_ndim_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_abs.py#L34) | | -| oneflow.acos | [oneflow.Tensor.acos](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L478) | [acos_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L348) | | -| oneflow.acosh | [oneflow.Tensor.acosh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L492) | [acosh_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L368) | | +| oneflow.Size | [oneflow.Tensor.size](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1319) | | [splitwithsize_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L239) | +| oneflow.abs | [oneflow.Tensor.abs](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L628) | [abs_with_0_size_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_abs.py#L27) | | +| oneflow.acos | [oneflow.Tensor.acos](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L635) | [acos_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L348) | | +| oneflow.acosh | [oneflow.Tensor.acosh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L649) | [acosh_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L368) | | | oneflow.adaptive_avg_pool1d | | | | | oneflow.adaptive_avg_pool2d | | | | | oneflow.adaptive_avg_pool3d | | | | -| oneflow.add | 
[oneflow.Tensor.add](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L985) | [add_with_alpha](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_add.py#L198) | | -| oneflow.addmm | [oneflow.Tensor.addmm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L992) | [addmm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_addmm.py#L60) | | -| oneflow.any | | [any_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_logical_reduce.py#L47) | | -| oneflow.arange | [oneflow.arange](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/arange.py#L20) | [arange](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_arange.py#L58) | | -| oneflow.arccos | [oneflow.Tensor.arccos](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L485) | [arccos_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L338) | | -| oneflow.arccosh | [oneflow.Tensor.arccosh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L499) | [arccosh_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L358) | | -| oneflow.arcsin | [oneflow.Tensor.arcsin](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1013) | | | -| oneflow.arcsinh | [oneflow.Tensor.arcsinh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1020) | | | -| oneflow.arctan | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L506) | [arctan_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L440) | | -| oneflow.arctanh | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L506) | [arctanh_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L462) | | -| oneflow.argmax | 
[oneflow.argmax](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L139) | [argmax](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_argmax.py#L83) | | -| oneflow.argmin | [oneflow.argmin](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L169) | [argmin](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argmin.py#L34) | | -| oneflow.argsort | [oneflow.Tensor.argsort](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L527) | [argsort](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argsort.py#L36) | | -| oneflow.argwhere | [oneflow.Tensor.argwhere](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L534) | [argwhere](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L625) | | +| oneflow.add | [oneflow.Tensor.add](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1163) | [padding_idx](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_sparse.py#L140) | [add_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_binary_functor_exception.py#L27) | +| oneflow.addmm | [oneflow.Tensor.addmm](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1170) | [addmm](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_addmm.py#L60) | | +| oneflow.any | [oneflow.any](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/reduce_ops.py#L219) | [any_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_logical_reduce.py#L52) | | +| oneflow.arange | [oneflow.arange](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/arange.py#L20) | [arange](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_arange.py#L58) | | +| oneflow.arccos | [oneflow.Tensor.arccos](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L642) | 
[arccos_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L338) | | +| oneflow.arccosh | [oneflow.Tensor.arccosh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L656) | [arccosh_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L358) | | +| oneflow.arcsin | [oneflow.Tensor.arcsin](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1205) | | | +| oneflow.arcsinh | [oneflow.Tensor.arcsinh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1212) | | | +| oneflow.arctan | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L663) | [arctan_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L438) | | +| oneflow.arctanh | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L663) | [arctanh_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L460) | | +| oneflow.argmax | [oneflow.argmax](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L139) | [argmax](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_argmax.py#L83) | [argmax_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L22) | +| oneflow.argmin | [oneflow.argmin](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L169) | [argmin](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argmin.py#L34) | | +| oneflow.argsort | [oneflow.Tensor.argsort](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L684) | [argsort](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argsort.py#L37) | | +| oneflow.argwhere | [oneflow.Tensor.argwhere](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L691) | 
[argwhere](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L672) | | | oneflow.as_strided | | | | | oneflow.as_tensor | | | | -| oneflow.asin | [oneflow.Tensor.asin](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1006) | | | -| oneflow.asinh | [oneflow.asinh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L298) | | | -| oneflow.atan | [oneflow.Tensor.atan2](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L122) | [atanh_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L412) | | -| oneflow.atan2 | [oneflow.Tensor.atan2](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L122) | | | -| oneflow.atanh | [oneflow.Tensor.atanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L541) | [atanh_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L412) | | -| oneflow.autograd | | [autograd_interface](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_autograd.py#L81) | | +| oneflow.asin | [oneflow.Tensor.asin](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1198) | | | +| oneflow.asinh | [oneflow.asinh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L318) | | | +| oneflow.atan | [oneflow.atan2](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/trigonometric_ops.py#L21) | [atanh_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L410) | | +| oneflow.atan2 | [oneflow.atan2](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/trigonometric_ops.py#L21) | | | +| oneflow.atanh | [oneflow.Tensor.atanh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L698) | [atanh_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L410) | | +| oneflow.autograd | | 
[autograd_interface](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_autograd.py#L81) | | | oneflow.batch_gather | | | | -| oneflow.bernoulli | [oneflow.bernoulli](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/random.py#L20) | [bernoulli](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_bernoulli.py#L49) | | +| oneflow.bernoulli | [oneflow.bernoulli](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/random.py#L20) | [bernoulli](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_bernoulli.py#L49) | | | oneflow.bfloat16 | | | | -| oneflow.bmm | [oneflow.Tensor.bmm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L695) | [bmm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_bmm.py#L93) | | -| oneflow.bool | | [bool_add](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_add.py#L212) | | +| oneflow.bmm | [oneflow.bmm](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/bmm.py#L20) | [bmm](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_bmm.py#L93) | [bmm_exception_dim_not_right](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_bmm.py#L25) | +| oneflow.bool | | [bool_add](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_add.py#L212) | | | oneflow.boxing | | | | | oneflow.broadcast_like | | | | -| oneflow.cast | [oneflow.broadcast_like](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/broadcast_like.py#L20) | [cast](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_flatten.py#L63) | | -| oneflow.cat | [oneflow.cat](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L333) | [scatter_1n4d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_comm_ops.py#L84) | | -| oneflow.ceil | [oneflow.Tensor.ceil](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1440) | [ceil_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_ceil.py#L29) 
| | +| oneflow.cast | [oneflow.Tensor.cast](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L901) | [broadcast_mul](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_mul.py#L193) | [broadcast_like_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L28) | +| oneflow.cat | [oneflow.cat](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L333) | [scatter_nd](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_scatter_nd.py#L56) | [concat_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L37) | +| oneflow.ceil | [oneflow.Tensor.ceil](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1653) | [ceil_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_ceil.py#L29) | | | oneflow.char | | | | -| oneflow.chunk | [oneflow.Tensor.chunk](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L702) | | | -| oneflow.clamp | [oneflow.Tensor.clamp](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1266) | [clamp](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_clamp.py#L96) | | +| oneflow.chunk | [oneflow.Tensor.chunk](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L859) | [chunk](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_chunk.py#L37) | [chunk_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L254) | +| oneflow.clamp | [oneflow.clamp](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/clamp.py#L20) | [clamp](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_clamp.py#L96) | | | oneflow.clamp_ | | | | -| oneflow.clip | [oneflow.Tensor.clip](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1280) | [clip_grad](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_clip_grad.py#L152) | | +| 
oneflow.clip | [oneflow.clip](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/clamp.py#L70) | [clip_grad](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_clip_grad.py#L152) | | | oneflow.clip_ | | | | -| oneflow.concat | | [concat](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_concat.py#L124) | | +| oneflow.concat | | [concat](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_concat.py#L124) | [concat_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L37) | | oneflow.constant_initializer | | | | | oneflow.convert_oneflow_dtype_to_numpy_dtype | | | | -| oneflow.cos | [oneflow.Tensor.acos](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L478) | [cos](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L88) | | -| oneflow.cosh | [oneflow.Tensor.acosh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L492) | [arccosh_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L358) | | -| oneflow.cumprod | [oneflow.cumprod](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1576) | [cumprod](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_cum_ops.py#L37) | | -| oneflow.cumsum | [oneflow.cumsum](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1543) | [cumsum](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_cumsum.py#L36) | | -| oneflow.device | [oneflow.Tensor.device](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L84) | | | -| oneflow.diag | [oneflow.diagonal](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L20) | [diag](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_diag.py#L35) | | -| oneflow.diagonal | [oneflow.diagonal](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L20) | 
[diagonal](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_diagonal.py#L43) | | +| oneflow.cos | [oneflow.Tensor.acos](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L635) | [cos](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L88) | [cosine_similarity_not_floating_type](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_cosine_similarity.py#L24) | +| oneflow.cosh | [oneflow.Tensor.acosh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L649) | [arccosh_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L358) | | +| oneflow.cumprod | [oneflow.cumprod](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1723) | [cumprod](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_cum_ops.py#L38) | | +| oneflow.cumsum | [oneflow.cumsum](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1690) | [cumsum](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_cumsum.py#L37) | | +| oneflow.device | [oneflow.Tensor.device](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L85) | | [device_type](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_device.py#L25) | +| oneflow.diag | [oneflow.diagonal](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L20) | [diag](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_diag.py#L35) | [diagonal_index_error1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L204) | +| oneflow.diagonal | [oneflow.diagonal](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L20) | [diagonal](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_diagonal.py#L44) | [diagonal_index_error1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L204) | | 
oneflow.distributed_partial_fc_sample | | | | -| oneflow.div | [oneflow.Tensor.div_](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L893) | [div](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L478) | | +| oneflow.div | [oneflow.Tensor.div_](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1071) | [div](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L501) | [div_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_binary_functor_exception.py#L63) | | oneflow.div_ | | | | -| oneflow.dot | [oneflow.dot](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1262) | [dot](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_dot.py#L26) | | -| oneflow.double | [oneflow.Tensor.double](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1673) | [double](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_ops.py#L128) | | +| oneflow.dot | [oneflow.dot](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1370) | [tensordot_intdim](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_tensordot.py#L28) | [tensordot_neg_dims_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_tensordot.py#L25) | +| oneflow.double | [oneflow.Tensor.double](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1936) | [double](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_ops.py#L200) | | | oneflow.dtype | | | | | oneflow.dtypes | | | | -| oneflow.einsum | [oneflow.einsum](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/einsum.py#L20) | [einsum_bilinear_transformation](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_einsum_bilinear_transformation.py#L42) | | -| oneflow.empty | | [empty_consistent](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_empty.py#L54) | | -| oneflow.eq | 
[oneflow.Tensor.requires_grad](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L621) | [eq_with_0_size_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_eq.py#L32) | | +| oneflow.einsum | [oneflow.einsum](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/einsum.py#L20) | [einsum_alphaflod_usecase11](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_einsum_alphaflod_usecase11.py#L38) | | +| oneflow.empty | | [empty_consistent](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_empty.py#L76) | | +| oneflow.eq | [oneflow.Tensor.requires_grad](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L778) | [eq](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_eq.py#L38) | | | oneflow.equal | | | | -| oneflow.erf | [oneflow.Tensor.erf](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L763) | [erf](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_erf.py#L35) | | -| oneflow.erfc | [oneflow.Tensor.erfc](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L772) | [erfc](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_erfc.py#L35) | | -| oneflow.erfinv | [oneflow.Tensor.erfinv](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L781) | [erfinv_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L702) | | +| oneflow.erf | [oneflow.Tensor.erf](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L941) | [erf](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_erf.py#L35) | | +| oneflow.erfc | [oneflow.Tensor.erfc](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L950) | [erfc](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_erfc.py#L35) | | +| oneflow.erfinv | 
[oneflow.Tensor.erfinv](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L959) | [erfinv_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L700) | | | oneflow.erfinv_ | | | | -| oneflow.exp | [oneflow.Tensor.expand](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L129) | [expand_broadcast](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_expand_op.py#L208) | | -| oneflow.expand | [oneflow.Tensor.expand](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L129) | [expand_broadcast](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_expand_op.py#L208) | | -| oneflow.expm1 | [oneflow.Tensor.expm1](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1447) | [expm1](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_expm1.py#L46) | | -| oneflow.eye | [oneflow.eye](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1382) | [eye](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_eye.py#L50) | | -| oneflow.flatten | [oneflow.Tensor.flatten](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L154) | [flatten](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_flatten.py#L38) | | -| oneflow.flip | [oneflow.Tensor.flip](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L168) | [flip](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_flip.py#L40) | | -| oneflow.float | [oneflow.Tensor.float](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1652) | [float](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_ops.py#L114) | | +| oneflow.exp | [oneflow.Tensor.expand](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L130) | [expm1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_expm1.py#L35) | 
[expand_dim_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L78) | +| oneflow.expand | [oneflow.Tensor.expand](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L130) | [expand_compare_with_numpy](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_expand.py#L206) | [expand_dim_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L78) | +| oneflow.expm1 | [oneflow.Tensor.expm1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1660) | [expm1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_expm1.py#L35) | | +| oneflow.eye | [oneflow.eye](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1529) | [eye](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_eye.py#L50) | | +| oneflow.flatten | [oneflow.flatten](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/flatten.py#L20) | [flatten_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_flatten.py#L71) | | +| oneflow.flip | [oneflow.Tensor.flip](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L169) | [flip](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_flip.py#L40) | | +| oneflow.float | [oneflow.Tensor.float](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1915) | [float](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_ops.py#L186) | | | oneflow.float16 | | | | | oneflow.float32 | | | | | oneflow.float64 | | | | -| oneflow.floor | [oneflow.Tensor.floor](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L161) | [floor](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_floor.py#L49) | | +| oneflow.floor | [oneflow.Tensor.floor](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L162) | 
[floor](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_floor.py#L49) | | | oneflow.floor_ | | | | | oneflow.floor_divide | | | | -| oneflow.fmod | [oneflow.Tensor.fmod](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1370) | [fmod_with_0_size_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L832) | | +| oneflow.fmod | [oneflow.Tensor.fmod](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1583) | [fmod_with_0_size_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L885) | | | oneflow.from_numpy | | | | -| oneflow.full | | [full_with_random_data_int](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_constant.py#L115) | | -| oneflow.gather | [oneflow.gather](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L367) | [gather_1n4d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_comm_ops.py#L106) | | +| oneflow.full | | [full_with_random_data_int](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_constant.py#L126) | | +| oneflow.gather | [oneflow.gather](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L367) | [gather_1n4d](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_comm_ops.py#L106) | [gather_index_type_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L120) | | oneflow.gather_nd | | | | -| oneflow.ge | [oneflow.gelu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L74) | [image_normalize](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_image_normalize.py#L75) | | -| oneflow.gelu | [oneflow.gelu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L74) | [gelu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L147) | | +| oneflow.ge | [oneflow.arange](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/arange.py#L20) | 
[generator_manual_seed](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_generator.py#L72) | [get_sbp_with_invalid_axis](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_local_global_convert_error.py#L24) | +| oneflow.gelu | [oneflow.Tensor.gelu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1017) | [gelu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L149) | | | oneflow.glorot_normal_initializer | | | | | oneflow.glorot_uniform_initializer | | | | | oneflow.grad_enable | | | | -| oneflow.greater | [oneflow.greater](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/comparison.py#L21) | [greater_equal](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_greater_equal.py#L38) | | +| oneflow.greater | [oneflow.greater](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/comparison.py#L21) | [greater](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_greater.py#L44) | | | oneflow.greater_equal | | | | -| oneflow.gt | [oneflow.Tensor.gt](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L857) | | | -| oneflow.half | | | | -| oneflow.hsplit | [oneflow.hsplit](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1459) | | | +| oneflow.gt | [oneflow.Tensor.gt](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1035) | | | +| oneflow.half | [oneflow.Tensor.half](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1449) | [half](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L1065) | | +| oneflow.hsplit | [oneflow.hsplit](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1606) | | | | oneflow.in_top_k | | | | | oneflow.index_select | | | | -| oneflow.int | [oneflow.Tensor.int](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1610) | [interpolate](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_interpolate.py#L658) | | +| oneflow.int | 
[oneflow.Tensor.int](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1873) | [randint](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_randint.py#L99) | | | oneflow.int32 | | | | | oneflow.int64 | | | | | oneflow.int8 | | | | @@ -121,138 +121,137 @@ | oneflow.is_nonzero | | | | | oneflow.is_tensor | | | | | oneflow.kaiming_initializer | | | | -| oneflow.le | [oneflow.tile](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tile.py#L20) | [upsample2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_upsample.py#L380) | | +| oneflow.le | [oneflow.tile](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tile.py#L20) | [less_equal](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_less_equal.py#L84) | [reflect_pad_size_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L107) | | oneflow.linalg_flow | | | | | oneflow.linalg_matrix_norm | | | | | oneflow.linalg_norm | | | | | oneflow.linalg_vector_norm | | | | -| oneflow.linspace | | [linspace_int_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_linspace.py#L32) | | -| oneflow.log | [oneflow.Tensor.logical_not](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L355) | [logical_slice_assign](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_slice.py#L171) | | -| oneflow.log1p | [oneflow.Tensor.log1p](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L864) | [log1p_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_log1p.py#L31) | | -| oneflow.log2 | [oneflow.log2](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L928) | | | +| oneflow.linspace | | [linspace_int_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_linspace.py#L32) | | +| oneflow.log | [oneflow.Tensor.logical_not](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L512) | [logical_or](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_logical_or.py#L58) | | +| oneflow.log1p | 
[oneflow.Tensor.log1p](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1042) | [log1p_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_log1p.py#L31) | | +| oneflow.log2 | [oneflow.log2](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L948) | [log2_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L808) | | | oneflow.log_softmax | | | | | oneflow.logical_and | | | | | oneflow.logical_not | | | | | oneflow.logical_or | | | | -| oneflow.logical_slice | | | | -| oneflow.logical_slice_assign | | | | | oneflow.logical_xor | | | | -| oneflow.long | [oneflow.Tensor.long](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1631) | [long](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_ops.py#L86) | | -| oneflow.lt | [oneflow.Tensor.lt](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L802) | [multistep_lr](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L160) | | +| oneflow.long | [oneflow.Tensor.long](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1894) | [long](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_tensor_ops.py#L144) | | +| oneflow.lt | [oneflow.Tensor.lt](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L980) | [multi_input](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_autograd_function.py#L54) | [multi_input_with_diff_device](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_multi_input_with_diff_device_or_placement.py#L27) | | oneflow.manual_seed | | | | | oneflow.masked_fill | | | | | oneflow.masked_select | | | | -| oneflow.matmul | [oneflow.Tensor.matmul](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L443) | [matmul](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_matmul.py#L42) | | -| oneflow.max | [oneflow.argmax](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L139) | 
[maxpool](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_maxpool.py#L219) | | -| oneflow.maximum | [oneflow.maximum](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L977) | [maximum_minimum_with_same_input](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_maximum_minimum.py#L93) | | -| oneflow.mean | [oneflow.Tensor.mean](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1504) | [mean](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_mean.py#L33) | | -| oneflow.meshgrid | [oneflow.meshgrid](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/meshgrid.py#L20) | [meshgrid](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_meshgrid.py#L68) | | -| oneflow.min | [oneflow.argmin](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L169) | [min_max_observer](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_min_max_observer.py#L136) | | -| oneflow.minimum | [oneflow.minimum](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L955) | | | -| oneflow.mish | [oneflow.mish](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L254) | [mish_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L182) | | -| oneflow.movedim | [oneflow.movedim](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1320) | [movedim](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_movedim.py#L37) | | -| oneflow.mul | [oneflow.Tensor.matmul](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L443) | [mul_with_scalar](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_mul.py#L47) | | -| oneflow.narrow | [oneflow.Tensor.narrow](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L450) | 
[narrow](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_narrow.py#L34) | | -| oneflow.ne | [oneflow.decode_onerec](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/dataset.py#L20) | [ones_like](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_ones_like.py#L53) | | -| oneflow.neg | [oneflow.Tensor.negative](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L907) | [negative_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_negative.py#L42) | | -| oneflow.negative | [oneflow.Tensor.negative](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L907) | [negative_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_negative.py#L42) | | +| oneflow.matmul | [oneflow.Tensor.matmul](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L600) | [matmul](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_matmul.py#L42) | [matmul_dimension_error1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L220) | +| oneflow.max | [oneflow.max](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/reduce_ops.py#L20) | [maxpool1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L155) | [argmax_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L22) | +| oneflow.maximum | [oneflow.maximum](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L997) | [maximum_minimum_with_same_input](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_maximum_minimum.py#L93) | | +| oneflow.mean | [oneflow.mean](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/reduce_ops.py#L123) | [mean](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_mean.py#L33) | | +| oneflow.meshgrid | 
[oneflow.meshgrid](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/meshgrid.py#L20) | [meshgrid](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_meshgrid.py#L68) | [meshgrid_tensors_scalar_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L276) | +| oneflow.min | [oneflow.min](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/reduce_ops.py#L56) | [argmin](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argmin.py#L34) | | +| oneflow.minimum | [oneflow.minimum](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L975) | | | +| oneflow.mish | [oneflow.Tensor.mish](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1049) | [mish_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L189) | | +| oneflow.movedim | [oneflow.movedim](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1428) | [movedim](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_movedim.py#L37) | | +| oneflow.mul | [oneflow.Tensor.matmul](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L600) | [mul_with_scalar](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_mul.py#L47) | [matmul_dimension_error1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L220) | +| oneflow.narrow | [oneflow.narrow](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor_ops.py#L20) | [narrow](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_narrow.py#L35) | [narrow_dim_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L178) | +| oneflow.ne | [oneflow.comm.send](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/comm.py#L20) | [generator_manual_seed](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_generator.py#L72) | 
[onehot_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L375) | +| oneflow.neg | [oneflow.Tensor.negative](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1085) | [negative](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_negative.py#L31) | | +| oneflow.negative | [oneflow.Tensor.negative](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1085) | [negative](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_negative.py#L31) | | | oneflow.new_ones | | | | -| oneflow.nms | [oneflow.Tensor.nms](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1461) | [nms](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_nms.py#L91) | | +| oneflow.nms | [oneflow.Tensor.nms](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1674) | [nms](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_nms.py#L50) | | | oneflow.no_grad | | | | -| oneflow.nonzero | [oneflow.nonzero](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/nonzero.py#L20) | [nonzero](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_nozero.py#L31) | | +| oneflow.nonzero | [oneflow.Tensor.nonzero](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1681) | [nonzero](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_nozero.py#L31) | | | oneflow.not_equal | | | | -| oneflow.numel | [oneflow.Tensor.numel](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L193) | | | +| oneflow.numel | [oneflow.Tensor.numel](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L194) | | | | oneflow.one_embedding | | | | -| oneflow.ones | [oneflow.ones_like](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/constant.py#L20) | [ones_like](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_ones_like.py#L53) | | +| oneflow.ones | 
[oneflow.ones_like](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/constant.py#L20) | [ones_like](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_ones_like.py#L53) | | | oneflow.ones_initializer | | | | | oneflow.ones_like | | | | -| oneflow.pad | | [ConstantPad2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_zeropad2d.py#L96) | | -| oneflow.permute | [oneflow.Tensor.permute](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L464) | [permute4d_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_contiguous.py#L69) | | -| oneflow.placement | [oneflow.Tensor.placement](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L94) | | | -| oneflow.pow | [oneflow.Tensor.pow](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L950) | [pow_float_scalar_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L163) | | -| oneflow.prod | [oneflow.Tensor.prod](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1513) | [cumprod](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_cum_ops.py#L37) | | -| oneflow.randint | | [randint_consistent](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_randint.py#L56) | | -| oneflow.randn | | [randn](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_randn.py#L86) | | +| oneflow.pad | | [padding_idx](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_sparse.py#L140) | [pad_size_attribute_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L89) | +| oneflow.permute | [oneflow.permute](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor_ops.py#L82) | [permute2d_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_contiguous.py#L40) | | +| oneflow.placement | [oneflow.Tensor.placement](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L95) | | | +| oneflow.pow | 
[oneflow.Tensor.pow](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1128) | [pow_float_scalar_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L163) | | +| oneflow.prod | [oneflow.prod](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/reduce_ops.py#L154) | [cumprod](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_cum_ops.py#L38) | | +| oneflow.randint | | [randint](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_randint.py#L99) | | +| oneflow.randn | | [randn](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_randn.py#L102) | | | oneflow.random_normal_initializer | | | | | oneflow.random_uniform_initializer | | | | -| oneflow.randperm | | [randperm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_randperm.py#L86) | | -| oneflow.reciprocal | [oneflow.Tensor.reciprocal](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L978) | | | -| oneflow.relu | [oneflow.relu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L50) | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.repeat | [oneflow.Tensor.repeat](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1334) | | | -| oneflow.reshape | [oneflow.Tensor.reshape](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1522) | [reshape](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_reshape.py#L59) | [reshape_exception_only_one_dim_infered](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/exceptions/test_reshape.py#L25) | +| oneflow.randperm | | [randperm](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_randperm.py#L86) | | +| oneflow.reciprocal | [oneflow.Tensor.reciprocal](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1156) | | | +| oneflow.relu | [oneflow.Tensor.relu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1135) | 
[relu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L124) | [relu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L29) | +| oneflow.repeat | [oneflow.Tensor.repeat](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1538) | | [repeat_interleave_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_repeat_interleave.py#L25) | +| oneflow.reshape | [oneflow.Tensor.reshape](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1753) | [reshape](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_reshape.py#L86) | [reshape_exception_only_one_dim_infered](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_reshape.py#L25) | | oneflow.roi_align | | | | -| oneflow.roll | [oneflow.Tensor.roll](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L964) | | | -| oneflow.round | [oneflow.Tensor.round](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L971) | [round_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L724) | | -| oneflow.rsqrt | [oneflow.Tensor.rsqrt](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1064) | [rsqrt_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L136) | | -| oneflow.save | | [save_state_dict](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_module.py#L179) | | -| oneflow.sbp | [oneflow.Tensor.sbp](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L101) | [sbp_symbol](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_sbp_symbol.py#L23) | | -| oneflow.scatter | | [scatter_1n4d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_comm_ops.py#L84) | | +| oneflow.roll | [oneflow.Tensor.roll](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1142) | | 
[roll_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L112) | +| oneflow.round | [oneflow.Tensor.round](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1149) | [round_tensor_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_2.py#L722) | | +| oneflow.rsqrt | [oneflow.Tensor.rsqrt](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1256) | [rsqrt_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L136) | | +| oneflow.save | | [save_state_dict](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_module.py#L179) | | +| oneflow.sbp | [oneflow.Tensor.sbp](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L102) | [sbp_symbol](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_sbp_symbol.py#L23) | | +| oneflow.scatter | | [scatter_nd](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_scatter_nd.py#L56) | | | oneflow.scatter_add | | | | -| oneflow.select | [oneflow.select](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1291) | | | -| oneflow.selu | [oneflow.selu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L396) | [selu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L192) | | +| oneflow.select | [oneflow.select](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1399) | | [ApplySelectIndexing_input_dim_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_tensor_index.py#L37) | +| oneflow.selu | [oneflow.Tensor.selu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1284) | [selu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L199) | | | oneflow.set_num_threads | | | | | oneflow.set_printoptions | | | | | oneflow.set_rng_state | | | | -| oneflow.sigmoid | 
[oneflow.sigmoid](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L325) | [sigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L152) | | -| oneflow.sign | [oneflow.Tensor.sign](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1106) | [sign](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_sign.py#L45) | | -| oneflow.silu | [oneflow.silu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L224) | [silu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L187) | | -| oneflow.sin | [oneflow.Tensor.asin](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1006) | [cosine_decay_lr](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L82) | | +| oneflow.sigmoid | [oneflow.Tensor.sigmoid](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1291) | [sigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L154) | | +| oneflow.sign | [oneflow.Tensor.sign](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1298) | [sign](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_sign.py#L45) | | +| oneflow.silu | [oneflow.Tensor.silu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1305) | [silu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L194) | | +| oneflow.sin | [oneflow.Tensor.asin](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1198) | [cosine_decay_lr](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L82) | [cosine_similarity_not_floating_type](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_cosine_similarity.py#L24) | | oneflow.sin_ | | | | -| oneflow.sinh | 
[oneflow.Tensor.arcsinh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1020) | | | -| oneflow.slice | | [slice](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_slice.py#L133) | | -| oneflow.softmax | [oneflow.Tensor.softmax](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1141) | [softmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L395) | | -| oneflow.softplus | [oneflow.softplus](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L133) | [softplus](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L502) | | -| oneflow.softshrink | | [softshrink_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L207) | | -| oneflow.softsign | [oneflow.Tensor.softsign](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1155) | [softsign_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L685) | | -| oneflow.sort | [oneflow.sort](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/sort.py#L20) | [argsort](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argsort.py#L36) | | -| oneflow.split | [oneflow.Tensor.split](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L709) | | | -| oneflow.sqrt | [oneflow.Tensor.sqrt](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L363) | [sqrt_sum_with_cpu_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_sqrt_square_sum.py#L48) | | -| oneflow.square | [oneflow.Tensor.square](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L370) | [square_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L146) | | -| oneflow.squeeze | [oneflow.squeeze](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L303) | 
[squeeze_1d_input](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_squeeze.py#L51) | | -| oneflow.stack | [oneflow.stack](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L272) | [stack_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_stack.py#L28) | | +| oneflow.sinh | [oneflow.Tensor.arcsinh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1212) | | | +| oneflow.slice | | [slice](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_slice.py#L151) | [PrepareSliceIndices_indices_amount_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_tensor_index.py#L22) | +| oneflow.slice_update | | | | +| oneflow.softmax | [oneflow.Tensor.softmax](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1333) | [softmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L415) | [softmax_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L109) | +| oneflow.softplus | [oneflow.Tensor.softplus](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1340) | [softplus_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L209) | | +| oneflow.softshrink | | [softshrink_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L214) | | +| oneflow.softsign | [oneflow.Tensor.softsign](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1347) | [softsign_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L710) | | +| oneflow.sort | [oneflow.Tensor.argsort](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L684) | [argsort](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_argsort.py#L37) | | +| oneflow.split | [oneflow.Tensor.split](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L866) | | 
[split_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L224) | +| oneflow.sqrt | [oneflow.Tensor.sqrt](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L520) | [sqrt_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L109) | | +| oneflow.square | [oneflow.Tensor.square](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L527) | [square_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L146) | | +| oneflow.squeeze | [oneflow.unsqueeze](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor_ops.py#L50) | [unsqueeze](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_unsqueeze.py#L68) | [squeeze_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L106) | +| oneflow.stack | [oneflow.stack](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L272) | [stack_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_stack.py#L28) | [stack_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L62) | | oneflow.stateful_op | | | | -| oneflow.std | [oneflow.Tensor.std](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L377) | [std_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_std.py#L26) | | -| oneflow.sub | [oneflow.Tensor.sub_](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L900) | [sub](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_sub.py#L96) | | -| oneflow.sum | [oneflow.einsum](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/einsum.py#L20) | [einsum_bilinear_transformation](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_einsum_bilinear_transformation.py#L42) | | +| oneflow.std | 
[oneflow.Tensor.std](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L534) | [std_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_std.py#L26) | | +| oneflow.sub | [oneflow.Tensor.sub_](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1078) | [sub](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_sub.py#L96) | | +| oneflow.sum | [oneflow.sum](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/reduce_ops.py#L92) | [einsum_alphaflod_usecase11](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_einsum_alphaflod_usecase11.py#L38) | | | oneflow.support | | | | -| oneflow.swapaxes | [oneflow.swapaxes](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/swapaxes.py#L20) | [swapaxes_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_swapaxes.py#L32) | | -| oneflow.t | [oneflow.nn.functional.layer_norm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/normalization.py#L20) | [cast](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_flatten.py#L63) | | -| oneflow.tan | [oneflow.tanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L150) | [ConstantPad2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_zeropad2d.py#L96) | | -| oneflow.tanh | [oneflow.tanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L150) | [tanh_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L132) | | +| oneflow.swapaxes | [oneflow.Tensor.swapaxes](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L880) | [swapaxes_flow_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_swapaxes.py#L31) | | +| oneflow.t | [oneflow.permute](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor_ops.py#L82) | 
[greter_equal](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_greater_equal.py#L88) | [repeat_interleave_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_repeat_interleave.py#L25) | +| oneflow.tan | [oneflow.atan2](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/trigonometric_ops.py#L21) | [constant_warmup_cosine_annealing](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L446) | | +| oneflow.tanh | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L663) | [tanh_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L134) | | | oneflow.tensor_buffer | | | | | oneflow.tensor_buffer_to_list_of_tensors | | | | | oneflow.tensor_buffer_to_tensor | | | | | oneflow.tensor_scatter_nd_update | | | | | oneflow.tensor_split | | | | | oneflow.tensor_to_tensor_buffer | | | | -| oneflow.tile | [oneflow.tile](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tile.py#L20) | | | +| oneflow.tile | [oneflow.tile](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tile.py#L20) | | [tile_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L431) | | oneflow.to_global | | | | | oneflow.to_local | | | | -| oneflow.topk | [oneflow.topk](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/topk.py#L20) | | | -| oneflow.transpose | [oneflow.transpose](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L245) | [transpose](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_transpose.py#L86) | | -| oneflow.tril | [oneflow.tril](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L84) | [tril_without_diag](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_tril.py#L26) | | -| oneflow.triu | [oneflow.triu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L114) | [triu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_triu.py#L47) | | +| oneflow.topk | 
[oneflow.Tensor.topk](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1667) | | | +| oneflow.transpose | [oneflow.transpose](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L245) | [transpose_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_contiguous.py#L32) | | +| oneflow.tril | [oneflow.tril](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L84) | [tril_without_diag](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_tril.py#L26) | | +| oneflow.triu | [oneflow.triu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L114) | [triu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_triu.py#L47) | | | oneflow.truncated_normal_initializer | | | | | oneflow.uint8 | | | | -| oneflow.unsqueeze | [oneflow.Tensor.unsqueeze](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L457) | [unsqueeze_with_0_size_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_unsqueeze.py#L88) | | -| oneflow.var | [oneflow.Tensor.var](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L384) | | | +| oneflow.unsqueeze | [oneflow.unsqueeze](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor_ops.py#L50) | [unsqueeze](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_unsqueeze.py#L68) | | +| oneflow.var | [oneflow.Tensor.var](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L541) | | | | oneflow.variance_scaling_initializer | | | | | oneflow.version | | | | -| oneflow.view | [oneflow.Tensor.view](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1529) | [view](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_view.py#L78) | | -| oneflow.vsplit | [oneflow.vsplit](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1502) | | | -| oneflow.where | [oneflow.Tensor.argwhere](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L534) | 
[argwhere](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L625) | | +| oneflow.view | [oneflow.Tensor.view](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1776) | [view](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_view.py#L79) | [view_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_array_functor.py#L166) | +| oneflow.vsplit | [oneflow.vsplit](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/math_ops.py#L1649) | | | +| oneflow.where | [oneflow.Tensor.argwhere](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L691) | [where](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_where.py#L196) | | | oneflow.xavier_normal_initializer | | | | | oneflow.xavier_uniform_initializer | | | | | oneflow.zero_ | | | | -| oneflow.zeros | [oneflow.zeros_like](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/constant.py#L43) | [zeros_](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L908) | | +| oneflow.zeros | [oneflow.zeros_like](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/constant.py#L43) | [zeros_](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/tensor/test_tensor_part_1.py#L944) | | | oneflow.zeros_initializer | | | | | oneflow.zeros_like | | | | -| oneflow.optim.Adagrad | | [adagrad](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adagrad.py#L197) | | -| oneflow.optim.Adam | | [adam](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adam.py#L241) | | -| oneflow.optim.AdamW | | [adamw](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adamw.py#L244) | | -| oneflow.optim.LAMB | | [lambda_lr](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L199) | | -| oneflow.optim.RMSprop | | [rmsprop](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_rmsprop.py#L228) | | -| oneflow.optim.SGD | | [sgd](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_sgd.py#L194) | | +| 
oneflow.optim.Adagrad | | [adagrad](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adagrad.py#L197) | | +| oneflow.optim.Adam | | [adamw](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adamw.py#L244) | | +| oneflow.optim.AdamW | | [adamw](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adamw.py#L244) | | +| oneflow.optim.LAMB | | [lambda_lr](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L199) | | +| oneflow.optim.RMSprop | | [rmsprop](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_rmsprop.py#L228) | | +| oneflow.optim.SGD | | [sgd](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_sgd.py#L194) | | | oneflow.optim.lr_scheduler.ChainedScheduler | | | | | oneflow.optim.lr_scheduler.ConstantLR | | | | | oneflow.optim.lr_scheduler.CosineAnnealingLR | | | | @@ -271,96 +270,96 @@ | oneflow.nn.AdaptiveAvgPool2d | | | | | oneflow.nn.AdaptiveAvgPool3d | | | | | oneflow.nn.AllReduce | | | | -| oneflow.nn.AvgPool1d | | [avgpool1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_avgpool.py#L28) | | -| oneflow.nn.AvgPool2d | | [avgpool2d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_avgpool.py#L44) | | -| oneflow.nn.AvgPool3d | | [avgpool3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_avgpool.py#L61) | | +| oneflow.nn.AvgPool1d | | [avgpool1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_avgpool.py#L28) | | +| oneflow.nn.AvgPool2d | | [avgpool2d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_avgpool.py#L44) | | +| oneflow.nn.AvgPool3d | | [avgpool3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_avgpool.py#L61) | | | oneflow.nn.BCELoss | | | | | oneflow.nn.BCEWithLogitsLoss | | | | -| oneflow.nn.BatchNorm1d | | [batchnorm1d_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_batchnorm.py#L32) | | -| oneflow.nn.BatchNorm2d | | [batchnorm2d_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_batchnorm.py#L48) | | -| oneflow.nn.BatchNorm3d | | 
[batchnorm3d_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_batchnorm.py#L64) | | -| oneflow.nn.CELU | | [celu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L142) | | +| oneflow.nn.BatchNorm1d | | [batchnorm1d_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_batchnorm.py#L34) | | +| oneflow.nn.BatchNorm2d | | [batchnorm2d_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_batchnorm.py#L52) | | +| oneflow.nn.BatchNorm3d | | [batchnorm3d_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_batchnorm.py#L70) | | +| oneflow.nn.CELU | | [celu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L144) | [celu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L47) | | oneflow.nn.COCOReader | | | | -| oneflow.nn.CTCLoss | | | | +| oneflow.nn.CTCLoss | | | [ctcloss_reduction_type_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L62) | | oneflow.nn.CoinFlip | | | | | oneflow.nn.CombinedMarginLoss | | | | -| oneflow.nn.ConstantPad1d | | [constantpad1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_constantpad.py#L32) | | -| oneflow.nn.ConstantPad2d | | [ConstantPad2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_zeropad2d.py#L96) | | -| oneflow.nn.ConstantPad3d | | [constantpad3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_constantpad.py#L64) | | -| oneflow.nn.Conv1d | | [conv1d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_conv1d.py#L422) | | -| oneflow.nn.Conv2d | | [deconv2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_deconv2d.py#L68) | | -| oneflow.nn.Conv3d | | [conv3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_conv3d.py#L26) | | +| oneflow.nn.ConstantPad1d | | [constantpad1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_constantpad.py#L32) | | +| oneflow.nn.ConstantPad2d | | 
[ConstantPad2d](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_zeropad2d.py#L96) | | +| oneflow.nn.ConstantPad3d | | [constantpad3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_constantpad.py#L64) | | +| oneflow.nn.Conv1d | | [conv1d](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_conv1d.py#L422) | | +| oneflow.nn.Conv2d | | [conv2d_default_init](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_conv2d.py#L1568) | | +| oneflow.nn.Conv3d | | | | | oneflow.nn.ConvTranspose1d | | | | | oneflow.nn.ConvTranspose2d | | | | | oneflow.nn.ConvTranspose3d | | | | | oneflow.nn.CropMirrorNormalize | | | | | oneflow.nn.CrossEntropyLoss | | | | | oneflow.nn.DistributedPariticalFCSample | | | | -| oneflow.nn.Dropout | | [dropout_numpy_case](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_dropout.py#L239) | | -| oneflow.nn.ELU | [oneflow.relu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L50) | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.nn.Embedding | | [embedding](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_sparse.py#L152) | | +| oneflow.nn.Dropout | | [dropout_p01](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_dropout.py#L44) | | +| oneflow.nn.ELU | [oneflow.Tensor.gelu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1017) | [relu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L124) | [relu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L29) | +| oneflow.nn.Embedding | | [embedding](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_sparse.py#L45) | | | oneflow.nn.FakeQuantization | | | | -| oneflow.nn.Flatten | [oneflow.Tensor.flatten](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L154) | [flatten](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_flatten.py#L38) | | -| oneflow.nn.Fold | 
[oneflow.Tensor.unfold](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L398) | [fold](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_fold.py#L45) | | +| oneflow.nn.Flatten | [oneflow.flatten](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/flatten.py#L20) | [flatten_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_flatten.py#L71) | | +| oneflow.nn.Fold | [oneflow.Tensor.unfold](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L555) | [fold_with_random_data_1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_fold.py#L28) | | | oneflow.nn.FusedBatchNorm1d | | | | | oneflow.nn.FusedBatchNorm2d | | | | | oneflow.nn.FusedBatchNorm3d | | | | | oneflow.nn.FusedMLP | | | | -| oneflow.nn.GELU | [oneflow.gelu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L74) | [gelu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L147) | | -| oneflow.nn.GLU | | [glu_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_glu.py#L37) | | +| oneflow.nn.GELU | [oneflow.Tensor.gelu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1017) | [gelu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L149) | | +| oneflow.nn.GLU | | [glu_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_glu.py#L37) | [glu_scalar_tensor_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L57) | | oneflow.nn.GPTIndexedBinDataReader | | | | -| oneflow.nn.GRU | | | | -| oneflow.nn.GroupNorm | | [groupnorm](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_groupnorm.py#L332) | | -| oneflow.nn.Hardsigmoid | | [hardsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L157) | | -| oneflow.nn.Hardswish | | [hardswish_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L167) | | -| oneflow.nn.Hardtanh | | 
[hardtanh_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L172) | | -| oneflow.nn.Identity | | [identity_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_linear.py#L217) | | +| oneflow.nn.GRU | | [gru_cell](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_rnn_cell.py#L218) | | +| oneflow.nn.GroupNorm | | [groupnorm](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_groupnorm.py#L332) | | +| oneflow.nn.Hardsigmoid | | [hardsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L159) | | +| oneflow.nn.Hardswish | | [hardswish_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L174) | | +| oneflow.nn.Hardtanh | | [hardtanh_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L179) | | +| oneflow.nn.Identity | | [identity_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_linear.py#L217) | | | oneflow.nn.InstanceNorm1d | | | | | oneflow.nn.InstanceNorm2d | | | | | oneflow.nn.InstanceNorm3d | | | | | oneflow.nn.KLDivLoss | | | | | oneflow.nn.L1Loss | | | | -| oneflow.nn.LSTM | | | | -| oneflow.nn.LayerNorm | | | | -| oneflow.nn.LeakyReLU | | [leakyrelu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L177) | | -| oneflow.nn.Linear | | [linear_warmup_exp_lr](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L376) | | -| oneflow.nn.LogSigmoid | | [logsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L162) | | -| oneflow.nn.LogSoftmax | | [logsoftmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L414) | | +| oneflow.nn.LSTM | | [lstm_cell](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_rnn_cell.py#L200) | | +| oneflow.nn.LayerNorm | | | [layernorm_exception_input_shape_not_match](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_layernorm.py#L25) | +| oneflow.nn.LeakyReLU | | 
[leakyrelu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L184) | | +| oneflow.nn.Linear | | [linear_forward](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_linear.py#L163) | | +| oneflow.nn.LogSigmoid | | [logsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L169) | | +| oneflow.nn.LogSoftmax | | [logsoftmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L439) | | | oneflow.nn.MSELoss | | | | | oneflow.nn.MarginRankingLoss | | | | -| oneflow.nn.MaxPool1d | | [maxpool1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L155) | | -| oneflow.nn.MaxPool2d | | [maxpool2d_channel_last](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L135) | | -| oneflow.nn.MaxPool3d | | [maxpool3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L199) | | +| oneflow.nn.MaxPool1d | | [maxpool1d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L155) | | +| oneflow.nn.MaxPool2d | | [maxpool2d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L177) | | +| oneflow.nn.MaxPool3d | | [maxpool3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_maxpool.py#L199) | | | oneflow.nn.MinMaxObserver | | | | -| oneflow.nn.Mish | [oneflow.mish](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L254) | [mish_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L182) | | -| oneflow.nn.Module | [oneflow.nn.Module.to_consistent](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/module.py#L20) | [module_to_global](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_module_to_consistent.py#L30) | | -| oneflow.nn.ModuleDict | | [moduledict](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_module.py#L303) | | +| oneflow.nn.Mish | 
[oneflow.Tensor.mish](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1049) | [mish_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L189) | | +| oneflow.nn.Module | [oneflow.nn.Module.to_consistent](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/module.py#L20) | [module_to_global](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_module_to_consistent.py#L30) | | +| oneflow.nn.ModuleDict | | [moduledict](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_module.py#L310) | | | oneflow.nn.ModuleList | | | | | oneflow.nn.MovingAverageMinMaxObserver | | | | | oneflow.nn.NLLLoss | | | | -| oneflow.nn.PReLU | | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.nn.Parameter | | [parameter](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_module.py#L98) | | +| oneflow.nn.PReLU | | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | [prelu_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L38) | +| oneflow.nn.Parameter | | [parameter](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_module.py#L98) | | | oneflow.nn.ParameterDict | | | | | oneflow.nn.ParameterList | | | | | oneflow.nn.PixelShuffle | | | | | oneflow.nn.Quantization | | | | -| oneflow.nn.RNN | | | | -| oneflow.nn.ReLU | [oneflow.relu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L50) | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.nn.ReLU6 | | [relu6_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L127) | | +| oneflow.nn.RNN | | [rnn_relu_cell](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_rnn_cell.py#L206) | | +| oneflow.nn.ReLU | [oneflow.Tensor.relu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1135) | 
[relu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L124) | [relu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L29) | +| oneflow.nn.ReLU6 | | [relu6_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L129) | | | oneflow.nn.ReflectionPad2d | | | | -| oneflow.nn.ReplicationPad2d | | [ReplicationPad2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_replicationpad2d.py#L104) | | -| oneflow.nn.SELU | [oneflow.selu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L396) | [selu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L192) | | +| oneflow.nn.ReplicationPad2d | | [ReplicationPad2d](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_replicationpad2d.py#L104) | | +| oneflow.nn.SELU | [oneflow.Tensor.selu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1284) | [selu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L199) | | | oneflow.nn.Sequential | | | | -| oneflow.nn.SiLU | [oneflow.silu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L224) | [silu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L187) | | -| oneflow.nn.Sigmoid | [oneflow.sigmoid](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L325) | [sigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L152) | | +| oneflow.nn.SiLU | [oneflow.Tensor.silu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1305) | [silu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L194) | | +| oneflow.nn.Sigmoid | [oneflow.Tensor.sigmoid](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1291) | 
[sigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L154) | | | oneflow.nn.SmoothL1Loss | | | | -| oneflow.nn.Softmax | [oneflow.Tensor.softmax](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1141) | [softmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L395) | | -| oneflow.nn.Softplus | [oneflow.softplus](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L133) | [softplus](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L502) | | -| oneflow.nn.Softshrink | | [softshrink_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L207) | | -| oneflow.nn.Softsign | [oneflow.Tensor.softsign](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1155) | [softsign_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L685) | | -| oneflow.nn.Tanh | [oneflow.tanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L150) | [tanh_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L132) | | +| oneflow.nn.Softmax | [oneflow.Tensor.softmax](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1333) | [softmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L415) | [softmax_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L109) | +| oneflow.nn.Softplus | [oneflow.Tensor.softplus](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1340) | [softplus_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L209) | | +| oneflow.nn.Softshrink | | [softshrink_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L214) | | +| oneflow.nn.Softsign | 
[oneflow.Tensor.softsign](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1347) | [softsign_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L710) | | +| oneflow.nn.Tanh | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L663) | [tanh_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L134) | | | oneflow.nn.TripletMarginLoss | | | | -| oneflow.nn.Unfold | [oneflow.Tensor.unfold](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L398) | [unfold_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_unfold.py#L42) | | +| oneflow.nn.Unfold | [oneflow.Tensor.unfold](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L555) | [unfold_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_unfold.py#L28) | | | oneflow.nn.UpsamplingBilinear2d | | | | | oneflow.nn.UpsamplingNearest2d | | | | | oneflow.nn.ZeroPad2d | | | | @@ -371,87 +370,87 @@ | oneflow.nn.functional.avg_pool1d | | | | | oneflow.nn.functional.avg_pool2d | | | | | oneflow.nn.functional.avg_pool3d | | | | -| oneflow.nn.functional.celu | | [celu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L142) | | -| oneflow.nn.functional.conv1d | | [conv1d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_conv1d.py#L422) | | -| oneflow.nn.functional.conv2d | | [deconv2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_deconv2d.py#L68) | | -| oneflow.nn.functional.conv3d | | [conv3d_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_conv3d.py#L26) | | +| oneflow.nn.functional.celu | | [celu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L144) | [celu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L47) | +| oneflow.nn.functional.conv1d | | [conv1d](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_conv1d.py#L422) | | +| oneflow.nn.functional.conv2d | | 
[conv2d_default_init](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_conv2d.py#L1568) | | +| oneflow.nn.functional.conv3d | | | | | oneflow.nn.functional.cross_entropy | | | | | oneflow.nn.functional.ctc_greedy_decoder | | | | -| oneflow.nn.functional.dropout | | [dropout_numpy_case](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_dropout.py#L239) | | -| oneflow.nn.functional.elu | [oneflow.relu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L50) | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.nn.functional.embedding | | [embedding](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_sparse.py#L152) | | +| oneflow.nn.functional.dropout | | [dropout_p01](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_dropout.py#L44) | | +| oneflow.nn.functional.elu | [oneflow.Tensor.gelu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1017) | [relu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L124) | [relu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L29) | +| oneflow.nn.functional.embedding | | [embedding](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_sparse.py#L45) | | | oneflow.nn.functional.functional_maxpool | | | | -| oneflow.nn.functional.gelu | [oneflow.gelu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L74) | [gelu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L147) | | -| oneflow.nn.functional.glu | | [glu_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_glu.py#L37) | | +| oneflow.nn.functional.gelu | [oneflow.Tensor.gelu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1017) | [gelu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L149) | | +| oneflow.nn.functional.glu | | 
[glu_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_glu.py#L37) | [glu_scalar_tensor_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L57) | | oneflow.nn.functional.grid_sample | | | | -| oneflow.nn.functional.hardsigmoid | | [hardsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L157) | | -| oneflow.nn.functional.hardswish | | [hardswish_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L167) | | -| oneflow.nn.functional.hardtanh | | [hardtanh_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L172) | | -| oneflow.nn.functional.interpolate | | [interpolate](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_interpolate.py#L658) | | +| oneflow.nn.functional.hardsigmoid | | [hardsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L159) | | +| oneflow.nn.functional.hardswish | | [hardswish_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L174) | | +| oneflow.nn.functional.hardtanh | | [hardtanh_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L179) | | +| oneflow.nn.functional.interpolate | | [interpolate](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_interpolate.py#L658) | | | oneflow.nn.functional.layer_norm | | | | | oneflow.nn.functional.leaky_relu | | | | -| oneflow.nn.functional.linear | | [linear_warmup_exp_lr](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L376) | | +| oneflow.nn.functional.linear | | [linear_forward](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_linear.py#L163) | | | oneflow.nn.functional.log_softmax | | | | -| oneflow.nn.functional.logsigmoid | | [logsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L162) | | +| oneflow.nn.functional.logsigmoid | | [logsigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L169) | | | oneflow.nn.functional.max_pool1d | | | | | oneflow.nn.functional.max_pool2d | | | | | oneflow.nn.functional.max_pool3d | | | | -| 
oneflow.nn.functional.mish | [oneflow.mish](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L254) | [mish_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L182) | | -| oneflow.nn.functional.normalize | | [normalize_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_normalize.py#L36) | | +| oneflow.nn.functional.mish | [oneflow.Tensor.mish](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1049) | [mish_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L189) | | +| oneflow.nn.functional.normalize | | [normalize_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_normalize.py#L36) | [l2normalize_axis_error1](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L192) | | oneflow.nn.functional.one_hot | | | | -| oneflow.nn.functional.pad | | [ConstantPad2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_zeropad2d.py#L96) | | -| oneflow.nn.functional.prelu | | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.nn.functional.relu | [oneflow.relu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L50) | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | | -| oneflow.nn.functional.relu6 | | [relu6_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L127) | | -| oneflow.nn.functional.selu | [oneflow.selu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L396) | [selu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L192) | | -| oneflow.nn.functional.sigmoid | [oneflow.sigmoid](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L325) | [sigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L152) | | -| oneflow.nn.functional.silu | 
[oneflow.silu](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L224) | [silu_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L187) | | +| oneflow.nn.functional.pad | | [padding_idx](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_sparse.py#L140) | [pad_size_attribute_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L89) | +| oneflow.nn.functional.prelu | | [prelu_4dim_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_prelu.py#L32) | [prelu_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L38) | +| oneflow.nn.functional.relu | [oneflow.Tensor.relu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1135) | [relu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L124) | [relu_inplace_runtime_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L29) | +| oneflow.nn.functional.relu6 | | [relu6_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L129) | | +| oneflow.nn.functional.selu | [oneflow.Tensor.selu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1284) | [selu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L199) | | +| oneflow.nn.functional.sigmoid | [oneflow.Tensor.sigmoid](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1291) | [sigmoid_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L154) | | +| oneflow.nn.functional.silu | [oneflow.Tensor.silu](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1305) | [silu_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L194) | | | oneflow.nn.functional.smooth_l1_loss | | | | -| oneflow.nn.functional.softmax | 
[oneflow.Tensor.softmax](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1141) | [softmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L395) | | -| oneflow.nn.functional.softplus | [oneflow.softplus](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L133) | [softplus](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L502) | | -| oneflow.nn.functional.softshrink | | [softshrink_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L207) | | -| oneflow.nn.functional.softsign | [oneflow.Tensor.softsign](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1155) | [softsign_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L685) | | +| oneflow.nn.functional.softmax | [oneflow.Tensor.softmax](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1333) | [softmax_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L415) | [softmax_index_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_activation.py#L109) | +| oneflow.nn.functional.softplus | [oneflow.Tensor.softplus](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1340) | [softplus_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L209) | | +| oneflow.nn.functional.softshrink | | [softshrink_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L214) | | +| oneflow.nn.functional.softsign | [oneflow.Tensor.softsign](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L1347) | [softsign_module_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_activation.py#L710) | | | oneflow.nn.functional.sparse_softmax_cross_entropy | | | | -| oneflow.nn.functional.tanh | [oneflow.tanh](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/activation.py#L150) | 
[tanh_module](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L132) | | +| oneflow.nn.functional.tanh | [oneflow.Tensor.arctanh](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/tensor.py#L663) | [tanh_module](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_consistent_activation.py#L134) | | | oneflow.nn.functional.triplet_margin_loss | | | | -| oneflow.nn.functional.upsample | | [upsample2d](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_upsample.py#L380) | | +| oneflow.nn.functional.upsample | | [upsample2d](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_upsample.py#L357) | | | oneflow.nn.init.CalcGain | | | | | oneflow.nn.init.calculate_gain | | | | | oneflow.nn.init.constant_ | | | | -| oneflow.nn.init.flow | [oneflow.decode_onerec](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/dataset.py#L20) | [flow_erf_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_erf.py#L33) | | +| oneflow.nn.init.flow | [oneflow.comm.send](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/comm.py#L20) | [flow_erf_with_random_data](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_erf.py#L33) | | | oneflow.nn.init.kaiming_normal_ | | | | | oneflow.nn.init.kaiming_uniform_ | | | | | oneflow.nn.init.normal_ | | | | | oneflow.nn.init.ones_ | | | | -| oneflow.nn.init.os | [oneflow.transpose](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L245) | [cos](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L88) | | +| oneflow.nn.init.os | [oneflow.transpose](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/framework/docstr/array_ops.py#L245) | [cos](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_math_ops.py#L88) | [cross_entropy_reduction_type_error](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/exceptions/test_nn_functor.py#L50) | | oneflow.nn.init.trunc_normal_ | | | | | oneflow.nn.init.uniform_ | | | | | oneflow.nn.init.xavier_normal_ | | | | | oneflow.nn.init.xavier_uniform_ | | | | | oneflow.nn.init.zeros_ | | | | -| oneflow.nn.init.adagrad | | [adagrad](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adagrad.py#L197) | | 
-| oneflow.nn.init.adam | | [adam](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adam.py#L241) | | -| oneflow.nn.init.adamw | | [adamw](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adamw.py#L244) | | +| oneflow.nn.init.adagrad | | [adagrad](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adagrad.py#L197) | | +| oneflow.nn.init.adam | | [adamw](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adamw.py#L244) | | +| oneflow.nn.init.adamw | | [adamw](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_adamw.py#L244) | | | oneflow.nn.init.chained_scheduler | | | | | oneflow.nn.init.constant_lr | | | | | oneflow.nn.init.cosine_annealing_lr | | | | | oneflow.nn.init.cosine_annealing_warm_restarts | | | | | oneflow.nn.init.cosine_decay_lr | | | | | oneflow.nn.init.exponential_lr | | | | -| oneflow.nn.init.lamb | | [lambda_lr](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L199) | | +| oneflow.nn.init.lamb | | [lambda_lr](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_lr_scheduler.py#L199) | | | oneflow.nn.init.lambda_lr | | | | | oneflow.nn.init.linear_lr | | | | | oneflow.nn.init.lr_scheduler | | | | | oneflow.nn.init.multistep_lr | | | | | oneflow.nn.init.polynomial_lr | | | | | oneflow.nn.init.reduce_lr_on_plateau | | | | -| oneflow.nn.init.rmsprop | | [rmsprop](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_rmsprop.py#L228) | | +| oneflow.nn.init.rmsprop | | [rmsprop](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_rmsprop.py#L228) | | | oneflow.nn.init.sequential_lr | | | | -| oneflow.nn.init.sgd | | [sgd](https://github.com/Oneflow-Inc/oneflow/blob/8e2da64b33b59cc907195de423dc7fa632c1fee6/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_sgd.py#L194) | | +| oneflow.nn.init.sgd | | [sgd](https://github.com/Oneflow-Inc/oneflow/blob/64503e09ab90bd7e47e0682df217996daeec220d/python/oneflow/test/../../../python/oneflow/test/modules/test_optim_sgd.py#L194) | | | oneflow.nn.init.step_lr | | | | | oneflow.nn.init.warmup_lr | | | | ## Test Data Summary -- OneFlow Total API Number: ====================>448 -- Doc Test Ratio: ====================>35.71% = 160 / 448 -- Compatiable/Completeness Test Ratio: ====================>48.21% = 216 / 448 -- Exception Test Ratio: ====================>0.22% = 1 / 448 +- OneFlow Total API Number: ====================>446 +- Doc Test Ratio: ====================>36.32% = 162 / 446 +- Compatiable/Completeness Test Ratio: ====================>49.33% = 220 / 446 +- Exception Test Ratio: ====================>13.23% = 59 / 446 diff --git a/python/oneflow/test/exceptions/test_device.py 
b/python/oneflow/test/exceptions/test_device.py index 4aac53368a0..4a1453c3448 100644 --- a/python/oneflow/test/exceptions/test_device.py +++ b/python/oneflow/test/exceptions/test_device.py @@ -39,10 +39,7 @@ def test_device_index(test_case): # device = flow.device("cuda:1000") # flow.Tensor(2, 3).to(device=device) # test_case.assertTrue("CUDA error: invalid device ordinal" in str(exp.exception)) - - with test_case.assertRaises(RuntimeError) as exp: - device = flow.device("cpu:1000") - flow.Tensor(2, 3).to(device=device) + pass if __name__ == "__main__": diff --git a/python/oneflow/test/exceptions/test_local_global_convert_error.py b/python/oneflow/test/exceptions/test_local_global_convert_error.py index 8ebb5c63e6e..eac0acba4c7 100644 --- a/python/oneflow/test/exceptions/test_local_global_convert_error.py +++ b/python/oneflow/test/exceptions/test_local_global_convert_error.py @@ -64,7 +64,7 @@ def test_global_to_global_with_invalid_split_axis(test_case): @flow.unittest.skip_unless_1n1d() def test_call_to_local_for_local_tensor(test_case): x = flow.tensor([1, 2, 3, 4]) - with test_case.assertRaises(AssertionError) as ctx: + with test_case.assertRaises(RuntimeError) as ctx: y = x.to_local() test_case.assertTrue( "Expected global tensor for to_local but got local tensor!" diff --git a/python/oneflow/test/exceptions/test_mv.py b/python/oneflow/test/exceptions/test_mv.py new file mode 100644 index 00000000000..224d2d1e897 --- /dev/null +++ b/python/oneflow/test/exceptions/test_mv.py @@ -0,0 +1,50 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import unittest +import oneflow as flow +import oneflow.unittest + + +@flow.unittest.skip_unless_1n1d() +class TestMv(flow.unittest.TestCase): + def test_mv_not_matrix(test_case): + with test_case.assertRaises(Exception) as exp: + mat = flow.randn(2, 3, 3) + vec = flow.randn(3) + out = flow.mv(mat, vec) + test_case.assertTrue( + "vector + matrix @ vector expected, got 1, 3, 1" in str(exp.exception) + ) + + def test_mv_not_vector(test_case): + with test_case.assertRaises(Exception) as exp: + mat = flow.randn(2, 3) + vec = flow.randn(3, 1) + out = flow.mv(mat, vec) + test_case.assertTrue( + "vector + matrix @ vector expected, got 1, 2, 2" in str(exp.exception) + ) + + def test_mv_size_mismatch(test_case): + with test_case.assertRaises(Exception) as exp: + mat = flow.randn(2, 3) + vec = flow.randn(4) + out = flow.mv(mat, vec) + test_case.assertTrue("size mismatch" in str(exp.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/expensive/test_id_shuffle.py b/python/oneflow/test/expensive/test_id_shuffle.py index 301f186ee1d..bd6b3f3c891 100644 --- a/python/oneflow/test/expensive/test_id_shuffle.py +++ b/python/oneflow/test/expensive/test_id_shuffle.py @@ -351,7 +351,7 @@ def test_id_shuffle(test_case): for kwargs in GenArgDict(arg_dict): _test_id_shuffle(test_case, **kwargs) - def test_embedding_shuffle(test_case): + def _test_embedding_shuffle(test_case): arg_dict = OrderedDict() arg_dict["dtype"] = [flow.float32, flow.float16] arg_dict["enable_quantize"] = [True, False] @@ -359,7 +359,7 @@ def test_embedding_shuffle(test_case): for kwargs in GenArgDict(arg_dict): _test_embedding_shuffle(test_case, **kwargs) - def test_embedding_gradient_shuffle(test_case): + def _test_embedding_gradient_shuffle(test_case): arg_dict = OrderedDict() arg_dict["enable_quantize"] = [True, False] arg_dict["fp16"] = [True, False] @@ -367,7 +367,7 @@ def test_embedding_gradient_shuffle(test_case): for kwargs in GenArgDict(arg_dict): _test_embedding_gradient_shuffle(test_case, **kwargs) - def test_unique_key_value(test_case): + def _test_unique_key_value(test_case): arg_dict = OrderedDict() arg_dict["has_table_id"] = [True, False] arg_dict["num_tables"] = [13, 26, 1] diff --git a/python/oneflow/test/expensive/test_tensor_str.py b/python/oneflow/test/expensive/test_tensor_str.py index d41918330c1..2417de9d889 100644 --- a/python/oneflow/test/expensive/test_tensor_str.py +++ b/python/oneflow/test/expensive/test_tensor_str.py @@ -160,6 +160,15 @@ def _test_global_tensor_str_2d(test_case, device): test_case.assertTrue("1." 
in tensor_str) +def _test_nd_sbp_tensor_str(test_case, device, sbp0, sbp1): + placement = flow.placement(type=device, ranks=[[0, 1], [2, 3]]) + sbp = [sbp0, sbp1] + x = flow.ones((20, 20), placement=placement, sbp=sbp) + tensor_str = str(x) + test_case.assertTrue(str(sbp0) in tensor_str) + test_case.assertTrue(str(sbp1) in tensor_str) + + class TestTensorStrModule(flow.unittest.TestCase): @flow.unittest.skip_unless_1n1d() @unittest.skip("TODO: fengwei, this often fails") @@ -195,6 +204,27 @@ def test_tensor_str_1n2d(test_case): for arg in GenArgList(arg_dict): arg[0](test_case, *arg[1:]) + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + @flow.unittest.skip_unless_1n4d() + def test_nd_sbp_tensor_str(test_case): + arg_dict = OrderedDict() + arg_dict["test_fun"] = [ + _test_nd_sbp_tensor_str, + ] + arg_dict["device"] = ["cpu", "cuda"] + + sbp_arg_dict = OrderedDict() + sbp_list = [ + flow.sbp.broadcast, + flow.sbp.split(0), + flow.sbp.partial_sum, + ] + sbp_arg_dict["sbp0"] = sbp_list + sbp_arg_dict["sbp1"] = sbp_list + for arg in GenArgList(arg_dict): + for sbp in GenArgList(sbp_arg_dict): + arg[0](test_case, *(arg[1:] + sbp[:])) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/gen_ops_process.py b/python/oneflow/test/gen_ops_process.py index a9c6def4628..6c6930f0bfc 100644 --- a/python/oneflow/test/gen_ops_process.py +++ b/python/oneflow/test/gen_ops_process.py @@ -152,8 +152,6 @@ "logical_and", "logical_not", "logical_or", - "logical_slice", - "logical_slice_assign", "logical_xor", "long", "lt", diff --git a/python/oneflow/test/graph/test_comb2d.py b/python/oneflow/test/graph/test_comb2d.py index aac2a5e12a5..7b746017bdb 100644 --- a/python/oneflow/test/graph/test_comb2d.py +++ b/python/oneflow/test/graph/test_comb2d.py @@ -24,7 +24,7 @@ import oneflow.unittest -class TestModule(nn.Module): +class _TestModule(nn.Module): def forward(self, x): sbp_1ds = [ flow.sbp.broadcast, @@ -62,7 +62,7 @@ def build(self, x): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") class TestLazyAllSbpCombinationTesting(flow.unittest.TestCase): def test_lazy_boxing_2d_all_combination(test_case): - model = TestModule() + model = _TestModule() graph = _TestGraph(model) x = flow.ones( diff --git a/python/oneflow/test/graph/test_graph_linear_train.py b/python/oneflow/test/graph/test_graph_linear_train.py index fbf0ab476e8..01841051c78 100644 --- a/python/oneflow/test/graph/test_graph_linear_train.py +++ b/python/oneflow/test/graph/test_graph_linear_train.py @@ -99,6 +99,7 @@ def build(self, x): def one_iter(): of_graph_out = linear_t_g(x) + print(linear_t_g.linear) return of_graph_out.numpy(), linear_t_g.linear.weight.origin.numpy() check_list = [] diff --git a/python/oneflow/test/graph/test_graph_lr_scheduler.py b/python/oneflow/test/graph/test_graph_lr_scheduler.py index 6ced90334ad..dbb13e561fa 100644 --- a/python/oneflow/test/graph/test_graph_lr_scheduler.py +++ b/python/oneflow/test/graph/test_graph_lr_scheduler.py @@ -181,7 +181,7 @@ def test_polynomial_lr(self): base_lr=0.1, iters=20, lr_scheduler=flow.optim.lr_scheduler.PolynomialLR, - steps=20, + decay_batch=20, end_learning_rate=1e-5, power=2.0, atol=1e-5, @@ -191,7 +191,7 @@ def test_polynomial_lr(self): base_lr=0.01, iters=20, lr_scheduler=flow.optim.lr_scheduler.PolynomialLR, - steps=20, + decay_batch=20, end_learning_rate=1e-4, power=1.0, cycle=True, diff --git a/python/oneflow/test/graph/test_graph_lrs.py b/python/oneflow/test/graph/test_graph_lrs.py index 
adedb2205a7..76fcd4c60bc 100644 --- a/python/oneflow/test/graph/test_graph_lrs.py +++ b/python/oneflow/test/graph/test_graph_lrs.py @@ -183,7 +183,7 @@ def _lr_fn(parameters): of_sgd = flow.optim.SGD(parameters, lr=0.001) lr = flow.optim.lr_scheduler.PolynomialLR( - of_sgd, steps=10, end_learning_rate=0.00001, power=2, cycle=True + of_sgd, decay_batch=10, end_learning_rate=0.00001, power=2, cycle=True ) return of_sgd, lr diff --git a/python/oneflow/test/graph/test_graph_ofrecord_reader.py b/python/oneflow/test/graph/test_graph_ofrecord_reader.py index 16b4f161e13..35dcd4d376c 100644 --- a/python/oneflow/test/graph/test_graph_ofrecord_reader.py +++ b/python/oneflow/test/graph/test_graph_ofrecord_reader.py @@ -90,9 +90,6 @@ def build(self): reader_g = GraphReader() image, label = reader_g() - print(image) - print(label) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/graph/test_graph_zero.py b/python/oneflow/test/graph/test_graph_zero.py index 51fa38a8657..4dc9f10bc47 100644 --- a/python/oneflow/test/graph/test_graph_zero.py +++ b/python/oneflow/test/graph/test_graph_zero.py @@ -88,7 +88,12 @@ def build(self, x): def one_train_iter(): out = linear_t_g(x) if flow.env.get_rank() == 0: - print(linear_t_g) + import traceback + + try: + print(linear_t_g) + except: + print(traceback.format_exc()) def one_eval_iter(): out = linear_e_g(x) @@ -206,7 +211,7 @@ def one_eval_iter(): for state in linear_t_g._state(): test_case.assertEqual( - state.origin.sbp, (oneflow.sbp.split(axis=0), oneflow.sbp.split(axis=0)) + state.origin.sbp, (oneflow.sbp.split(dim=0), oneflow.sbp.split(dim=0)) ) # In evaluation graph, paramters's sbp are flow.sbp.split(0). diff --git a/python/oneflow/test/graph/test_nccl_logical_send_recv.py b/python/oneflow/test/graph/test_nccl_logical_send_recv.py index addc6aaf015..9b6b90750d8 100644 --- a/python/oneflow/test/graph/test_nccl_logical_send_recv.py +++ b/python/oneflow/test/graph/test_nccl_logical_send_recv.py @@ -29,7 +29,7 @@ os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "1" -def _test_nccl_logical_send_recv(test_case, src_nd_sbp, dst_nd_sbp): +def _test_nccl_logical_send_recv_2d(test_case, src_nd_sbp, dst_nd_sbp): # can not process p in dst if flow.sbp.partial_sum() in dst_nd_sbp: return @@ -62,7 +62,7 @@ def _test_nccl_logical_send_recv(test_case, src_nd_sbp, dst_nd_sbp): # check graph boxing flow.boxing.nccl.enable_use_compute_stream(True) - class TestNcclLogicalSendRecvGraph(flow.nn.Graph): + class TestNcclLogicalSendRecv2DGraph(flow.nn.Graph): def __init__(self): super().__init__() @@ -70,7 +70,7 @@ def build(self, x): y = x.to_global(sbp=dst_nd_sbp, placement=placement) return y - graph = TestNcclLogicalSendRecvGraph() + graph = TestNcclLogicalSendRecv2DGraph() # graph.debug() y = graph(x) out_np = y.numpy() @@ -84,7 +84,7 @@ def build(self, x): test_case.assertTrue(np.array_equal(out_np, in_np)) -def gen_nd_sbp(): +def gen_2d_sbp(): sbp_list = [ flow.sbp.partial_sum(), flow.sbp.broadcast(), @@ -101,13 +101,85 @@ def gen_nd_sbp(): @flow.unittest.skip_unless_1n4d() @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -class TestNcclLogicalSendRecv(flow.unittest.TestCase): - def test_nccl_logical_send_recv(test_case): +class TestNcclLogicalSendRecv2D(flow.unittest.TestCase): + def test_nccl_logical_send_recv_2d(test_case): arg_dict = OrderedDict() - arg_dict["src_nd_sbp"] = gen_nd_sbp() - arg_dict["dst_nd_sbp"] = gen_nd_sbp() + arg_dict["src_nd_sbp"] = gen_2d_sbp() + arg_dict["dst_nd_sbp"] = gen_2d_sbp() 
for arg in GenArgList(arg_dict): - _test_nccl_logical_send_recv(test_case, *arg) + _test_nccl_logical_send_recv_2d(test_case, *arg) + + +def _test_nccl_logical_send_recv_1d(test_case, src_nd_sbp, dst_nd_sbp): + # can not process p in dst + if flow.sbp.partial_sum() in dst_nd_sbp: + return + + # skip src == dst + if src_nd_sbp == dst_nd_sbp: + return + + # input + placement = flow.placement("cuda", ranks=[0, 1]) + local_np = np.arange(2 * 2 * 2).reshape(2, 2, 2) + x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement) + + # check eager boxing + eager_out = x.to_global(sbp=dst_nd_sbp, placement=placement) + test_case.assertTrue(np.array_equal(eager_out.numpy(), x.numpy())) + + # check graph boxing + flow.boxing.nccl.enable_use_compute_stream(True) + + class TestNcclLogicalSendRecv1DGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + + def build(self, x): + y = x.to_global(sbp=dst_nd_sbp, placement=placement) + return y + + graph = TestNcclLogicalSendRecv1DGraph() + # graph.debug(0) + y = graph(x) + out_np = y.numpy() + in_np = x.numpy() + # if flow.env.get_rank() == 0: + # print("src sbp ", src_nd_sbp, ", dst sbp ", dst_nd_sbp) + # print(graph) + # equal = np.array_equal(out_np, in_np) + # if not equal: + # print("in ", in_np) + # print("out ", out_np) + # print("====================") + test_case.assertTrue(np.array_equal(out_np, in_np)) + + +def gen_1d_sbp(): + sbp_list = [ + flow.sbp.partial_sum(), + flow.sbp.broadcast(), + flow.sbp.split(0), + flow.sbp.split(1), + flow.sbp.split(2), + ] + nd_sbp_list = [] + for sbp0 in sbp_list: + nd_sbp_list.append( + [sbp0,] + ) + return nd_sbp_list + + +@flow.unittest.skip_unless_1n2d() +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +class TestNcclLogicalSendRecv1D(flow.unittest.TestCase): + def test_nccl_logical_send_recv_1d(test_case): + arg_dict = OrderedDict() + arg_dict["src_nd_sbp"] = gen_1d_sbp() + arg_dict["dst_nd_sbp"] = gen_1d_sbp() + for arg in GenArgList(arg_dict): + _test_nccl_logical_send_recv_1d(test_case, *arg) if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_consistent_adaptive_pool.py b/python/oneflow/test/modules/test_consistent_adaptive_pool.py index 88f58934bc8..89f90a2d675 100644 --- a/python/oneflow/test/modules/test_consistent_adaptive_pool.py +++ b/python/oneflow/test/modules/test_consistent_adaptive_pool.py @@ -65,12 +65,12 @@ class TestAdaptiveAvgPool(flow.unittest.TestCase): def test_adaptive_avgpool(test_case): for placement in all_placement(): ndim = 3 - for sbp in all_sbp(placement, max_dim=ndim): + for sbp in all_sbp(placement, max_dim=2): _test_adaptive_avgpoolnd(test_case, ndim, 1, placement, sbp) _test_adaptive_avgpoolnd_functional(test_case, ndim, 1, placement, sbp) ndim = 4 - for sbp in all_sbp(placement, max_dim=ndim): + for sbp in all_sbp(placement, max_dim=2): _test_adaptive_avgpoolnd(test_case, ndim, 2, placement, sbp) _test_adaptive_avgpoolnd_functional(test_case, ndim, 2, placement, sbp) @@ -81,7 +81,7 @@ def test_adaptive_avgpool(test_case): ): continue ndim = 5 - for sbp in all_sbp(placement, max_dim=ndim): + for sbp in all_sbp(placement, max_dim=2): _test_adaptive_avgpoolnd(test_case, ndim, 3, placement, sbp) _test_adaptive_avgpoolnd_functional(test_case, ndim, 3, placement, sbp) diff --git a/python/oneflow/test/modules/test_consistent_mv.py b/python/oneflow/test/modules/test_consistent_mv.py new file mode 100644 index 00000000000..02bde993fa3 --- /dev/null +++ b/python/oneflow/test/modules/test_consistent_mv.py @@ -0,0 +1,39 @@ 
+""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import unittest +import oneflow as flow +import oneflow.unittest +from oneflow.test_utils.automated_test_util import * + + +@autotest(n=1, check_graph=False) +def _test_mv(test_case, placement, sbp): + dim = random(1, 6) + mat = random_tensor(2, dim1=dim).to_global(placement=placement, sbp=sbp) + vec = random_tensor(1, dim0=dim).to_global(placement=placement, sbp=sbp) + return torch.mv(mat, vec) + + +class TestMvModule(flow.unittest.TestCase): + @globaltest + def test_mv(test_case): + for placement in all_placement(): + for sbp in all_sbp(placement): + _test_mv(test_case, placement, sbp) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_rnn_cell.py b/python/oneflow/test/modules/test_consistent_rnn_cell.py index 8ab9a42454d..41fdf87ed17 100644 --- a/python/oneflow/test/modules/test_consistent_rnn_cell.py +++ b/python/oneflow/test/modules/test_consistent_rnn_cell.py @@ -22,7 +22,7 @@ from oneflow.test_utils.automated_test_util import * -@autotest(n=2, check_graph=False) +@autotest(n=1, check_graph=False) def _test_lstm_cell(test_case, placement, sbp): batch_size = random(2, 3) * 8 time_steps = random(2, 3) * 8 @@ -68,7 +68,7 @@ def _test_lstm_cell(test_case, placement, sbp): return res[0] -@autotest(n=2, check_graph=False) +@autotest(n=1, check_graph=False) def _test_rnn_relu_cell(test_case, placement, sbp): batch_size = random(2, 3) * 8 time_steps = random(2, 3) * 8 @@ -112,7 +112,7 @@ def _test_rnn_relu_cell(test_case, placement, sbp): return hx -@autotest(n=2, check_graph=False) +@autotest(n=1, check_graph=False) def _test_rnn_tanh_cell(test_case, placement, sbp): batch_size = random(2, 3) * 8 time_steps = random(2, 3) * 8 @@ -156,7 +156,7 @@ def _test_rnn_tanh_cell(test_case, placement, sbp): return hx -@autotest(n=2, check_graph=False) +@autotest(n=1, check_graph=False) def _test_gru_cell(test_case, placement, sbp): batch_size = random(2, 3) * 8 time_steps = random(2, 3) * 8 diff --git a/python/oneflow/test/modules/test_consistent_slice.py b/python/oneflow/test/modules/test_consistent_slice.py index d3dd5f7092a..0a7422d3f63 100644 --- a/python/oneflow/test/modules/test_consistent_slice.py +++ b/python/oneflow/test/modules/test_consistent_slice.py @@ -89,38 +89,25 @@ def _test_slice_ellipsis_type(test_case, placement, sbp): _check_forward_and_backward(test_case, input, of_out, torch_out) -def _test_logical_slice(test_case, placement, sbp): - input = random_tensor(2, 8, 8, requires_grad=True).oneflow - x_numpy = input.detach().cpu().numpy() - - x = input.to_global(placement=placement, sbp=sbp) - y = flow.logical_slice(x, slice_tup_list=[[0, 1, 1]]) - - # forward - test_case.assertTrue(np.array_equal(y.numpy(), x_numpy[0:1:1])) - - # backward - y.sum().backward() - input_grad_np = np.zeros((8, 8)) - input_grad_np[0:1:1, :] = 1 - test_case.assertTrue(np.array_equal(input.grad.numpy(), input_grad_np)) - - -def 
_test_logical_slice_with_bool(test_case, placement, sbp): +def _test_slice_with_bool(test_case, placement, sbp): x = random_tensor(2, 8, 8).oneflow > 0.5 x_numpy = x.detach().cpu().numpy() x = x.to_global(placement=placement, sbp=sbp) - y = flow.logical_slice(x, slice_tup_list=[[0, 1, 1]]) + y = flow.slice(x, slice_tup_list=[[0, 1, 1]]) test_case.assertTrue(np.array_equal(y.numpy(), x_numpy[0:1:1])) -def _test_logical_slice_with_grad(test_case, placement, sbp): +@autotest( + n=2, auto_backward=False, check_graph=False, +) +def _test_slice_with_grad(test_case, placement): + sbp = random_sbp(placement, max_dim=2).value() x = random_tensor(2, 8, 16, requires_grad=True).oneflow x_numpy = x.detach().cpu().numpy() - class LogicalSliceWithGrad(flow.nn.Module): + class SliceWithGrad(flow.nn.Module): def __init__(self): super().__init__() self.input_grad = flow.nn.Parameter(flow.zeros(8, 16)) @@ -130,16 +117,16 @@ def forward(self, input): x = x.to_global(placement, sbp) return x[:, :8] - logical_slice_with_grad = LogicalSliceWithGrad().to_global( + slice_with_grad_m = SliceWithGrad().to_global( placement, [flow.sbp.broadcast,] * len(sbp) ) - of_sgd = flow.optim.SGD(logical_slice_with_grad.parameters(), lr=1.0, momentum=0.0) + of_sgd = flow.optim.SGD(slice_with_grad_m.parameters(), lr=1.0, momentum=0.0) - class LogicalSliceTrainGraph(flow.nn.Graph): + class SliceTrainGraph(flow.nn.Graph): def __init__(self): super().__init__() - self.module = logical_slice_with_grad + self.module = slice_with_grad_m self.add_optimizer(of_sgd) def build(self, x): @@ -148,7 +135,7 @@ def build(self, x): z.backward() return out - graph = LogicalSliceTrainGraph() + graph = SliceTrainGraph() input = x.to_global(placement=placement, sbp=sbp) y = graph(input) @@ -173,16 +160,12 @@ def test_slice(test_case): _test_slice_1dim(test_case, placement, sbp) _test_negative_index(test_case, placement, sbp) _test_slice_ellipsis_type(test_case, placement, sbp) + _test_slice_with_bool(test_case, placement, sbp) - -class TestLogicalSlice(flow.unittest.TestCase): @globaltest - def test_logical_slice(test_case): + def test_graph_slice(test_case): for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_logical_slice(test_case, placement, sbp) - _test_logical_slice_with_bool(test_case, placement, sbp) - _test_logical_slice_with_grad(test_case, placement, sbp) + _test_slice_with_grad(test_case, placement) if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_consistent_slice_assign.py b/python/oneflow/test/modules/test_consistent_slice_update.py similarity index 75% rename from python/oneflow/test/modules/test_consistent_slice_assign.py rename to python/oneflow/test/modules/test_consistent_slice_update.py index 410b199ac53..e1acb85b0f1 100644 --- a/python/oneflow/test/modules/test_consistent_slice_assign.py +++ b/python/oneflow/test/modules/test_consistent_slice_update.py @@ -22,7 +22,7 @@ from oneflow.test_utils.automated_test_util import * -def _test_logical_slice_assign(test_case, placement, sbp): +def _test_slice_update(test_case, placement, sbp): input = random_tensor(2, 8, 16, requires_grad=True).oneflow value = random_tensor(2, 8, 8, requires_grad=True).oneflow x = (input + 0).to_global( @@ -50,11 +50,11 @@ def _test_logical_slice_assign(test_case, placement, sbp): test_case.assertTrue(np.array_equal(value.grad.numpy(), value_grad_np)) -def _test_graph_logical_slice_assign(test_case, placement, sbp): +def _test_graph_slice_update(test_case, placement, sbp): ref = random_tensor(2, 8, 
16, requires_grad=True).oneflow value = random_tensor(2, 8, 8, requires_grad=True).oneflow - class LogicalSliceAssignWithGrad(flow.nn.Module): + class SliceUpdateWithGrad(flow.nn.Module): def __init__(self): super().__init__() self.ref_grad = flow.nn.Parameter(flow.zeros(8, 16)) @@ -68,18 +68,16 @@ def forward(self, ref, value): x[:, :8] = y return x - logical_slice_assign_with_grad = LogicalSliceAssignWithGrad().to_global( + slice_update_with_grad_m = SliceUpdateWithGrad().to_global( placement, [flow.sbp.broadcast,] * len(sbp) ) - of_sgd = flow.optim.SGD( - logical_slice_assign_with_grad.parameters(), lr=1.0, momentum=0.0 - ) + of_sgd = flow.optim.SGD(slice_update_with_grad_m.parameters(), lr=1.0, momentum=0.0) - class LogicalSliceAssignTrainGraph(flow.nn.Graph): + class SliceUpdateTrainGraph(flow.nn.Graph): def __init__(self): super().__init__() - self.module = logical_slice_assign_with_grad + self.module = slice_update_with_grad_m self.add_optimizer(of_sgd) def build(self, x, y): @@ -88,7 +86,7 @@ def build(self, x, y): z.backward() return out - graph = LogicalSliceAssignTrainGraph() + graph = SliceUpdateTrainGraph() x = ref.to_global(placement=placement, sbp=sbp) y = value.to_global(placement=placement, sbp=sbp) @@ -117,15 +115,19 @@ def build(self, x, y): ) -class TestGlobalLogicalSliceAssign(flow.unittest.TestCase): +class TestGlobalSliceUpdate(flow.unittest.TestCase): @globaltest - def test_logical_slice_assign(test_case): + def test_slice_update(test_case): for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - if placement.ranks.size == 1: - continue - _test_logical_slice_assign(test_case, placement, sbp) - _test_graph_logical_slice_assign(test_case, placement, sbp) + # TODO(wyg): The sbp will be inferred as all-broadcast when 1n1d, + # and slice_update will raise an error when doing the inplace operation. + # Remove this check after refactoring the sbp infer method in the Operator class.
+ if placement.ranks.size == 1: + continue + for _ in range(2): + sbp = random_sbp(placement, max_dim=2).value() + _test_slice_update(test_case, placement, sbp) + _test_graph_slice_update(test_case, placement, sbp) if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_consistent_stateful_kernel_with_cache.py b/python/oneflow/test/modules/test_consistent_stateful_kernel_with_cache.py index 61aabee9e72..5f384d6470b 100644 --- a/python/oneflow/test/modules/test_consistent_stateful_kernel_with_cache.py +++ b/python/oneflow/test/modules/test_consistent_stateful_kernel_with_cache.py @@ -29,35 +29,18 @@ def _test_global_stateful_kernel_with_inpersistent_state(test_case, placement, s .to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast) ) x = x.to_global(placement, sbp) - y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1]) + y = x[0:3, 0:1] y_np = np.array([[0], [8], [16]]) - test_case.assertTrue( - np.array_equal( - y.to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast) - .to_local() - .numpy(), - y_np, - ) - ) - x = x.to_global(sbp=flow.sbp.split(1)) - y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1]) - test_case.assertTrue( - np.array_equal( - y.to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast) - .to_local() - .numpy(), - y_np, - ) - ) + test_case.assertTrue(np.array_equal(y.numpy(), y_np,)) + x = x.to_global(flow.env.all_device_placement("cpu"), sbp=flow.sbp.split(1)) + y = x[0:3, 0:1] + test_case.assertTrue(np.array_equal(y.numpy(), y_np,)) class TestStatefulKernelWithInpersistentState(flow.unittest.TestCase): @globaltest def test_global_stateful_kernel_with_inpersistent_state(test_case): for placement in all_placement(): - # logical_slice only support 1d sbp - if len(placement.ranks.shape) != 1: - continue for sbp in all_sbp(placement, max_dim=2): _test_global_stateful_kernel_with_inpersistent_state( test_case, placement, sbp diff --git a/python/oneflow/test/modules/test_consistent_tensordot.py b/python/oneflow/test/modules/test_consistent_tensordot.py index 517d8ad1c38..cf0abaadd2a 100644 --- a/python/oneflow/test/modules/test_consistent_tensordot.py +++ b/python/oneflow/test/modules/test_consistent_tensordot.py @@ -20,7 +20,7 @@ from oneflow.test_utils.automated_test_util import * -@autotest(n=1, check_graph=False) +@autotest(n=1, check_graph=False, atol=1e-3) def _test_global_tensordot_against_pytorch(test_case, ndim, placement, sbp): k = random(1, 2) * 8 tensordot_dim = random(0, ndim + 1).to(int) diff --git a/python/oneflow/test/modules/test_consistent_var.py b/python/oneflow/test/modules/test_consistent_var.py index faf9f7e2427..5bd3f2a8a8f 100644 --- a/python/oneflow/test/modules/test_consistent_var.py +++ b/python/oneflow/test/modules/test_consistent_var.py @@ -25,28 +25,20 @@ @autotest(n=1, check_graph=False) def _test_flow_global_var_all_dim_with_random_data(test_case, placement, sbp): x = random_tensor( - ndim=4, - dim0=random(1, 3).to(int) * 8, - dim1=random(1, 3).to(int) * 8, - dim2=random(1, 3).to(int) * 8, - dim3=random(1, 3).to(int) * 8, + ndim=2, dim0=random(1, 3).to(int) * 8, dim1=random(1, 3).to(int) * 8, ).to_global(placement, sbp) y = torch.var(x) return y -@autotest(n=2, check_graph=False) +@autotest(n=1, check_graph=False) def _test_flow_global_var_one_dim_with_random_data(test_case, placement, sbp): x = random_tensor( - ndim=4, - dim0=random(1, 3).to(int) * 8, - dim1=random(1, 3).to(int) * 8, - dim2=random(1, 3).to(int) * 8, - dim3=random(1, 3).to(int) * 8, + ndim=2, dim0=random(1, 3).to(int) * 8, 
dim1=random(1, 3).to(int) * 8, ).to_global(placement, sbp) y = torch.var( x, - dim=random(low=0, high=4).to(int), + dim=random(low=0, high=2).to(int), unbiased=random().to(bool), keepdim=random().to(bool), ) @@ -55,10 +47,10 @@ def _test_flow_global_var_one_dim_with_random_data(test_case, placement, sbp): @autotest(n=1, auto_backward=True, check_graph=False) def _test_flow_var_0_size_data_with_random_data(test_case, placement, sbp): - x = random_tensor(4, 8, 16, 0, 8).to_global(placement, sbp) + x = random_tensor(3, 8, 0, 8).to_global(placement, sbp) y = torch.var( x, - dim=random(low=0, high=4).to(int), + dim=random(low=0, high=3).to(int), unbiased=random().to(bool), keepdim=random().to(bool), ) @@ -69,7 +61,7 @@ class TestVar(flow.unittest.TestCase): @globaltest def test_flow_global_var_all_dim_with_random_data(test_case): for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=4): + for sbp in all_sbp(placement, max_dim=2): _test_flow_global_var_all_dim_with_random_data( test_case, placement, sbp ) @@ -77,7 +69,7 @@ def test_flow_global_var_all_dim_with_random_data(test_case): @globaltest def test_flow_global_var_one_dim_with_random_data(test_case): for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=4): + for sbp in all_sbp(placement, max_dim=2): _test_flow_global_var_one_dim_with_random_data( test_case, placement, sbp ) @@ -85,7 +77,7 @@ def test_flow_global_var_one_dim_with_random_data(test_case): @globaltest def test_flow_var_0_size_data_with_random_data(test_case): for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=4, valid_split_axis=[0, 1, 3]): + for sbp in all_sbp(placement, max_dim=2, valid_split_axis=[0]): _test_flow_var_0_size_data_with_random_data(test_case, placement, sbp) diff --git a/python/oneflow/test/modules/test_fused_dot_feature_interaction.py b/python/oneflow/test/modules/test_fused_dot_feature_interaction.py index b6034590233..dc86fc31afe 100644 --- a/python/oneflow/test/modules/test_fused_dot_feature_interaction.py +++ b/python/oneflow/test/modules/test_fused_dot_feature_interaction.py @@ -188,7 +188,7 @@ def test_fused_dot_feature_interaction_pooling_sum(test_case): arg_dict = OrderedDict() arg_dict["dtype"] = [flow.float16, flow.float32] arg_dict["feature_dims"] = [[39], [13, 26], [1, 10, 3]] - arg_dict["embedding_size"] = [127, 128, 16, 11, 12, 110] + arg_dict["embedding_size"] = [16, 11, 12] for kwargs in GenArgDict(arg_dict): _test_fused_dot_feature_interaction_pooling_sum(test_case, **kwargs) diff --git a/python/oneflow/test/modules/test_fused_scale_mask_softmax.py b/python/oneflow/test/modules/test_fused_scale_mask_softmax.py index 4697b01fd7f..56bed5f94cc 100644 --- a/python/oneflow/test/modules/test_fused_scale_mask_softmax.py +++ b/python/oneflow/test/modules/test_fused_scale_mask_softmax.py @@ -26,15 +26,17 @@ def _test_fused_scale_mask_softmax( - test_case, batch_size, num_heads, seq_length, fill_value, scale_value, + test_case, batch_size, num_heads, seq_length, fill_value, scale_value, broadcast_dim ): - - x = np.random.randn(batch_size, num_heads, seq_length, seq_length) - mask = np.random.randint( - 0, 2, size=(batch_size, num_heads, seq_length, seq_length), dtype=np.bool + x = np.random.randn(batch_size, num_heads, seq_length, seq_length).astype( + np.float32 ) + mask_size = [batch_size, num_heads, seq_length, seq_length] + if broadcast_dim: + mask_size[broadcast_dim] = 1 - fused_x_tensor = flow.tensor(x).to("cuda") + mask = np.random.randint(0, 2, size=mask_size, dtype=np.bool) + 
fused_x_tensor = flow.tensor(x, dtype=flow.float32).to("cuda") fused_mask_tensor = flow.tensor(mask, dtype=flow.bool).to("cuda") fused_x_tensor.requires_grad = True @@ -77,6 +79,7 @@ def test_fused_op(test_case): args_dict["seq_length"] = [16, 32, 64] args_dict["fill_value"] = [-10000.0] args_dict["scale_value"] = [1.0, 2.0, 4.0] + args_dict["broadcast_dim"] = [None, 0, 1, 2] for arg in GenArgList(args_dict): arg[0](test_case, *arg[1:]) diff --git a/python/oneflow/test/modules/test_fused_scale_mask_softmax_dropout.py b/python/oneflow/test/modules/test_fused_scale_mask_softmax_dropout.py index 8d101f4ff5b..ea4a22254c0 100644 --- a/python/oneflow/test/modules/test_fused_scale_mask_softmax_dropout.py +++ b/python/oneflow/test/modules/test_fused_scale_mask_softmax_dropout.py @@ -27,14 +27,22 @@ def _test_fused_scale_mask_softmax_dropout( - test_case, batch_size, num_heads, seq_length, fill_value, scale_value, p + test_case, + batch_size, + num_heads, + seq_length, + fill_value, + scale_value, + broadcast_dim, + p, ): x = np.random.randn(batch_size, num_heads, seq_length, seq_length) - mask = np.random.randint( - 0, 2, size=(batch_size, num_heads, seq_length, seq_length), dtype=np.bool - ) + mask_size = [batch_size, num_heads, seq_length, seq_length] + if broadcast_dim: + mask_size[broadcast_dim] = 1 + mask = np.random.randint(0, 2, size=mask_size, dtype=np.bool) - fused_x_tensor = flow.tensor(x).to("cuda") + fused_x_tensor = flow.tensor(x, dtype=flow.float32).to("cuda") fused_mask_tensor = flow.tensor(mask, dtype=flow.bool).to("cuda") fused_x_tensor.requires_grad = True @@ -47,7 +55,7 @@ def _test_fused_scale_mask_softmax_dropout( p=p, )[0] - origin_x_tensor = flow.tensor(x).to("cuda") + origin_x_tensor = flow.tensor(x, dtype=flow.float32).to("cuda") origin_mask_tensor = flow.tensor(mask, dtype=flow.float32).to("cuda") origin_x_tensor.requires_grad = True origin_out = flow.mul( @@ -83,6 +91,7 @@ def test_fused_op(test_case): args_dict["seq_length"] = [8, 16, 32, 64] args_dict["fill_value"] = [-10000.0] args_dict["scale_value"] = [1.0, 2.0, 4.0] + args_dict["broadcast_dim"] = [None, 0, 1, 2] args_dict["p"] = [0.0, 1.0] for arg in GenArgList(args_dict): diff --git a/python/oneflow/test/modules/test_hsplit.py b/python/oneflow/test/modules/test_hsplit.py index 26c8d77a9f8..5dc413ebaf4 100644 --- a/python/oneflow/test/modules/test_hsplit.py +++ b/python/oneflow/test/modules/test_hsplit.py @@ -34,7 +34,7 @@ def test_flow_hsplit_vec(test_case): dim3=random(3, 6), ).to(device) z = torch.hsplit(x, (1, 2)) - return z[0] + return z @autotest(n=5) def test_flow_hsplit_vec_with_stride(test_case): @@ -50,7 +50,7 @@ def test_flow_hsplit_vec_with_stride(test_case): shuffle(perm) y = x.permute(perm) z = torch.hsplit(y, (1, 2)) - return z[0] + return z @flow.unittest.skip_unless_1n1d() @@ -63,7 +63,7 @@ def test_flow_hsplit_int(test_case): ).to(device) split = oneof(2, 4, 6) z = torch.hsplit(x, split) - return z[0] + return z if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_matmul.py b/python/oneflow/test/modules/test_matmul.py index 279c184dee3..2d394f7b850 100644 --- a/python/oneflow/test/modules/test_matmul.py +++ b/python/oneflow/test/modules/test_matmul.py @@ -49,6 +49,19 @@ def test_flow_tensor_broadcast_matmul_with_random_data(test_case): y = random_tensor(ndim=2, dim0=k).to(device) return x.matmul(y) + @autotest(check_graph=True) + def test_flow_mv_with_random_data(test_case): + device = random_device() + k = random(1, 6) + x = random_tensor(ndim=2, dim1=k).to(device) + y = 
random_tensor(ndim=1, dim0=k).to(device) + z = torch.mv(x, y) + return z + + @profile(torch.mv) + def profile_mv(test_case): + torch.mv(torch.ones(32, 64), torch.ones(64)) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_nll_loss.py b/python/oneflow/test/modules/test_nll_loss.py new file mode 100644 index 00000000000..301c3bc901a --- /dev/null +++ b/python/oneflow/test/modules/test_nll_loss.py @@ -0,0 +1,134 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +import unittest + +import oneflow as flow +import oneflow.unittest + +from oneflow.test_utils.automated_test_util import * + + +@autotest(n=1) +def _test_nll_loss( + test_case, has_weight=False, split_batch_dim=False, split_class_dim=False +): + N = random(1, 4) * 2 + C = random(1, 10) * 2 + ndim = random(2, 5).to(int).value() + dims = [random(2, 10) for i in range(ndim - 2)] + input_dims = [N, C] + dims + target_dims = [N] + dims + input = random_tensor(ndim, *input_dims) + target = random_tensor( + ndim - 1, *target_dims, low=0, high=C, dtype=int, requires_grad=False + ) + weight = None + if has_weight: + weight = random_tensor(1, C, requires_grad=False) + + device = random_device().value() + if not split_class_dim and not split_batch_dim: + input = input.to(device) + target = target.to(device) + if has_weight: + weight = weight.to(device) + else: + rank = flow.env.get_rank() + world_size = flow.env.get_world_size() + assert world_size % 2 == 0 + ranks = np.array(range(world_size)) + + if split_batch_dim and split_class_dim: + placement = flow.placement(device, ranks.reshape((ranks.size // 2, 2))) + input_sbp = [flow.sbp.split(0), flow.sbp.split(1)] + target_sbp = [flow.sbp.split(0), flow.sbp.broadcast()] + weight_sbp = [flow.sbp.broadcast(), flow.sbp.split(0)] + elif split_batch_dim: + placement = flow.placement(device, ranks) + input_sbp = flow.sbp.split(0) + target_sbp = flow.sbp.split(0) + weight_sbp = flow.sbp.broadcast() + else: + placement = flow.placement(device, ranks) + input_sbp = flow.sbp.split(1) + target_sbp = flow.sbp.broadcast() + weight_sbp = flow.sbp.split(0) + + input = input.to_global(placement=placement, sbp=input_sbp) + target = target.to_global(placement=placement, sbp=target_sbp) + # print( + # f"**[{rank}] input: {input.oneflow.shape} {input.oneflow.placement} {input.oneflow.sbp}" + # ) + # print( + # f"**[{rank}] target: {target.oneflow.shape} {target.oneflow.placement} {target.oneflow.sbp}" + # ) + if has_weight: + # print(f"**[{rank}] weight: {weight.oneflow.numpy()}") + weight = weight.to_global(placement=placement, sbp=weight_sbp) + + reduction = oneof("none", "sum", "mean") + if has_weight: + nll = torch.nn.NLLLoss(weight=weight, reduction=reduction) + else: + nll = torch.nn.NLLLoss(reduction=reduction) + return nll(input, target) + + +@flow.unittest.skip_unless_1n1d() +class NLLLossTestCase(flow.unittest.TestCase): + def test_local(test_case): + _test_nll_loss(test_case) + + def 
test_weighted(test_case): + _test_nll_loss(test_case, has_weight=True) + + +@flow.unittest.skip_unless_1n2d() +class ParallelNLLLossTestCase(flow.unittest.TestCase): + @globaltest + def test_data_parallel(test_case): + _test_nll_loss(test_case, split_batch_dim=True) + + @globaltest + def test_data_parallel_weighted(test_case): + _test_nll_loss(test_case, has_weight=True, split_batch_dim=True) + + @globaltest + def test_model_parallel(test_case): + _test_nll_loss(test_case, split_class_dim=True) + + @globaltest + def test_model_parallel_weighted(test_case): + _test_nll_loss(test_case, has_weight=True, split_class_dim=True) + + +@flow.unittest.skip_unless_1n4d() +class TowDParallelNLLLossTestCase(flow.unittest.TestCase): + @globaltest + def test_2d_parallel(test_case): + _test_nll_loss(test_case, split_batch_dim=True, split_class_dim=True) + + @globaltest + def test_2d_parallel_weighted(test_case): + _test_nll_loss( + test_case, has_weight=True, split_batch_dim=True, split_class_dim=True + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_repeat_interleave.py b/python/oneflow/test/modules/test_repeat_interleave.py index 95faea06ac5..5a636f0e66c 100644 --- a/python/oneflow/test/modules/test_repeat_interleave.py +++ b/python/oneflow/test/modules/test_repeat_interleave.py @@ -15,8 +15,10 @@ """ import unittest +import numpy as np import oneflow as flow import oneflow.unittest +import torch as torch_original from oneflow.test_utils.automated_test_util import * @@ -39,17 +41,37 @@ def test_flow_int_repeat_interleave_with_dim(test_case): @autotest(n=5) def test_flow_tensor_repeat_interleave_dim(test_case): x = random_tensor(ndim=3, dim0=2, dim1=2, dim2=3) - y = random_tensor(ndim=1, dim0=2, dtype=int, low=1, high=4) + y = random_tensor(ndim=1, dim0=2, dtype=int, low=0, high=4) z = torch.repeat_interleave(x, y, 1) return z @autotest(n=5) def test_flow_tensor_repeat_interleave_dim_with_output_size(test_case): x = random_tensor(ndim=3, dim0=2, dim1=2, dim2=3) - y = random_tensor(ndim=1, dim0=2, dtype=int, low=1, high=4) + y = random_tensor(ndim=1, dim0=2, dtype=int, low=0, high=4) z = torch.repeat_interleave(x, y, 1, output_size=2) return z + def test_flow_tensor_repeat_interleave_0size_tensor(test_case): + np_arr = np.array( + [ + [[0.8548, 0.0436, 0.7977], [0.1919, 0.4191, 0.2186]], + [[0.4741, 0.8896, 0.6859], [0.5223, 0.7803, 0.1134]], + ] + ) + x_torch = torch_original.tensor(np_arr) + x_torch.requires_grad = True + y_torch = torch_original.tensor([0, 0]) + z_torch = torch_original.repeat_interleave(x_torch, y_torch, 1) + z_torch.sum().backward() + + x_flow = flow.tensor(np_arr) + x_flow.requires_grad = True + y_flow = flow.tensor([0, 0]) + z_flow = flow.repeat_interleave(x_flow, y_flow, 1) + z_flow.sum().backward() + test_case.assertTrue(np.array_equal(x_torch.grad.numpy(), x_flow.grad.numpy())) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_slice.py b/python/oneflow/test/modules/test_slice.py index 87f37b91ebe..a0cb1f8cc16 100644 --- a/python/oneflow/test/modules/test_slice.py +++ b/python/oneflow/test/modules/test_slice.py @@ -131,38 +131,6 @@ def _test_slice_backward(test_case, device): test_case.assertTrue(np.array_equal(x.grad.numpy(), np_grad)) -def _test_slice_update(test_case, device): - x = np.array([1, 1, 1, 1, 1]).astype(np.float32) - input = flow.tensor(x, requires_grad=True) - input.retain_grad() - update = flow.tensor(np.array([2, 3, 4]).astype(np.float32), requires_grad=True) - output = 
np.array([1.0, 2.0, 3.0, 4.0, 1.0]) - # Get the inplaced tensor grad by another tensor - t = input + 0 - flow._C.slice_update(t, update, [1,], [4,], [1,], inplace=True) - z = t.sum() - z.backward() - test_case.assertTrue(np.array_equal(t.numpy(), output)) - np_grad = np.zeros(x.shape) - np_grad[0] = 1 - np_grad[4] = 1 - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - test_case.assertTrue(np.array_equal(update.grad.numpy(), np.ones(update.shape))) - - -def _test_slice_update_with_stride(test_case, device): - arr = np.arange(24).reshape(2, 2, 2, 3).astype(np.float32) - np_in = arr - np_out = np_in.transpose(1, 0, 2, 3) - np_out[0:1, 1:2, :, 1:2] = 3.1415 - - input = flow.tensor(arr, device=flow.device(device)) - output = input.permute(1, 0, 2, 3) - output[0:1, 1:2, :, 1:2] = 3.1415 - - test_case.assertTrue(np.array_equal(output.numpy(), np_out)) - - @flow.unittest.skip_unless_1n1d() class TestSlice(flow.unittest.TestCase): def test_slice(test_case): @@ -185,88 +153,22 @@ def test_slice(test_case): @flow.unittest.skip_unless_1n1d() class TestSliceUpdate(flow.unittest.TestCase): - def test_slice(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_slice_update, - # # TODO:(zhaoluyang) test when slice_update support stride - # _test_slice_update_with_stride - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - - def test_slice_update_graph(test_case): - x = np.array([1, 1, 1, 1, 1]).astype(np.float32) - input = flow.tensor(x, requires_grad=True) - update = flow.tensor(np.array([2, 3, 4]).astype(np.float32), requires_grad=True) - output = np.array([1.0, 2.0, 3.0, 4.0, 1.0]) - - class TestModule(flow.nn.Module): - def __init__(self): - super().__init__() - self.weight = flow.nn.Parameter(flow.Tensor(x)) - - def forward(self, x, update): - flow._C.slice_update(x, update, [1,], [4,], [1,], inplace=True) - y = x + self.weight - return x, y - - test_m = TestModule() - of_sgd = flow.optim.SGD(test_m.parameters(), lr=0.001, momentum=0.9) - - class TestSliceUpdateGraph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.m = test_m - self.add_optimizer(of_sgd) - - def build(self, x, update): - x, y = self.m(x, update) - z = y.sum() - z.backward() - return x - - slice_update_g = TestSliceUpdateGraph() - - y = slice_update_g(input, update) - test_case.assertTrue(np.array_equal(y.numpy(), output)) - # TODO(): check grad of slice_update in graph. - - -@flow.unittest.skip_unless_1n1d() -class TestLogicalSliceAssign(flow.unittest.TestCase): - def test_logical_slice_assign(test_case): + def test_slice_update(test_case): x = np.array([1, 1, 1, 1, 1]).astype(np.float32) input = flow.tensor(x) update = flow.tensor(np.array([2, 3, 4]).astype(np.float32)) output = np.array([1.0, 2.0, 3.0, 4.0, 1.0]) - flow.logical_slice_assign(input, update, slice_tup_list=[[1, 4, 1]]) + flow.slice_update(input, update, slice_tup_list=[[1, 4, 1]]) test_case.assertTrue(np.array_equal(input.numpy(), output)) - def test_logical_slice_assign_graph(test_case): - x = np.array([1, 1, 1, 1, 1]).astype(np.float32) - input = flow.tensor(x) - update = flow.tensor(np.array([2, 3, 4]).astype(np.float32)) - output = np.array([1.0, 2.0, 3.0, 4.0, 1.0]) - - @flow.nn.Graph.to_graph - def test_func(input): - flow.logical_slice_assign(input, update, slice_tup_list=[[1, 4, 1]]) - return input - - # NOTE(strint): input outside the graph has not been change yet currently. 
- out = test_func(input) - test_case.assertTrue(np.array_equal(out.numpy(), output)) - - def test_logical_slice_assign_negative_index(test_case): + def test_slice_update_negative_index(test_case): np_arr = np.zeros(shape=(2, 3, 4)) input = flow.tensor(np_arr, dtype=flow.float32) np_arr[-1] = 1 input[-1] = 1 test_case.assertTrue(np.array_equal(input.numpy(), np_arr)) - def test_logical_slice_assign_negative_index_graph(test_case): + def test_slice_update_negative_index_graph(test_case): np_arr = np.zeros(shape=(2, 3, 4)) input = flow.tensor(np_arr, dtype=flow.float32) np_arr[-1] = 1 @@ -279,14 +181,14 @@ def test_func(): out = test_func() test_case.assertTrue(np.array_equal(out.numpy(), np_arr)) - def test_logical_slice_assign_ellipsis_type(test_case): + def test_slice_update_ellipsis_type(test_case): np_arr = np.zeros(shape=(2, 3, 4, 5, 6)) input = flow.tensor(np_arr, dtype=flow.float32) np_arr[0, ::1, ..., 2:3] = 1 input[0, ::1, ..., 2:3] = 1 test_case.assertTrue(np.array_equal(input.numpy(), np_arr)) - def test_logical_slice_assign_ellipsis_type_graph(test_case): + def test_slice_update_ellipsis_type_graph(test_case): np_arr = np.zeros(shape=(2, 3, 4, 5, 6)) input = flow.tensor(np_arr, dtype=flow.float32) np_arr[0, ::1, ..., 2:3] = 1 @@ -299,6 +201,63 @@ def test_func(): out = test_func() test_case.assertTrue(np.array_equal(out.numpy(), np_arr)) + def test_slice_update_grad_graph(test_case): + x = np.array([1, 1, 1, 1, 1]).astype(np.float32) + input = flow.tensor(x, requires_grad=True) + update = flow.tensor(np.array([2, 3, 4]).astype(np.float32), requires_grad=True) + output = np.array([1.0, 2.0, 3.0, 4.0, 1.0]) + + class TestModule(flow.nn.Module): + def __init__(self): + super().__init__() + self.ref_grad = flow.nn.Parameter(flow.zeros(5)) + self.value_grad = flow.nn.Parameter(flow.zeros(3)) + + def forward(self, ref, value): + x = ref + self.ref_grad + y = value + self.value_grad + return flow._C.slice_update(x, y, [1,], [4,], [1,]) + + test_m = TestModule() + of_sgd = flow.optim.SGD(test_m.parameters(), lr=1.0, momentum=0.0) + + class TestSliceUpdateGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + self.m = test_m + self.add_optimizer(of_sgd) + + def build(self, ref, update): + x = self.m(ref, update) + x.sum().backward() + return x + + slice_update_g = TestSliceUpdateGraph() + + y = slice_update_g(input, update) + + # forward + test_case.assertTrue(np.array_equal(y.numpy(), output)) + # ref grad + ref_grad = np.array([1.0, 0.0, 0.0, 0.0, 1.0]).astype(np.float32) + test_case.assertTrue(np.array_equal(-test_m.ref_grad, ref_grad)) + # value grad + value_grad = np.array([1.0, 1.0, 1.0]).astype(np.float32) + test_case.assertTrue(np.array_equal(-test_m.value_grad, value_grad)) + + @unittest.skip("TODO:(zhaoluyang) test when slice_update support stride") + def test_slice_update_with_stride(test_case, device): + arr = np.arange(24).reshape(2, 2, 2, 3).astype(np.float32) + np_in = arr + np_out = np_in.transpose(1, 0, 2, 3) + np_out[0:1, 1:2, :, 1:2] = 3.1415 + + input = flow.tensor(arr, device=flow.device(device)) + output = input.permute(1, 0, 2, 3) + output[0:1, 1:2, :, 1:2] = 3.1415 + + test_case.assertTrue(np.array_equal(output.numpy(), np_out)) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_stateful_kernel_with_cache.py b/python/oneflow/test/modules/test_stateful_kernel_with_cache.py index 0c1c783d5bf..76893f4680b 100644 --- a/python/oneflow/test/modules/test_stateful_kernel_with_cache.py +++ 
b/python/oneflow/test/modules/test_stateful_kernel_with_cache.py @@ -28,13 +28,13 @@ class TestStatefulKernelWithInpersistentState(flow.unittest.TestCase): def test_stateful_kernel_with_inpersistent_state(test_case): x = flow.arange(4).reshape(2, 2) x = x.to_global(flow.env.all_device_placement("cuda"), flow.sbp.split(0)) - y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1]) + y = x[0:3, 0:1] y_np = np.array([[0], [2], [0]]) test_case.assertTrue( np.array_equal(y.to_global(sbp=flow.sbp.broadcast).to_local().numpy(), y_np) ) x = x.to_global(sbp=flow.sbp.split(1)) - y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1]) + y = x[0:3, 0:1] test_case.assertTrue( np.array_equal(y.to_global(sbp=flow.sbp.broadcast).to_local().numpy(), y_np) ) diff --git a/python/oneflow/test/tensor/test_tensor_part_1.py b/python/oneflow/test/tensor/test_tensor_part_1.py index 55da4a4a373..90b0657d23a 100644 --- a/python/oneflow/test/tensor/test_tensor_part_1.py +++ b/python/oneflow/test/tensor/test_tensor_part_1.py @@ -416,6 +416,16 @@ def test_matmul_with_random_data(test_case): b = random_tensor(ndim=2, dim0=dim1, dim1=dim2) return a @ b + @flow.unittest.skip_unless_1n1d() + @autotest() + def test_mm_with_random_data(test_case): + device = random_device() + dim0 = random(low=2, high=10).to(int) + dim1 = random(low=3, high=20).to(int) + a = random_tensor(ndim=2, dim0=dim0, dim1=dim1).to(device) + b = random_tensor(ndim=1, dim0=dim1).to(device) + return a.mv(b) + @flow.unittest.skip_unless_1n1d() def test_tensor_to_list(test_case): list_data = [[1.0, 3.0], [5.0, 6.0]] @@ -940,23 +950,6 @@ def test_tensor_slice(test_case): np.allclose(input[0, :, 0:2].numpy(), x[0, :, 0:2], 1e-05, 1e-05) ) - @flow.unittest.skip_unless_1n1d() - def test_tensor_logical_slice_assign(test_case): - x = np.random.randn(2, 3, 4, 5).astype(np.float32) - input = flow.tensor(x) - input[:, 0] = 3.1415926 - x[:, 0] = 3.1415926 - test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05)) - input[:, 1:2] = 1 - x[:, 1:2] = 1 - test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05)) - input[:] = 1.234 - x[:] = 1.234 - test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05)) - input[0] = 0 - x[0] = 0 - test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05)) - @flow.unittest.skip_unless_1n1d() def test_zeros_(test_case): shape = (2, 3) diff --git a/python/oneflow/test/tensor/test_tensor_pin_memory.py b/python/oneflow/test/tensor/test_tensor_pin_memory.py index e619dd412df..4675c4b9abc 100644 --- a/python/oneflow/test/tensor/test_tensor_pin_memory.py +++ b/python/oneflow/test/tensor/test_tensor_pin_memory.py @@ -70,6 +70,17 @@ def test_tensor_construct_with_pin_memory_param(test_case): ) return x + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + @flow.unittest.skip_unless_1n1d() + @autotest(n=5, auto_backward=True, check_graph=False) + def test_tensor_is_pinned(test_case): + device = random_device() + x = random_tensor(ndim=4).to(device) + y = x.pin_memory() + test_case.assertTrue(x.oneflow.is_pinned() == x.pytorch.is_pinned()) + test_case.assertTrue(y.oneflow.is_pinned() == y.pytorch.is_pinned()) + return y + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test_utils/automated_test_util/profiler.py b/python/oneflow/test_utils/automated_test_util/profiler.py index 8e6551e9d9d..9d7ff2a24a3 100644 --- a/python/oneflow/test_utils/automated_test_util/profiler.py +++ b/python/oneflow/test_utils/automated_test_util/profiler.py @@ -20,7 +20,9 @@ import torch import 
oneflow as flow import oneflow.support.env_var_util -import oneflow.test_utils.automated_test_util.torch_flow_dual_object as dual_object_module +from oneflow.test_utils.automated_test_util import ( + torch_flow_dual_object as dual_object_module, +) __all__ = ["profile", "set_profiler_hook", "profile_dual_object"] diff --git a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py index 6b213180659..b0254129ca6 100644 --- a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py +++ b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py @@ -23,7 +23,7 @@ import numpy as np import oneflow as flow -import oneflow.test_utils.automated_test_util.profiler as auto_profiler +from oneflow.test_utils.automated_test_util import profiler as auto_profiler flow.backends.cudnn.deterministic = True @@ -65,9 +65,10 @@ def torch_tensor_to_flow(x): note_pytorch_method_names = [] note_pytorch_args = [] note_pytorch_kwargs = [] +vis_tensor = [] vis_parameters = {} call_tensor_id = [] -extra_input_tensor = set() +extra_input_tensor = [] class PyTorchDoesNotSupportError(Exception): @@ -591,22 +592,50 @@ def get_pytorch_oneflow_res( pytorch_res = pytorch(*pytorch_args, **pytorch_kwargs) if isinstance(pytorch_res, torch_original.Tensor): - if ( - hasattr(pytorch, "__name__") - and pytorch.__name__ == "to" - and ( - (len(pytorch_args) > 0 and pytorch_args[0] == "cpu") - or (len(pytorch_kwargs) > 0 and pytorch_kwargs["device"] == "cpu") - ) - ): - extra_input_tensor.add(pytorch_res) - elif ( - len(pytorch_args) > 0 - and isinstance(pytorch_args[0], torch_original.Tensor) - and id(pytorch_args[0]) == id(pytorch_res) - ): - extra_input_tensor.add(pytorch_res) - else: + call_flag = True + source_flag = True + for x in pytorch_args: + if isinstance(x, (tuple, list)): + for y in x: + if torch_original.is_tensor(y): + source_flag = False + if ( + id(pytorch_res) == id(y) + and pytorch_res.device.type == y.device.type + ): + call_flag = False + break + elif torch_original.is_tensor(x): + source_flag = False + if ( + id(pytorch_res) == id(x) + and pytorch_res.device.type == x.device.type + ): + call_flag = False + break + for x in pytorch_kwargs.values(): + if isinstance(x, (tuple, list)): + for y in x: + if torch_original.is_tensor(y): + source_flag = False + if ( + id(pytorch_res) == id(y) + and pytorch_res.device.type == y.device.type + ): + call_flag = False + break + elif torch_original.is_tensor(x): + source_flag = False + if ( + id(pytorch_res) == id(x) + and pytorch_res.device.type == x.device.type + ): + call_flag = False + break + if source_flag and pytorch.__name__ != "to": + call_tensor_id.append(id(pytorch_res)) + extra_input_tensor.append(pytorch_res) + elif call_flag: call_tensor_id.append(id(pytorch_res)) except Exception as e: @@ -650,7 +679,11 @@ def get_pytorch_oneflow_tensor_res( try: pytorch_res = pytorch_method(*pytorch_args, **pytorch_kwargs) if isinstance(pytorch_res, torch_original.Tensor): - call_tensor_id.append(id(pytorch_res)) + if ( + id(pytorch_res) != id(pytorch_method.__self__) + or pytorch_res.device.type == pytorch_method.__self__.device.type + ): + call_tensor_id.append(id(pytorch_res)) except Exception as e: if align_exception: try: @@ -791,7 +824,7 @@ def note_print_kwargs(x, y, end=True): print(f"\033[32m{x}={y}\033[0m", end="") -def print_note_fake_program(): +def print_note_fake_program(detail=False): code_len = len(note_pytorch_method_names) for i in 
range(code_len): note_pytorch_args_len = len(note_pytorch_args[i]) @@ -814,6 +847,58 @@ def print_note_fake_program(): x, note_pytorch_kwargs[i][x], index < note_pytorch_kwargs_len ) print(f"\033[32m)\033[0m") + if detail: + print( + f"\033[32m-----------------------------------------------------------\033[0m" + ) + unique_vis_tensor = [] + flag_vis_input_tensor = [False for _ in range(len(vis_tensor))] + for i in range(len(vis_tensor)): + if flag_vis_input_tensor[i] == True: + continue + unique_vis_tensor.append(vis_tensor[i]) + flag_vis_input_tensor[i] = True + for j in range(i + 1, len(vis_tensor)): + if ( + id(vis_tensor[i]) == id(vis_tensor[j]) + and flag_vis_input_tensor[j] == False + ): + flag_vis_input_tensor[j] = True + unique_extra_tensor = [] + flag_vis_extra_tensor = [False for _ in range(len(extra_input_tensor))] + for i in range(len(extra_input_tensor)): + if flag_vis_extra_tensor[i] == True: + continue + unique_extra_tensor.append(extra_input_tensor[i]) + flag_vis_extra_tensor[i] = True + for j in range(i + 1, len(extra_input_tensor)): + if ( + id(extra_input_tensor[i]) == id(extra_input_tensor[j]) + and flag_vis_extra_tensor[j] == False + ): + flag_vis_extra_tensor[j] = True + + print( + f"\033[32mThis program has {len(unique_extra_tensor) + len(unique_vis_tensor)} input tensor: \033[0m" + ) + for input_tensor in iter(unique_extra_tensor): + print(f"\033[32mShape{get_tensor_shape(input_tensor)}\033[0m") + print(f"\033[32m{input_tensor}\033[0m") + print( + f"\033[32m-----------------------------------------------------------\033[0m" + ) + for input_tensor in iter(unique_vis_tensor): + print(f"\033[32mShape{get_tensor_shape(input_tensor)}\033[0m") + print(f"\033[32m{input_tensor}\033[0m") + print( + f"\033[32m-----------------------------------------------------------\033[0m" + ) + if vis_parameters: + print( + f"\033[32m-------------------nn.Module Parameters---------------------\033[0m" + ) + for name, param in vis_parameters.items(): + print(f"\033[32m{name}: {param}\033[0m") def clear_note_fake_program(): @@ -821,6 +906,7 @@ def clear_note_fake_program(): note_pytorch_args.clear() note_pytorch_kwargs.clear() call_tensor_id.clear() + vis_tensor.clear() vis_parameters.clear() extra_input_tensor.clear() flow.set_printoptions(profile="full") @@ -962,7 +1048,7 @@ def check_tensor_equality( ): if torch_tensor.grad is not None: if flow_tensor.grad is None: - print_note_fake_program() + print_note_fake_program(detail=True) assert ( flow_tensor.grad is not None ), f"OneFlow tensor doesn't have grad while PyTorch tensor has one, PyTorch tensor is\n {torch_tensor}\n, OneFlow tensor is\n{flow_tensor} " @@ -971,13 +1057,7 @@ def check_tensor_equality( if not np.allclose( torch_grad, flow_grad, rtol=rtol, atol=atol, equal_nan=True, ): - print_note_fake_program() - print("---------Grad Shape--------") - print(torch_grad.shape) - print(flow_grad.shape) - print( - f"Grads are not equal. 
PyTorch grad: \n{torch_grad}\n, OneFlow grad: \n{flow_grad}" - ) + print_note_fake_program(detail=True) return False torch_numpy = torch_tensor.detach().cpu().numpy() oneflow_numpy = flow_tensor.numpy() @@ -989,7 +1069,7 @@ def check_tensor_equality( equality_res = equality_res and (torch_numpy.dtype == oneflow_numpy.dtype) if equality_res == False: - print_note_fake_program() + print_note_fake_program(detail=True) print("---------Tensor Shape--------") print(torch_tensor.shape) print(flow_tensor.shape) @@ -1022,7 +1102,7 @@ def check_basetype_equality(a, b, rtol=0.0001, atol=1e-05, check_dtype=False): if check_dtype: equality_res = equality_res and (torch_np.dtype == flow_np.dtype) if equality_res == False: - print_note_fake_program() + print_note_fake_program(detail=True) print("---------Tensor Shape--------") print(a[i].shape) print(b[i].shape) @@ -1125,6 +1205,7 @@ def new_f(test_case, *args, **kwargs): dtype=x.pytorch.dtype, device=x.pytorch.device, ) + call_tensor_id.append(id(pytorch_tensor)) diff_output = GetDualObject( "unused", pytorch_tensor, flow_tensor ) @@ -1155,6 +1236,13 @@ def new_f(test_case, *args, **kwargs): ) call_tensor_id.append(id(getattr(x.pytorch, key).grad)) + for x in dual_objects_to_test: + if ( + isinstance(x.pytorch, torch_original.Tensor) + and id(x.pytorch) not in call_tensor_id + ): + vis_tensor.append(x.pytorch) + # check eager for x in dual_objects_to_test: if check_allclose: diff --git a/python/oneflow/utils/data/dataloader.py b/python/oneflow/utils/data/dataloader.py index a78a0e83273..33e4aa24de5 100644 --- a/python/oneflow/utils/data/dataloader.py +++ b/python/oneflow/utils/data/dataloader.py @@ -171,9 +171,12 @@ class DataLoader(Generic[T_co]): prefetch_factor (int, optional, keyword-only arg): Number of samples loaded in advance by each worker. ``2`` means there will be a total of 2 * num_workers samples prefetched across all workers. (default: ``2``) - persistent_workers (bool, optional): If ``True``, the data loader will not shutdown - the worker processes after a dataset has been consumed once. This allows to - maintain the workers `Dataset` instances alive. (default: ``False``) + persistent_workers (bool, optional): If ``True``, the data loader will immediately + initialize worker processes and will not shut them down after a dataset has been + consumed once. This allows the workers' `Dataset` instances to stay alive. + If you are using oneflow with RDMA support in distributed training, + ``persistent_workers`` must be ``True``; otherwise you will encounter a + segmentation fault. (default: ``False``) .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn` @@ -363,7 +366,7 @@ def __init__( None # See NOTE [ IterableDataset and __len__ ] ) - self._iterator = None + self._iterator = self._get_iterator() if self.persistent_workers else None def _get_iterator(self) -> "_BaseDataLoaderIter": if self.num_workers == 0: @@ -918,6 +921,12 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter): def __init__(self, loader): super(_MultiProcessingDataLoaderIter, self).__init__(loader) + assert not flow.env.rdma_is_initialized(), ( + "RDMA is initialized! Could not create _MultiProcessingDataLoaderIter any more. " + "Please make sure Dataloader is created before invoking oneflow.env.init_rdma(). " + "If this condition is met, you can pass the arg persistent_workers=True in " + "Dataloader to avoid this error!"
+ ) assert self._num_workers > 0 assert self._prefetch_factor > 0 From 900dcc8a6f11c47635bd8f83cae1b583e43f106e Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 23 Jun 2022 19:50:02 +0800 Subject: [PATCH 12/45] Ask general basic communication before middle nodes --- .../core/auto_parallel/boxing_collector.cpp | 25 +++++++++++-------- .../job_rewriter/boxing_with_middle_nodes.cpp | 11 ++++---- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index ec7456e060e..21aea76987d 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -548,19 +548,10 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const } } - // Middle nodes algorithm supports transfer for different machines or devices or hierarchies - if (producer_parallel_desc != consumer_parallel_desc) { - JUST(AskSbpCombination4DiffPlacement(sbp_producer, sbp_consumer, logical_blob_desc, - producer_parallel_desc, consumer_parallel_desc, - is_customized, middle_sbps, diag_node_pos, compute_cost)); - +#ifdef WITH_CUDA + if (producer_parallel_desc == consumer_parallel_desc && sbp_producer == sbp_consumer) { return Maybe::Ok(); } - // Transfer for the same machines, devices and hierarchy. - if (sbp_producer == sbp_consumer) { return Maybe::Ok(); } - const auto& parallel_hierarchy = producer_parallel_desc.hierarchy(); - -#ifdef WITH_CUDA // Use a general basic communication if no P in the consumer if ((!NdSbpHasPartialParallel(sbp_consumer))) { if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { @@ -574,6 +565,18 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const } #endif // WITH_CUDA + // Middle nodes algorithm supports transfer for different machines or devices or hierarchies + if (producer_parallel_desc != consumer_parallel_desc) { + JUST(AskSbpCombination4DiffPlacement(sbp_producer, sbp_consumer, logical_blob_desc, + producer_parallel_desc, consumer_parallel_desc, + is_customized, middle_sbps, diag_node_pos, compute_cost)); + + return Maybe::Ok(); + } + // Transfer for the same machines, devices and hierarchy. + if (sbp_producer == sbp_consumer) { return Maybe::Ok(); } + const auto& parallel_hierarchy = producer_parallel_desc.hierarchy(); + *diag_node_pos = 0; // Dealing with nD sbp, n>2 if (parallel_hierarchy->NumAxes() > 2) { diff --git a/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp b/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp index d70068c9941..459d87f2ef6 100644 --- a/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp +++ b/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp @@ -61,13 +61,14 @@ Maybe BoxingWithMiddleNodes(const OpGraph& op_graph, JobBuilder* job_build if (ParseBooleanFromEnv("ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK", false)) { return Maybe::Ok(); } - if (!NeedBoxingCollector(op_graph)) { return Maybe::Ok(); } // Initialize boxing collector BoxingCollector boxing_collector; - // We assemble the boxing table from S(0) to S(5). - // Those splitting in higher axes are considered in the customized boxing. - constexpr int32_t kRegularMaxSplitAxes = 6; - JUST(boxing_collector.Init(kRegularMaxSplitAxes)); + if (NeedBoxingCollector(op_graph)) { + // We assemble the boxing table from S(0) to S(5). + // Those splitting in higher axes are considered in the customized boxing. 
+ constexpr int32_t kRegularMaxSplitAxes = 6; + JUST(boxing_collector.Init(kRegularMaxSplitAxes)); + } std::vector middle_sbps; HashMap op_node2op_conf; // Fill other unsupported combinations From 5eb751037bc882e7a263c76688c4ef10ea5ca671 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 23 Jun 2022 20:00:16 +0800 Subject: [PATCH 13/45] Add a task type for general basic communication --- oneflow/core/graph/straighten_nodes.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/oneflow/core/graph/straighten_nodes.cpp b/oneflow/core/graph/straighten_nodes.cpp index 1e708e19df0..01d0187e583 100644 --- a/oneflow/core/graph/straighten_nodes.cpp +++ b/oneflow/core/graph/straighten_nodes.cpp @@ -104,6 +104,7 @@ bool IsTransferNode(TaskType task_type) { switch (task_type) { // We mark the number of occurrences in bert case TaskType::kCollectiveBoxingGeneric: // 76 + case TaskType::kNcclSendRecvBoxing: // ? case TaskType::kCopyHd: // 27 case TaskType::kSliceBoxing: // 16 case TaskType::kCopyCommNet: // 12 From 8f39d6de8cdcded6cec53e8cfb3053b4400a7b06 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 23 Jun 2022 23:26:35 +0800 Subject: [PATCH 14/45] Fix a bug --- oneflow/core/auto_parallel/boxing_collector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 21aea76987d..1688ea01d7e 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -1077,7 +1077,7 @@ Maybe BoxingCollector::AskCloseAllSplitSbp(const NdSbp& nd_sbp, int32_t min_split_num = 4096; // We need to pick a suitable axis for (int32_t i = 0; i < remain_shape.NumAxes(); i++) { - if (remain_shape.At(i) % split_num > 0) { + if (remain_shape.At(i) % split_num == 0) { if (rest_split_shape.At(i) < min_split_num) { // Pick the axis with smallest splitting number among the rest of the sbp min_split_num = rest_split_shape.At(i); From 0c9fd15338af187d71f65e2bc9a1ed607acba0e1 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 24 Jun 2022 18:39:15 +0800 Subject: [PATCH 15/45] Fix a bug --- oneflow/core/auto_parallel/boxing_collector.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 1688ea01d7e..a0a026b605b 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -544,8 +544,9 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const } middle_sbps.emplace_back(broadcast_nd); } - return Maybe::Ok(); } + // No middle nodes for another 1d-sbp combinations + return Maybe::Ok(); } #ifdef WITH_CUDA @@ -559,6 +560,9 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const JUST(AskSbpCombination4GeneralBasicCommunication( sbp_producer, sbp_consumer, logical_blob_desc, producer_parallel_desc, consumer_parallel_desc, middle_sbps, diag_node_pos)); + if (GlobalProcessCtx::Rank() == 0) { + std::cout << "Middle size for gbc: " << middle_sbps.size() << std::endl; + } } // Otherwise, one-step transfer return Maybe::Ok(); From 0c95d76d32250080dea1baa725b72e8344da6735 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Sat, 25 Jun 2022 00:29:39 +0800 Subject: [PATCH 16/45] Fix the bug of transfer from 1d sbp to 2d sbp --- oneflow/core/framework/sbp_infer_util.cpp | 204 +++++++++++++--------- 1 file changed, 122 insertions(+), 82 deletions(-) diff --git 
a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 7b1d1129e93..a6d169e93e7 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -526,8 +526,9 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, producer_parallel_desc, consumer_parallel_desc) - * logical_blob_desc.shape().elem_cnt() - * GetSizeOfDataType(logical_blob_desc.data_type()); + * logical_blob_desc.shape().elem_cnt() + * GetSizeOfDataType(logical_blob_desc.data_type()) + + GetTransferCost(); } #endif // WITH_CUDA @@ -661,20 +662,32 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, bool requires_same_sbp) { + // Reduce before cost computation + ParallelDesc reduced_in_parallel_desc = producer_parallel_desc; + NdSbp reduced_in_nd_sbp; + NdSbpDimReduce(producer_parallel_desc, producer_sbp_parallel, &reduced_in_parallel_desc, + &reduced_in_nd_sbp); + + ParallelDesc reduced_out_parallel_desc = consumer_parallel_desc; + NdSbp reduced_out_nd_sbp; + NdSbpDimReduce(consumer_parallel_desc, consumer_sbp_parallel, &reduced_out_parallel_desc, + &reduced_out_nd_sbp); // In 90% of the transfer, we would have the same parallel description for producer and consumer // We need to speed it up and give an approximation of the cost - if (producer_parallel_desc == consumer_parallel_desc) { - if (producer_sbp_parallel == consumer_sbp_parallel) { return 0.0; } + if (reduced_in_parallel_desc == reduced_out_parallel_desc + && reduced_in_nd_sbp == reduced_out_nd_sbp) { + return 0.0; + } #ifdef WITH_CUDA - // Use a general basic communication if no P in the consumer - if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { - return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, - producer_parallel_desc, consumer_parallel_desc) - * logical_blob_desc.shape().elem_cnt() - * GetSizeOfDataType(logical_blob_desc.data_type()); - } -#endif // WITH_CUDA + // Use a general basic communication if no P in the consumer + if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { + return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, + producer_parallel_desc, consumer_parallel_desc) + * logical_blob_desc.shape().elem_cnt() + * GetSizeOfDataType(logical_blob_desc.data_type()) + + GetTransferCost(); } +#endif // WITH_CUDA // Initialize boxing collector constexpr int32_t kRegularMaxSplitAxes = 6; @@ -757,6 +770,8 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, // The transfer ratio for general basic communication // Cost = ratio * data amount +// When we get the this function, either producer_sbp_parallel != consumer_sbp_parallel +// or producer_parallel_desc != consumer_parallel_desc double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, const NdSbp& consumer_sbp_parallel, const ParallelDesc& producer_parallel_desc, @@ -766,92 +781,117 @@ double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, PartialRatio4Producer(producer_sbp_parallel, producer_parallel_desc); int32_t consumer_broadcast_ratio = BroadcastRatio4Consumer(consumer_sbp_parallel, consumer_parallel_desc); + // More intersection on the same devices + bool on_same_devices = 
producer_parallel_desc.EqualsIgnoringHierarchy(consumer_parallel_desc); // approximate intersection ratio double intersection_ratio = 1.0; // (?, P, ?)->(Si, Sj)->(?, B, ?), two-step transfer if (producer_partial_ratio > 1 && consumer_broadcast_ratio > 1) { - // Pure P in the producer or B in the consumer - // (P, P, P) -> ? or ? -> (B, B) - if (producer_partial_ratio == producer_parallel_desc.parallel_num() - || consumer_broadcast_ratio == consumer_parallel_desc.parallel_num()) { - // There some cases which is not applicable to this ratio - // We just take the one with the largest possibility - // For example: (P, S0) -> (B, B) for 1-D blob with machine hierarchy [n, m] - // The path should be (P, S0) -> (S0, S0) -> (B, B) - // true intersection ratio = 1/m + 1 - intersection_ratio = 2.0; - } else { - // sbp_consumer = (B, Si) or (Si, B) - for (int32_t sbp_id = 0; sbp_id < consumer_sbp_parallel.sbp_parallel_size(); sbp_id++) { - if (consumer_sbp_parallel.sbp_parallel(sbp_id).has_split_parallel()) { - const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel(sbp_id); - // (B, P) or (Si, P) -> (Si, B) - // (P, B) or (P, Si) -> (B, Si) - if (producer_sbp4sbp_id.has_broadcast_parallel() - || producer_sbp4sbp_id == consumer_sbp_parallel.sbp_parallel(sbp_id)) { - intersection_ratio = 2.0; - break; + if (on_same_devices) { + // Pure P in the producer or B in the consumer + // (P, P, P) -> ? or ? -> (B, B) + if (producer_partial_ratio == producer_parallel_desc.parallel_num() + || consumer_broadcast_ratio == consumer_parallel_desc.parallel_num()) { + // There some cases which is not applicable to this ratio + // We just take the one with the largest possibility + // For example: (P, S0) -> (B, B) for 1-D blob with machine hierarchy [n, m] + // The path should be (P, S0) -> (S0, S0) -> (B, B) + // true intersection ratio = 1/m + 1 + intersection_ratio = 2.0; + } else { + // sbp_consumer = (B, Si) or (Si, B) + for (int32_t sbp_id = 0; sbp_id < std::min(producer_sbp_parallel.sbp_parallel_size(), + consumer_sbp_parallel.sbp_parallel_size()); + sbp_id++) { + if (consumer_sbp_parallel.sbp_parallel(sbp_id).has_split_parallel()) { + const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel(sbp_id); + // (B, P) or (Si, P) -> (Si, B) + // (P, B) or (P, Si) -> (B, Si) + if (producer_sbp4sbp_id.has_broadcast_parallel() + || producer_sbp4sbp_id == consumer_sbp_parallel.sbp_parallel(sbp_id)) { + intersection_ratio = 2.0; + break; + } } } - } - // Judge whether the intersection ratio is given a value (2.0) - if (intersection_ratio == 1.0) { - // The true intersection ratio range from 0 to 2, - // we just take a middle point of the range as the approximation - // For example: (P, S0) -> (S0, B), Path: (P, S0) -> (S1, S0) -> (S0, B) - // true intersection ratio = 1 + 1/m - // For example: (P, S0) -> (S1, B), Path: (P, S0) -> (S1, S0) -> (S1, B) - // true intersection ratio = 1 + 1 - // For example: (P, S0) -> (B, S0), with a 1D blob - // true intersection ratio = (n+p-1)/nm + (n+p-1)/nm - // For example: (S0, P) -> (B, S0), Path: (S0, P) -> (S0, S1) -> (B, S0) - // true intersection ratio = 1 + 1/n - - // We use the approximation 1 + (1/n + 1/m)/2 - intersection_ratio = 1.0 + 0.5 / producer_parallel_desc.hierarchy()->At(0) - + 0.5 / producer_parallel_desc.hierarchy()->At(1); + // Judge whether the intersection ratio is given a value (2.0) + if (intersection_ratio == 1.0) { + // The true intersection ratio range from 0 to 2, + // we just take a middle point of the range as the approximation + 
// For example: (P, S0) -> (S0, B), Path: (P, S0) -> (S1, S0) -> (S0, B) + // true intersection ratio = 1 + 1/m + // For example: (P, S0) -> (S1, B), Path: (P, S0) -> (S1, S0) -> (S1, B) + // true intersection ratio = 1 + 1 + // For example: (P, S0) -> (B, S0), with a 1D blob + // true intersection ratio = (n+p-1)/nm + (n+p-1)/nm + // For example: (S0, P) -> (B, S0), Path: (S0, P) -> (S0, S1) -> (B, S0) + // true intersection ratio = 1 + 1/n + + // We use the approximation 1 + (1/n + 1/m)/2 + intersection_ratio = 1.0 + 0.5 / producer_parallel_desc.hierarchy()->At(0) + + 0.5 / producer_parallel_desc.hierarchy()->At(1); + } } } + // Otherwise, on different devices + // intersection_ratio = 1.0; } else { // No P in the producer or no B in the consumer, one-step transfer - // The intersection ratio is design for two steps. - // However, we only have one step here, we would increase the ratio by 1.0 - // to eliminate the unused step - const auto& parallel_hierarchy = producer_parallel_desc.hierarchy(); - for (int32_t sbp_id = 0; sbp_id < consumer_sbp_parallel.sbp_parallel_size(); sbp_id++) { - const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel(sbp_id); - const auto& consumer_sbp4sbp_id = consumer_sbp_parallel.sbp_parallel(sbp_id); - // ? -> Si - if (consumer_sbp4sbp_id.has_split_parallel()) { - // Sj -> Si - if (producer_sbp4sbp_id.has_split_parallel() - && producer_sbp4sbp_id != consumer_sbp4sbp_id) { - intersection_ratio /= parallel_hierarchy->At(sbp_id); - } + if (on_same_devices) { + // We only deal with 1D and 2D sbp at this moment + // For higher dimension, we should use simulation. + std::shared_ptr parallel_hierarchy; + if (producer_sbp_parallel.sbp_parallel_size() < consumer_sbp_parallel.sbp_parallel_size()) { + parallel_hierarchy = consumer_parallel_desc.hierarchy(); } else { - // B/P -> B - if (!producer_sbp4sbp_id.has_split_parallel()) { - intersection_ratio *= parallel_hierarchy->At(sbp_id); + parallel_hierarchy = producer_parallel_desc.hierarchy(); + } + int32_t max_sbp_size = std::max(producer_sbp_parallel.sbp_parallel_size(), + consumer_sbp_parallel.sbp_parallel_size()); + + for (int32_t sbp_id = 0; sbp_id < max_sbp_size; sbp_id++) { + const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel( + std::min(sbp_id, producer_sbp_parallel.sbp_parallel_size())); + const auto& consumer_sbp4sbp_id = consumer_sbp_parallel.sbp_parallel( + std::min(sbp_id, consumer_sbp_parallel.sbp_parallel_size())); + // ? 
-> Si + if (consumer_sbp4sbp_id.has_split_parallel()) { + // Sj -> Si + if (producer_sbp4sbp_id.has_split_parallel() + && producer_sbp4sbp_id != consumer_sbp4sbp_id) { + intersection_ratio /= parallel_hierarchy->At(sbp_id); + } + } else { + // B/P -> B + if (!producer_sbp4sbp_id.has_split_parallel()) { + intersection_ratio *= parallel_hierarchy->At(sbp_id); + } } + // For B/P/Si -> Si and Si -> B + // intersection ratio remains the same } - // For B/P/Si -> Si and Si -> B - // intersection ratio remains the same - } - // With the approximation above, - // (S1, S0) -> (S0, S0) would have an approximate intersection ratio 1/n - // (B, S0) -> (S0, S0) would have an approximate intersection ratio 1 - // However, their actual intersection ratios are (n+p-1)/(n^2*m) and (n+p-1)/(nm), respectively - // We add a patch for this approximation, making them 1/nm and 1/m respectively - if (producer_sbp_parallel.sbp_parallel(0) != consumer_sbp_parallel.sbp_parallel(0) - && producer_sbp_parallel.sbp_parallel_size() >= 2) { - const auto& producer_sbp_parallel_1 = producer_sbp_parallel.sbp_parallel(1); - if (producer_sbp_parallel_1 == consumer_sbp_parallel.sbp_parallel(1) - && producer_sbp_parallel_1.has_split_parallel() - && (producer_sbp_parallel_1 == producer_sbp_parallel.sbp_parallel(0) - || producer_sbp_parallel_1 == consumer_sbp_parallel.sbp_parallel(0))) { + // With the approximation above, + // (S1, S0) -> (S0, S0) would have an approximate intersection ratio 1/n + // (B, S0) -> (S0, S0) would have an approximate intersection ratio 1 + // However, their actual intersection ratios are (n+p-1)/(n^2*m) and (n+p-1)/(nm), + // respectively We add a patch for this approximation, making them 1/nm and 1/m respectively + if (producer_sbp_parallel.sbp_parallel(0) != consumer_sbp_parallel.sbp_parallel(0) + && max_sbp_size >= 2 + && (NdSbpAllSameSplitParallel(producer_sbp_parallel) + || NdSbpAllSameSplitParallel(consumer_sbp_parallel)) + && (producer_sbp_parallel.sbp_parallel( + std::min(1, producer_sbp_parallel.sbp_parallel_size())) + == consumer_sbp_parallel.sbp_parallel( + std::min(1, consumer_sbp_parallel.sbp_parallel_size())))) { intersection_ratio /= parallel_hierarchy->At(1); } + // The intersection ratio is design for two steps. + // However, we only have one step here, we would increase the ratio by 1.0 + // to eliminate the unused step + intersection_ratio += 1.0; } + // Otherwise, on different devices + // intersection_ratio = 1.0; } // Subtract the intersection part return producer_partial_ratio + consumer_broadcast_ratio - intersection_ratio; From d4cf04c60b9ad80fb02cdc71eb149589c4d8d610 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Mon, 27 Jun 2022 22:22:41 +0800 Subject: [PATCH 17/45] Use the intersection to approximate the ratio --- oneflow/core/framework/sbp_infer_util.cpp | 79 ++++++----------------- oneflow/core/framework/sbp_infer_util.h | 1 + 2 files changed, 22 insertions(+), 58 deletions(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index a6d169e93e7..fb5fcc6f60e 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -17,6 +17,7 @@ limitations under the License. 
#include "oneflow/core/framework/sbp_infer_util.h" #include "oneflow/core/auto_parallel/boxing_collector.h" #include "oneflow/core/boxing/eager_boxing_interpreter_mgr.h" +#include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/common/util.h" #include "oneflow/core/job/lazy_mode.h" #include "oneflow/core/job/nd_sbp_util.h" @@ -518,16 +519,12 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel logical_blob_desc, reduced_in_parallel_desc, reduced_out_parallel_desc)); } - double logical_blob_size = - logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); - #ifdef WITH_CUDA // Use a general basic communication if no P in the consumer if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, - producer_parallel_desc, consumer_parallel_desc) - * logical_blob_desc.shape().elem_cnt() - * GetSizeOfDataType(logical_blob_desc.data_type()) + logical_blob_desc, producer_parallel_desc, + consumer_parallel_desc) + GetTransferCost(); } #endif // WITH_CUDA @@ -537,6 +534,8 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel bool on_same_devices = reduced_in_parallel_desc.EqualsIgnoringHierarchy(reduced_out_parallel_desc); + double logical_blob_size = + logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); if (in_dim == 2 && out_dim == 2) { // Not supporting different hierarchy @@ -682,9 +681,8 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, // Use a general basic communication if no P in the consumer if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, - producer_parallel_desc, consumer_parallel_desc) - * logical_blob_desc.shape().elem_cnt() - * GetSizeOfDataType(logical_blob_desc.data_type()) + logical_blob_desc, producer_parallel_desc, + consumer_parallel_desc) + GetTransferCost(); } #endif // WITH_CUDA @@ -774,6 +772,7 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, // or producer_parallel_desc != consumer_parallel_desc double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, const NdSbp& consumer_sbp_parallel, + const BlobDesc& logical_blob_desc, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc) { // The upper bound of the amount of the transferred data @@ -838,63 +837,27 @@ double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, } else { // No P in the producer or no B in the consumer, one-step transfer if (on_same_devices) { - // We only deal with 1D and 2D sbp at this moment - // For higher dimension, we should use simulation. - std::shared_ptr parallel_hierarchy; - if (producer_sbp_parallel.sbp_parallel_size() < consumer_sbp_parallel.sbp_parallel_size()) { - parallel_hierarchy = consumer_parallel_desc.hierarchy(); - } else { - parallel_hierarchy = producer_parallel_desc.hierarchy(); - } - int32_t max_sbp_size = std::max(producer_sbp_parallel.sbp_parallel_size(), - consumer_sbp_parallel.sbp_parallel_size()); - - for (int32_t sbp_id = 0; sbp_id < max_sbp_size; sbp_id++) { - const auto& producer_sbp4sbp_id = producer_sbp_parallel.sbp_parallel( - std::min(sbp_id, producer_sbp_parallel.sbp_parallel_size())); - const auto& consumer_sbp4sbp_id = consumer_sbp_parallel.sbp_parallel( - std::min(sbp_id, consumer_sbp_parallel.sbp_parallel_size())); - // ? 
-> Si - if (consumer_sbp4sbp_id.has_split_parallel()) { - // Sj -> Si - if (producer_sbp4sbp_id.has_split_parallel() - && producer_sbp4sbp_id != consumer_sbp4sbp_id) { - intersection_ratio /= parallel_hierarchy->At(sbp_id); - } - } else { - // B/P -> B - if (!producer_sbp4sbp_id.has_split_parallel()) { - intersection_ratio *= parallel_hierarchy->At(sbp_id); - } - } - // For B/P/Si -> Si and Si -> B - // intersection ratio remains the same - } - // With the approximation above, - // (S1, S0) -> (S0, S0) would have an approximate intersection ratio 1/n - // (B, S0) -> (S0, S0) would have an approximate intersection ratio 1 - // However, their actual intersection ratios are (n+p-1)/(n^2*m) and (n+p-1)/(nm), - // respectively We add a patch for this approximation, making them 1/nm and 1/m respectively - if (producer_sbp_parallel.sbp_parallel(0) != consumer_sbp_parallel.sbp_parallel(0) - && max_sbp_size >= 2 - && (NdSbpAllSameSplitParallel(producer_sbp_parallel) - || NdSbpAllSameSplitParallel(consumer_sbp_parallel)) - && (producer_sbp_parallel.sbp_parallel( - std::min(1, producer_sbp_parallel.sbp_parallel_size())) - == consumer_sbp_parallel.sbp_parallel( - std::min(1, consumer_sbp_parallel.sbp_parallel_size())))) { - intersection_ratio /= parallel_hierarchy->At(1); - } + // We use simulation for nD sbp with n=1,2,3,... + TensorSliceView in_second_slice = + GetTensorSliceView4ParallelId(*producer_parallel_desc.hierarchy(), producer_sbp_parallel, + logical_blob_desc.shape(), /*parallel_id=*/1); + TensorSliceView out_second_slice = + GetTensorSliceView4ParallelId(*consumer_parallel_desc.hierarchy(), consumer_sbp_parallel, + logical_blob_desc.shape(), /*parallel_id=*/1); + const TensorSliceView& intersection = in_second_slice.Intersect(out_second_slice); // The intersection ratio is design for two steps. 
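    // (the two steps being the path (?, P, ?) -> (Si, Sj) -> (?, B, ?): one transfer to deal with
    // P on the producer side and one to form B on the consumer side)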
// However, we only have one step here, we would increase the ratio by 1.0 // to eliminate the unused step - intersection_ratio += 1.0; + intersection_ratio += std::min( + 1.0, (double)(intersection.shape().elem_cnt() * producer_parallel_desc.parallel_num()) + / logical_blob_desc.shape().elem_cnt()); } // Otherwise, on different devices // intersection_ratio = 1.0; } // Subtract the intersection part - return producer_partial_ratio + consumer_broadcast_ratio - intersection_ratio; + return (producer_partial_ratio + consumer_broadcast_ratio - intersection_ratio) + * logical_blob_desc.shape().elem_cnt() * GetSizeOfDataType(logical_blob_desc.data_type()); } } // namespace oneflow diff --git a/oneflow/core/framework/sbp_infer_util.h b/oneflow/core/framework/sbp_infer_util.h index 42c01e29e57..91aa0f7ff52 100644 --- a/oneflow/core/framework/sbp_infer_util.h +++ b/oneflow/core/framework/sbp_infer_util.h @@ -100,6 +100,7 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, // Cost = ratio * data amount double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, const NdSbp& consumer_sbp_parallel, + const BlobDesc& logical_blob_desc, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc); From e843e7e038ebf040b58be731cf892f35e654362a Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Tue, 28 Jun 2022 00:47:18 +0800 Subject: [PATCH 18/45] Use a suitable virtual blob description --- oneflow/core/auto_parallel/boxing_collector.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index a0a026b605b..3adc8036b37 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -200,10 +200,13 @@ Maybe BoxingCollector::GenerateCombination4SamePlacement(int32_t max_middl // NOTE: The performance of this function are all the same with different hierarchy int32_t world_size = GlobalProcessCtx::WorldSize(); Shape hierarchy44({4 * world_size, 4 * world_size}); + int32_t virtual_range_size = hierarchy44.elem_cnt(); std::shared_ptr virtual_hierarchy = std::make_shared(hierarchy44); auto parallel_desc = JUST(ParallelDesc::New( "cpu", {"0:0-" + std::to_string(hierarchy44.elem_cnt() - 1)}, virtual_hierarchy)); - BlobDesc blob_desc({16, 16, 16, 16}, DataType::kInt8, /*is_dynamic=*/false); + BlobDesc blob_desc({virtual_range_size, virtual_range_size, virtual_range_size, + virtual_range_size, virtual_range_size, virtual_range_size}, + DataType::kInt8, /*is_dynamic=*/false); JUST(GenerateCombination4SamePlacement(max_middle_node_num, blob_desc, *parallel_desc)); return Maybe::Ok(); } @@ -319,7 +322,10 @@ Maybe BoxingCollector::GenerateCombination4DiffPlacement( BoxingCollector* boxing_collector_producer, BoxingCollector* boxing_collector_consumer) { // Virtual parallel and blob description int32_t world_size = GlobalProcessCtx::WorldSize(); - BlobDesc blob_desc({16, 16, 16, 16}, DataType::kInt8, /*is_dynamic=*/false); + int32_t virtual_range_size = 4 * world_size * (4 * world_size + 1); + BlobDesc blob_desc({virtual_range_size, virtual_range_size, virtual_range_size, + virtual_range_size, virtual_range_size, virtual_range_size}, + DataType::kInt8, /*is_dynamic=*/false); // Virtual placements before transfer Shape in_hierarchy44({4 * world_size + 1, 4 * world_size}); std::shared_ptr in_hierarchy = std::make_shared(in_hierarchy44); From 7e7724a2b5c177e8fd92ef46404d9cdc5e39b12f Mon Sep 
17 00:00:00 2001 From: Yipeng Li Date: Tue, 28 Jun 2022 20:18:29 +0800 Subject: [PATCH 19/45] Remove the checking for balanced splitting --- oneflow/core/operator/nccl_send_recv_boxing_op.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp index 9e1481fa7aa..16f75fce813 100644 --- a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp +++ b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp @@ -109,12 +109,15 @@ Maybe NcclSendRecvBoxingOp::InferOutBlobDescs( const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); const Shape& logical_shape = Shape(conf.logical_shape()); if (conf.has_input()) { - const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); + // const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); const NdSbp& src_nd_sbp = conf.src_nd_sbp(); const ParallelDesc& src_parallel_desc = ParallelDesc(conf.src_parallel_conf()); - std::shared_ptr in_shape = - JUST(GetPhysicalShape(logical_shape, src_nd_sbp, src_parallel_desc, 0)); - CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); + // std::shared_ptr in_shape = + JUST(GetPhysicalShape(logical_shape, src_nd_sbp, src_parallel_desc, 0)); + // We do not check the shape of "in" here since we might have inconsistency + // For example, a blob with shape [4] is transferring from [3]: S0 to [2, 2]: (B, S0) + // *in_shape = [2] and in_blob_desc->shape() = [1] + // CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); } if (conf.has_output()) { BlobDesc* out_blob_desc = GetBlobDesc4BnInOp("out"); From e7d7fe3c1375e8b02289770e3fefc1c8efa814d7 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Wed, 29 Jun 2022 19:13:57 +0800 Subject: [PATCH 20/45] Fix the previous bug, still have another one --- oneflow/core/operator/nccl_send_recv_boxing_op.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp index 16f75fce813..0b255a10579 100644 --- a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp +++ b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp @@ -109,15 +109,12 @@ Maybe NcclSendRecvBoxingOp::InferOutBlobDescs( const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); const Shape& logical_shape = Shape(conf.logical_shape()); if (conf.has_input()) { - // const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); + const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); const NdSbp& src_nd_sbp = conf.src_nd_sbp(); const ParallelDesc& src_parallel_desc = ParallelDesc(conf.src_parallel_conf()); - // std::shared_ptr in_shape = - JUST(GetPhysicalShape(logical_shape, src_nd_sbp, src_parallel_desc, 0)); - // We do not check the shape of "in" here since we might have inconsistency - // For example, a blob with shape [4] is transferring from [3]: S0 to [2, 2]: (B, S0) - // *in_shape = [2] and in_blob_desc->shape() = [1] - // CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); + std::shared_ptr in_shape = JUST(GetPhysicalShape( + logical_shape, src_nd_sbp, src_parallel_desc, parallel_ctx->parallel_id())); + CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); } if (conf.has_output()) { BlobDesc* out_blob_desc = GetBlobDesc4BnInOp("out"); From afee00a94d45eecc15e8642eeeecdb55307e35b4 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Wed, 29 Jun 2022 19:48:25 +0800 Subject: [PATCH 21/45] Fix another bug --- 
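Note: the "another bug" fixed below is the output-side counterpart of the previous patch — the physical shape of the "out" blob is now derived from this rank's own parallel_ctx->parallel_id() instead of parallel id 0, so each rank infers its own slice shape.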
oneflow/core/operator/nccl_send_recv_boxing_op.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp index 0b255a10579..a2d8d3d02ec 100644 --- a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp +++ b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp @@ -120,8 +120,8 @@ Maybe NcclSendRecvBoxingOp::InferOutBlobDescs( BlobDesc* out_blob_desc = GetBlobDesc4BnInOp("out"); const NdSbp& dst_nd_sbp = conf.dst_nd_sbp(); const ParallelDesc& dst_parallel_desc = ParallelDesc(conf.dst_parallel_conf()); - std::shared_ptr out_shape = - JUST(GetPhysicalShape(logical_shape, dst_nd_sbp, dst_parallel_desc, 0)); + std::shared_ptr out_shape = JUST(GetPhysicalShape( + logical_shape, dst_nd_sbp, dst_parallel_desc, parallel_ctx->parallel_id())); out_blob_desc->mut_shape() = *out_shape; out_blob_desc->set_data_type(conf.data_type()); } From 3b6baadf534fbb85ea8a3f13aabc6fbde6ef30e5 Mon Sep 17 00:00:00 2001 From: guo ran <360112263@qq.com> Date: Thu, 30 Jun 2022 11:23:21 +0800 Subject: [PATCH 22/45] Update oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp nccl_send_recv use different stream --- .../graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index 7205cf2217f..7a6ebc9efce 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -132,7 +132,7 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB const int64_t machine_id = JUST(merged_parallel_desc.MachineId4ParallelId(id)); int64_t device_index = JUST(merged_parallel_desc.DeviceId4ParallelId(id)); int64_t thrd_id = EncodeStreamIdToInt64(GenerateNamedTaskStreamId( - machine_id, merged_parallel_desc.device_type(), device_index, "NCCL_SEND_RECV_BOXING")); + machine_id, merged_parallel_desc.device_type(), device_index, "NCCL_SEND_RECV_BOXING" + NewUniqueId())); bool has_input = in_parallel_desc.Containing(machine_id, device_index); bool has_output = out_parallel_desc.Containing(machine_id, device_index); node->Init(machine_id, thrd_id, lbi, logical_blob_desc.shape(), From 90b0a5df81c9f83ba9152fcdae4de16e1e1683ee Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 30 Jun 2022 16:44:39 +0800 Subject: [PATCH 23/45] Use machine 4-7 for hierarchy [2, 2] in the consumer --- ...hierarchical_sub_task_graph_builder_impl.cpp | 5 +++-- .../core/operator/nccl_send_recv_boxing_op.cpp | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index 7a6ebc9efce..a3a936ddbe3 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -131,8 +131,9 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB NcclSendRecvBoxingTaskNode* node = ctx->task_graph()->NewNode(); const int64_t machine_id = JUST(merged_parallel_desc.MachineId4ParallelId(id)); int64_t device_index = JUST(merged_parallel_desc.DeviceId4ParallelId(id)); - int64_t thrd_id = EncodeStreamIdToInt64(GenerateNamedTaskStreamId( - 
machine_id, merged_parallel_desc.device_type(), device_index, "NCCL_SEND_RECV_BOXING" + NewUniqueId())); + int64_t thrd_id = EncodeStreamIdToInt64( + GenerateNamedTaskStreamId(machine_id, merged_parallel_desc.device_type(), device_index, + "NCCL_SEND_RECV_BOXING" + NewUniqueId())); bool has_input = in_parallel_desc.Containing(machine_id, device_index); bool has_output = out_parallel_desc.Containing(machine_id, device_index); node->Init(machine_id, thrd_id, lbi, logical_blob_desc.shape(), diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp index a2d8d3d02ec..2fdf31101d9 100644 --- a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp +++ b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp @@ -108,20 +108,29 @@ Maybe NcclSendRecvBoxingOp::InferOutBlobDescs( const ParallelContext* parallel_ctx) const { const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); const Shape& logical_shape = Shape(conf.logical_shape()); + const ParallelDesc& parallel_desc = ParallelDesc(conf.parallel_conf()); + const int64_t machine_id = + CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_ctx->parallel_id())); + const int64_t device_index = + CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_ctx->parallel_id())); if (conf.has_input()) { const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); const NdSbp& src_nd_sbp = conf.src_nd_sbp(); const ParallelDesc& src_parallel_desc = ParallelDesc(conf.src_parallel_conf()); - std::shared_ptr in_shape = JUST(GetPhysicalShape( - logical_shape, src_nd_sbp, src_parallel_desc, parallel_ctx->parallel_id())); + int64_t src_parallel_id = + CHECK_JUST(src_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + std::shared_ptr in_shape = + JUST(GetPhysicalShape(logical_shape, src_nd_sbp, src_parallel_desc, src_parallel_id)); CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); } if (conf.has_output()) { BlobDesc* out_blob_desc = GetBlobDesc4BnInOp("out"); const NdSbp& dst_nd_sbp = conf.dst_nd_sbp(); const ParallelDesc& dst_parallel_desc = ParallelDesc(conf.dst_parallel_conf()); - std::shared_ptr out_shape = JUST(GetPhysicalShape( - logical_shape, dst_nd_sbp, dst_parallel_desc, parallel_ctx->parallel_id())); + int64_t dst_parallel_id = + CHECK_JUST(dst_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + std::shared_ptr out_shape = + JUST(GetPhysicalShape(logical_shape, dst_nd_sbp, dst_parallel_desc, dst_parallel_id)); out_blob_desc->mut_shape() = *out_shape; out_blob_desc->set_data_type(conf.data_type()); } From bcede72ebd2d8c129c75d028456fdadb38d73c93 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 30 Jun 2022 18:12:32 +0800 Subject: [PATCH 24/45] Add a switch for general basic communication --- oneflow/core/auto_parallel/boxing_collector.cpp | 5 ++++- oneflow/core/framework/sbp_infer_util.cpp | 12 ++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 3adc8036b37..6f2723c3046 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -559,8 +559,11 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const if (producer_parallel_desc == consumer_parallel_desc && sbp_producer == sbp_consumer) { return Maybe::Ok(); } + static const bool enable_general_basic_communication = + Global::Get()->nccl_use_compute_stream() + || 
ParseBooleanFromEnv("Enable_General_Basic_Communication", false); // Use a general basic communication if no P in the consumer - if ((!NdSbpHasPartialParallel(sbp_consumer))) { + if (enable_general_basic_communication && (!NdSbpHasPartialParallel(sbp_consumer))) { if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { // (?, P, ?)->(Si, Sj)->(?, B, ?), two-step transfer JUST(AskSbpCombination4GeneralBasicCommunication( diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index fb5fcc6f60e..224590ad348 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -19,9 +19,11 @@ limitations under the License. #include "oneflow/core/boxing/eager_boxing_interpreter_mgr.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/common/util.h" +#include "oneflow/core/job/global_for.h" #include "oneflow/core/job/lazy_mode.h" #include "oneflow/core/job/nd_sbp_util.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/job/resource_desc.h" #include "oneflow/core/job/sbp_parallel.pb.h" namespace oneflow { @@ -520,8 +522,11 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel } #ifdef WITH_CUDA + static const bool enable_general_basic_communication = + Global::Get()->nccl_use_compute_stream() + || ParseBooleanFromEnv("Enable_General_Basic_Communication", false); // Use a general basic communication if no P in the consumer - if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { + if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, logical_blob_desc, producer_parallel_desc, consumer_parallel_desc) @@ -678,8 +683,11 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, return 0.0; } #ifdef WITH_CUDA + static const bool enable_general_basic_communication = + Global::Get()->nccl_use_compute_stream() + || ParseBooleanFromEnv("Enable_General_Basic_Communication", false); // Use a general basic communication if no P in the consumer - if ((!NdSbpHasPartialParallel(consumer_sbp_parallel))) { + if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, logical_blob_desc, producer_parallel_desc, consumer_parallel_desc) From 79c905f88d6dd79467aa2017440f720ce7c414b3 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 30 Jun 2022 11:35:26 +0000 Subject: [PATCH 25/45] Add test script and of format --- .../core/auto_parallel/boxing_collector.cpp | 2 +- oneflow/core/framework/sbp_infer_util.cpp | 4 +- python/oneflow/test/graph/test_gbc2d.py | 90 ++++++++++++++++++ python/oneflow/test/modules/test_gbc1to2d.py | 94 +++++++++++++++++++ python/oneflow/test/modules/test_gbc2to1d.py | 93 ++++++++++++++++++ python/oneflow/test/modules/test_gbc2to2d.py | 91 ++++++++++++++++++ 6 files changed, 371 insertions(+), 3 deletions(-) create mode 100644 python/oneflow/test/graph/test_gbc2d.py create mode 100644 python/oneflow/test/modules/test_gbc1to2d.py create mode 100644 python/oneflow/test/modules/test_gbc2to1d.py create mode 100644 python/oneflow/test/modules/test_gbc2to2d.py diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 6f2723c3046..b1f1cb075ed 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ 
b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -561,7 +561,7 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const } static const bool enable_general_basic_communication = Global::Get()->nccl_use_compute_stream() - || ParseBooleanFromEnv("Enable_General_Basic_Communication", false); + || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer if (enable_general_basic_communication && (!NdSbpHasPartialParallel(sbp_consumer))) { if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 224590ad348..87915b77780 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -524,7 +524,7 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel #ifdef WITH_CUDA static const bool enable_general_basic_communication = Global::Get()->nccl_use_compute_stream() - || ParseBooleanFromEnv("Enable_General_Basic_Communication", false); + || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, @@ -685,7 +685,7 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, #ifdef WITH_CUDA static const bool enable_general_basic_communication = Global::Get()->nccl_use_compute_stream() - || ParseBooleanFromEnv("Enable_General_Basic_Communication", false); + || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, diff --git a/python/oneflow/test/graph/test_gbc2d.py b/python/oneflow/test/graph/test_gbc2d.py new file mode 100644 index 00000000000..e511a5c498e --- /dev/null +++ b/python/oneflow/test/graph/test_gbc2d.py @@ -0,0 +1,90 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import unittest +from collections import OrderedDict +import oneflow +import numpy as np +import oneflow as flow +import oneflow.unittest +from oneflow.test_utils.test_util import GenArgList + +from oneflow.test_utils.automated_test_util import * +import time +import os + +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" +os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "1" + + +def _test_general_basic_communication_same_placement(test_case, src_nd_sbp, dst_nd_sbp): + # can not process p in dst + if flow.sbp.partial_sum() in dst_nd_sbp: + return + + # input + placement = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) + local_np = np.arange(12 * 12).reshape(12, 12) + x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement) + + # check eager boxing + eager_out = x.to_global(sbp=dst_nd_sbp, placement=placement) + test_case.assertTrue(np.array_equal(eager_out.numpy(), x.numpy())) + + # check graph boxing + flow.boxing.nccl.enable_use_compute_stream(False) + + class TestGeneralBasicCommunicationGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + + def build(self, x): + y = x.to_global(sbp=dst_nd_sbp, placement=placement) + return y + + graph = TestGeneralBasicCommunicationGraph() + y = graph(x) + out_np = y.numpy() + in_np = x.numpy() + test_case.assertTrue(np.array_equal(out_np, in_np)) + + +def gen_nd_sbp(): + sbp_list = [ + flow.sbp.partial_sum(), + flow.sbp.broadcast(), + flow.sbp.split(0), + flow.sbp.split(1), + ] + nd_sbp_list = [] + for sbp0 in sbp_list: + for sbp1 in sbp_list: + nd_sbp_list.append([sbp0, sbp1]) + return nd_sbp_list + +@flow.unittest.skip_unless_1n4d() +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +class TestGeneralBasicCommunication(flow.unittest.TestCase): + def test_nccl_logical_send_recv(test_case): + arg_dict = OrderedDict() + arg_dict["src_nd_sbp"] = gen_nd_sbp() + arg_dict["dst_nd_sbp"] = gen_nd_sbp() + for arg in GenArgList(arg_dict): + _test_general_basic_communication_same_placement(test_case, *arg) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_gbc1to2d.py b/python/oneflow/test/modules/test_gbc1to2d.py new file mode 100644 index 00000000000..73856380f72 --- /dev/null +++ b/python/oneflow/test/modules/test_gbc1to2d.py @@ -0,0 +1,94 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import unittest +from collections import OrderedDict +import oneflow +import numpy as np +import oneflow as flow +import oneflow.unittest +from oneflow.test_utils.test_util import GenArgList + +from oneflow.test_utils.automated_test_util import * +import time +import os + +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" +os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "1" + + +def _test_general_basic_communication_1d_to_2d(test_case, src_nd_sbp, dst_nd_sbp): + # can not process p in dst + if flow.sbp.partial_sum() in dst_nd_sbp: + return + + # input + placement_x = flow.placement("cuda", ranks=[0, 1, 2]) + placement_y = flow.placement("cuda", ranks=[[3, 4], [1, 2]]) + local_np = np.arange(4 * 12).reshape(4, 12) + x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement_x) + + # check eager boxing + eager_out = x.to_global(sbp=dst_nd_sbp, placement=placement_y) + test_case.assertTrue(np.array_equal(eager_out.numpy(), x.numpy())) + + # check graph boxing + flow.boxing.nccl.enable_use_compute_stream(False) + + class TestGeneralBasicCommunicationGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + + def build(self, x): + y = x.to_global(sbp=dst_nd_sbp, placement=placement_y) + return y + + graph = TestGeneralBasicCommunicationGraph() + y = graph(x) + out_np = y.numpy() + in_np = x.numpy() + test_case.assertTrue(np.array_equal(out_np, in_np)) + + +def gen_nd_sbp_1d(): + sbp_list = [ + flow.sbp.partial_sum(), + flow.sbp.broadcast(), + flow.sbp.split(0), + flow.sbp.split(1), + ] + return sbp_list + +def gen_nd_sbp_2d(): + nd_sbp_list = [] + for sbp0 in gen_nd_sbp_1d(): + for sbp1 in gen_nd_sbp_1d(): + nd_sbp_list.append([sbp0, sbp1]) + return nd_sbp_list + +@flow.unittest.skip_unless_2n4d() +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +class TestGeneralBasicCommunication(flow.unittest.TestCase): + def test_nccl_logical_send_recv(test_case): + arg_dict = OrderedDict() + arg_dict["src_nd_sbp"] = gen_nd_sbp_1d() + arg_dict["dst_nd_sbp"] = gen_nd_sbp_2d() + for arg in GenArgList(arg_dict): + _test_general_basic_communication_1d_to_2d(test_case, *arg) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_gbc2to1d.py b/python/oneflow/test/modules/test_gbc2to1d.py new file mode 100644 index 00000000000..628d67f753f --- /dev/null +++ b/python/oneflow/test/modules/test_gbc2to1d.py @@ -0,0 +1,93 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import unittest +from collections import OrderedDict +import oneflow +import numpy as np +import oneflow as flow +import oneflow.unittest +from oneflow.test_utils.test_util import GenArgList + +from oneflow.test_utils.automated_test_util import * +import time +import os + +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" +os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "1" + + +def _test_general_basic_communication_2d_to_1d(test_case, src_nd_sbp, dst_nd_sbp): + # can not process p in dst + if flow.sbp.partial_sum() == dst_nd_sbp: + return + + # input + placement_x = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) + placement_y = flow.placement("cuda", ranks=[0, 3, 4]) + local_np = np.arange(12 * 12).reshape(12, 12) + x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement_x) + + # check eager boxing + eager_out = x.to_global(sbp=dst_nd_sbp, placement=placement_y) + test_case.assertTrue(np.array_equal(eager_out.numpy(), x.numpy())) + + # check graph boxing + flow.boxing.nccl.enable_use_compute_stream(False) + + class TestGeneralBasicCommunicationGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + + def build(self, x): + y = x.to_global(sbp=dst_nd_sbp, placement=placement_y) + return y + + graph = TestGeneralBasicCommunicationGraph() + y = graph(x) + out_np = y.numpy() + in_np = x.numpy() + test_case.assertTrue(np.array_equal(out_np, in_np)) + +def gen_nd_sbp_1d(): + sbp_list = [ + flow.sbp.partial_sum(), + flow.sbp.broadcast(), + flow.sbp.split(0), + flow.sbp.split(1), + ] + return sbp_list + +def gen_nd_sbp_2d(): + nd_sbp_list = [] + for sbp0 in gen_nd_sbp_1d(): + for sbp1 in gen_nd_sbp_1d(): + nd_sbp_list.append([sbp0, sbp1]) + return nd_sbp_list + +@flow.unittest.skip_unless_2n4d() +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +class TestGeneralBasicCommunication(flow.unittest.TestCase): + def test_nccl_logical_send_recv(test_case): + arg_dict = OrderedDict() + arg_dict["src_nd_sbp"] = gen_nd_sbp_2d() + arg_dict["dst_nd_sbp"] = gen_nd_sbp_1d() + for arg in GenArgList(arg_dict): + _test_general_basic_communication_2d_to_1d(test_case, *arg) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_gbc2to2d.py b/python/oneflow/test/modules/test_gbc2to2d.py new file mode 100644 index 00000000000..632d42884ac --- /dev/null +++ b/python/oneflow/test/modules/test_gbc2to2d.py @@ -0,0 +1,91 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+"""
+
+import unittest
+from collections import OrderedDict
+import oneflow
+import numpy as np
+import oneflow as flow
+import oneflow.unittest
+from oneflow.test_utils.test_util import GenArgList
+
+from oneflow.test_utils.automated_test_util import *
+import time
+import os
+
+os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0"
+os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "1"
+
+
+def _test_general_basic_communication_2d_to_2d(test_case, src_nd_sbp, dst_nd_sbp):
+    # can not process p in dst
+    if flow.sbp.partial_sum() in dst_nd_sbp:
+        return
+
+    # input
+    placement_x = flow.placement("cuda", ranks=[[0, 1], [2, 3]])
+    placement_y = flow.placement("cuda", ranks=[[0, 3, 4], [2, 5, 6]])
+    local_np = np.arange(12 * 12).reshape(12, 12)
+    x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement_x)
+
+    # check eager boxing
+    eager_out = x.to_global(sbp=dst_nd_sbp, placement=placement_y)
+    test_case.assertTrue(np.array_equal(eager_out.numpy(), x.numpy()))
+
+    # check graph boxing
+    flow.boxing.nccl.enable_use_compute_stream(False)
+
+    class TestGeneralBasicCommunicationGraph(flow.nn.Graph):
+        def __init__(self):
+            super().__init__()
+
+        def build(self, x):
+            y = x.to_global(sbp=dst_nd_sbp, placement=placement_y)
+            return y
+
+    graph = TestGeneralBasicCommunicationGraph()
+    y = graph(x)
+    out_np = y.numpy()
+    in_np = x.numpy()
+    test_case.assertTrue(np.array_equal(out_np, in_np))
+
+
+def gen_nd_sbp():
+    sbp_list = [
+        flow.sbp.partial_sum(),
+        flow.sbp.broadcast(),
+        flow.sbp.split(0),
+        flow.sbp.split(1),
+    ]
+    nd_sbp_list = []
+    for sbp0 in sbp_list:
+        for sbp1 in sbp_list:
+            nd_sbp_list.append([sbp0, sbp1])
+    return nd_sbp_list
+
+@flow.unittest.skip_unless_2n4d()
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+class TestGeneralBasicCommunication(flow.unittest.TestCase):
+    def test_nccl_logical_send_recv(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_nd_sbp"] = gen_nd_sbp()
+        arg_dict["dst_nd_sbp"] = gen_nd_sbp()
+        for arg in GenArgList(arg_dict):
+            _test_general_basic_communication_2d_to_2d(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()

From fe8fd38d6a9858197fa36644d1e3bab00c5d2ffe Mon Sep 17 00:00:00 2001
From: Yipeng Li
Date: Thu, 30 Jun 2022 12:33:45 +0000
Subject: [PATCH 26/45] Fix conflict with master and remove print-out information

---
 oneflow/core/auto_parallel/boxing_collector.cpp | 5 +----
 oneflow/core/framework/sbp_infer_util.cpp | 4 ++--
 oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp
index b1f1cb075ed..cce578aab10 100644
--- a/oneflow/core/auto_parallel/boxing_collector.cpp
+++ b/oneflow/core/auto_parallel/boxing_collector.cpp
@@ -560,7 +560,7 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
     return Maybe::Ok();
   }
   static const bool enable_general_basic_communication =
-      Global::Get()->nccl_use_compute_stream()
+      Singleton::Get()->nccl_use_compute_stream()
      || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false);
   // Use a general basic communication if no P in the consumer
   if (enable_general_basic_communication && (!NdSbpHasPartialParallel(sbp_consumer))) {
@@ -569,9 +569,6 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
     JUST(AskSbpCombination4GeneralBasicCommunication(
         sbp_producer, sbp_consumer, logical_blob_desc,
producer_parallel_desc, consumer_parallel_desc, middle_sbps, diag_node_pos)); - if (GlobalProcessCtx::Rank() == 0) { - std::cout << "Middle size for gbc: " << middle_sbps.size() << std::endl; - } } // Otherwise, one-step transfer return Maybe::Ok(); diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 1cf411a2440..bd713e070f1 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -523,7 +523,7 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel #ifdef WITH_CUDA static const bool enable_general_basic_communication = - Global::Get()->nccl_use_compute_stream() + Singleton::Get()->nccl_use_compute_stream() || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { @@ -684,7 +684,7 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, } #ifdef WITH_CUDA static const bool enable_general_basic_communication = - Global::Get()->nccl_use_compute_stream() + Singleton::Get()->nccl_use_compute_stream() || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { diff --git a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp index c573f9bf0ad..8b743fa0899 100644 --- a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp +++ b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp @@ -57,7 +57,7 @@ class NcclSendRecvBoxingKernel final : public Kernel { int64_t device_id = CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_id)); device_set.emplace(std::make_pair(machine_id, device_id)); } - EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Global::Get()); + EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Singleton::Get()); ncclComm_t comm; if (has_independent_stream_) { comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); @@ -124,7 +124,6 @@ void NcclSendRecvBoxingKernel::ForwardDataContent(KernelContext* ctx) const { } } } - const int64_t parallel_id = this->kernel_conf().parallel_ctx().parallel_id(); OF_NCCL_CHECK(ncclGroupStart()); for (int64_t i = 0; i < parallel_num; ++i) { if (this->has_input() && send_elem_cnts.at(i) != 0) { From 3605117af161e8abc716e6addeddf7a6d2997ceb Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 1 Jul 2022 12:15:46 +0800 Subject: [PATCH 27/45] Skip middle nodes if not enough gains --- .../core/auto_parallel/boxing_collector.cpp | 17 ++++++++++++++++ oneflow/core/framework/sbp_infer_util.cpp | 20 +++++++++---------- oneflow/core/framework/sbp_infer_util.h | 10 ++++++++++ 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index cce578aab10..57ab2de283b 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -1031,6 +1031,23 @@ Maybe BoxingCollector::AskSbpCombination4GeneralBasicCommunication( const NdSbp& sbp_producer, const NdSbp& sbp_consumer, const BlobDesc& logical_blob_desc, const ParallelDesc& producer_parallel_desc, const ParallelDesc& consumer_parallel_desc, std::vector& middle_sbps, int32_t* diag_node_pos) { + // (P, X) -> (B, X) 
|| (X , P) -> (X, B), X is any SBP + // One step transfer, at most 50% reduction in the transfer cost, do not use middle nodes + if (producer_parallel_desc == consumer_parallel_desc + && producer_parallel_desc.hierarchy()->NumAxes() == 2 + && (sbp_producer.sbp_parallel(0) == sbp_consumer.sbp_parallel(0) + || sbp_producer.sbp_parallel(1) == sbp_consumer.sbp_parallel(1))) { + return Maybe::Ok(); + } + + // Not enough gain in transfer cost, do not use middle nodes + int32_t partial_ratio4producer = PartialRatio4Producer(sbp_producer, producer_parallel_desc); + int32_t broadcast_ratio4consumer = BroadcastRatio4Consumer(sbp_consumer, consumer_parallel_desc); + if (2 * (partial_ratio4producer + broadcast_ratio4consumer) + < partial_ratio4producer * broadcast_ratio4consumer) { + return Maybe::Ok(); + } + bool close2producer = true; if (producer_parallel_desc.parallel_num() == consumer_parallel_desc.parallel_num()) { // Get close to the one with more splits diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index bd713e070f1..dc94177acc6 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -69,16 +69,6 @@ int32_t Ratio4Sbp(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc, return ratio; } -int32_t PartialRatio4Producer(const NdSbp& sbp_producer, - const ParallelDesc& producer_parallel_desc) { - return Ratio4Sbp(sbp_producer, producer_parallel_desc, &SbpParallel::has_partial_sum_parallel); -} - -int32_t BroadcastRatio4Consumer(const NdSbp& sbp_consumer, - const ParallelDesc& consumer_parallel_desc) { - return Ratio4Sbp(sbp_consumer, consumer_parallel_desc, &SbpParallel::has_broadcast_parallel); -} - Maybe ComputCopyCostBetweenTwoSbpParallel(const SbpParallel& producer_sbp_parallel, const SbpParallel& consumer_sbp_parallel, const BlobDesc& logical_blob_desc, @@ -433,6 +423,16 @@ void CollaborativeParallelDimReduce(const ParallelDesc& in_parallel_desc, } // namespace +int32_t PartialRatio4Producer(const NdSbp& sbp_producer, + const ParallelDesc& producer_parallel_desc) { + return Ratio4Sbp(sbp_producer, producer_parallel_desc, &SbpParallel::has_partial_sum_parallel); +} + +int32_t BroadcastRatio4Consumer(const NdSbp& sbp_consumer, + const ParallelDesc& consumer_parallel_desc) { + return Ratio4Sbp(sbp_consumer, consumer_parallel_desc, &SbpParallel::has_broadcast_parallel); +} + void NdSbpDimReduce(const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, ParallelDesc* reduced_parallel_desc, NdSbp* reduced_nd_sbp) { const auto& hierarchy = parallel_desc.hierarchy(); diff --git a/oneflow/core/framework/sbp_infer_util.h b/oneflow/core/framework/sbp_infer_util.h index 91aa0f7ff52..d74ff8b9bf9 100644 --- a/oneflow/core/framework/sbp_infer_util.h +++ b/oneflow/core/framework/sbp_infer_util.h @@ -33,6 +33,16 @@ enum Penalty4PartialInConsumerTag : int { kStrict = 3 // Not allow a transfer to P }; +// [2, 3, 4, 5, 9, 100, 8]: (P, S0, P, P, B, S1, P) +// partial ratio = 2 * 4 * 5 * 8 +int32_t PartialRatio4Producer(const NdSbp& sbp_producer, + const ParallelDesc& producer_parallel_desc); + +// [2, 3, 4, 5, 9, 100, 8]: (P, S0, B, P, B, S1, P) +// broadcast ratio = 4 * 9 +int32_t BroadcastRatio4Consumer(const NdSbp& sbp_consumer, + const ParallelDesc& consumer_parallel_desc); + void NdSbpDimReduce(const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, ParallelDesc* reduced_parallel_desc, NdSbp* reduced_nd_sbp); From e9e2d4249fd726ae66622d3c9c6b90e69ae343f5 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 1 
Jul 2022 19:23:08 +0800 Subject: [PATCH 28/45] Fix a typo --- oneflow/core/auto_parallel/boxing_collector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 57ab2de283b..656b4c699f0 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -1044,7 +1044,7 @@ Maybe BoxingCollector::AskSbpCombination4GeneralBasicCommunication( int32_t partial_ratio4producer = PartialRatio4Producer(sbp_producer, producer_parallel_desc); int32_t broadcast_ratio4consumer = BroadcastRatio4Consumer(sbp_consumer, consumer_parallel_desc); if (2 * (partial_ratio4producer + broadcast_ratio4consumer) - < partial_ratio4producer * broadcast_ratio4consumer) { + >= partial_ratio4producer * broadcast_ratio4consumer) { return Maybe::Ok(); } From 3cf45b2c4d38e3e2af5fc098e73db0e18e1f92b9 Mon Sep 17 00:00:00 2001 From: guo-ran <360112263@qq.com> Date: Sat, 2 Jul 2022 13:13:03 +0800 Subject: [PATCH 29/45] fix nccl send recv bug for different stream --- .../boxing/hierarchical_sub_task_graph_builder_impl.cpp | 8 ++++---- oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp | 5 ++++- oneflow/core/graph/nccl_send_recv_boxing_task_node.h | 4 +++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index 9a56a0c3f5d..71afa8e5576 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -127,18 +127,18 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB ParallelDesc merged_parallel_desc(merged_parallel_conf); TaskNode* first_in_node = sorted_in_tasks.front(); sorted_ctrl_tasks->resize(out_parallel_desc.parallel_num()); + std::string stream_name = "NCCL_SEND_RECV_BOXING" + NewUniqueId(); FOR_RANGE(int64_t, id, 0, merged_parallel_desc.parallel_num()) { NcclSendRecvBoxingTaskNode* node = ctx->task_graph()->NewNode(); const int64_t machine_id = JUST(merged_parallel_desc.MachineId4ParallelId(id)); int64_t device_index = JUST(merged_parallel_desc.DeviceId4ParallelId(id)); - int64_t thrd_id = EncodeStreamIdToInt64( - GenerateNamedTaskStreamId(machine_id, merged_parallel_desc.device_type(), device_index, - "NCCL_SEND_RECV_BOXING" + NewUniqueId())); + int64_t thrd_id = EncodeStreamIdToInt64(GenerateNamedTaskStreamId( + machine_id, merged_parallel_desc.device_type(), device_index, stream_name)); bool has_input = in_parallel_desc.Containing(machine_id, device_index); bool has_output = out_parallel_desc.Containing(machine_id, device_index); node->Init(machine_id, thrd_id, lbi, logical_blob_desc.shape(), logical_blob_desc.data_type(), in_nd_sbp, out_nd_sbp, in_parallel_desc, - out_parallel_desc, id, merged_parallel_desc, has_input, has_output); + out_parallel_desc, id, merged_parallel_desc, has_input, has_output, stream_name); if (has_input) { int64_t in_id = JUST(in_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); diff --git a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp index 95438c6d2b2..d00e3a2cacd 100644 --- a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp +++ b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp @@ -24,7 +24,8 @@ void NcclSendRecvBoxingTaskNode::Init(int64_t 
machine_id, int64_t thrd_id, const const ParallelDesc& src_parallel_desc, const ParallelDesc& dst_parallel_desc, const int64_t parallel_id, const ParallelDesc& parallel_desc, - const bool has_input, const bool has_output) { + const bool has_input, const bool has_output, + const std::string& stream_name) { set_machine_id(machine_id); set_thrd_id(thrd_id); set_lbi(lbi); @@ -39,6 +40,7 @@ void NcclSendRecvBoxingTaskNode::Init(int64_t machine_id, int64_t thrd_id, const has_input_ = has_input; has_output_ = has_output; data_type_ = data_type; + stream_name_ = stream_name; } void NcclSendRecvBoxingTaskNode::ProduceAllRegstsAndBindEdges() { @@ -59,6 +61,7 @@ void NcclSendRecvBoxingTaskNode::BuildExecGphAndRegst() { OperatorConf op_conf; op_conf.set_name("System-Nccl-Send-Recv-Boxing-" + NewUniqueId()); op_conf.set_device_tag(*CHECK_JUST(DeviceTag4DeviceType(this->device_type()))); + op_conf.set_stream_name_hint(stream_name_); auto* nccl_send_recv_boxing_conf = op_conf.mutable_nccl_send_recv_boxing_conf(); *nccl_send_recv_boxing_conf->mutable_lbi() = lbi(); logical_shape_.ToProto(nccl_send_recv_boxing_conf->mutable_logical_shape()); diff --git a/oneflow/core/graph/nccl_send_recv_boxing_task_node.h b/oneflow/core/graph/nccl_send_recv_boxing_task_node.h index fee688222ca..1fcc4482f0e 100644 --- a/oneflow/core/graph/nccl_send_recv_boxing_task_node.h +++ b/oneflow/core/graph/nccl_send_recv_boxing_task_node.h @@ -30,7 +30,8 @@ class NcclSendRecvBoxingTaskNode : public TransportTaskNode { const Shape& logical_shape, const DataType& data_type, const NdSbp& src_nd_sbp, const NdSbp& dst_nd_sbp, const ParallelDesc& src_parallel_desc, const ParallelDesc& dst_parallel_desc, const int64_t parallel_id, - const ParallelDesc& parallel_desc, const bool has_input, const bool has_output); + const ParallelDesc& parallel_desc, const bool has_input, const bool has_output, + const std::string& stream_name); TaskType GetTaskType() const override { return TaskType::kNcclSendRecvBoxing; } const ParallelContext* parallel_ctx() const override { return ¶llel_ctx_; } @@ -50,6 +51,7 @@ class NcclSendRecvBoxingTaskNode : public TransportTaskNode { ParallelContext parallel_ctx_; bool has_input_; bool has_output_; + std::string stream_name_; }; } // namespace oneflow From 27aef146ddc07f153c0bd23b04c03119d659e32d Mon Sep 17 00:00:00 2001 From: guoran <360112263@qq.com> Date: Mon, 4 Jul 2022 07:05:05 +0000 Subject: [PATCH 30/45] hot fix for ncclComm init --- oneflow/core/job/eager_nccl_comm_manager.cpp | 18 +++++++++++------- .../kernel/nccl_send_recv_boxing_kernel.cpp | 15 ++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index 2fa0ab540f3..7fdf06f0226 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -169,14 +169,18 @@ void EagerNcclCommMgr::CreateCommFromPlan(const Plan& plan) { continue; } const auto& op_conf = op_attr->op_conf(); - if (!op_conf.has_user_conf()) { continue; } - if (!NeedUnifiedNcclCommInit(op_conf.user_conf().op_type_name())) { continue; } - - if (!op_attr->has_parallel_conf_signature()) { continue; } - if (!op_attr->parallel_conf_signature().has_op_parallel_conf()) { continue; } - + ParallelConf parallel_conf; + if(op_conf.has_nccl_send_recv_boxing_conf()) { + parallel_conf = op_conf.nccl_send_recv_boxing_conf().parallel_conf(); + } else { + if (!op_conf.has_user_conf()) { continue; } + if 
(!NeedUnifiedNcclCommInit(op_conf.user_conf().op_type_name())) { continue; } + if (!op_attr->has_parallel_conf_signature()) { continue; } + if (!op_attr->parallel_conf_signature().has_op_parallel_conf()) { continue; } + parallel_conf = op_attr->parallel_conf_signature().op_parallel_conf(); + } std::vector> device_vec; - ParallelDesc parallel_desc(op_attr->parallel_conf_signature().op_parallel_conf()); + ParallelDesc parallel_desc(parallel_conf); for (int64_t parallel_id = 0; parallel_id < parallel_desc.parallel_num(); ++parallel_id) { int64_t machine_id = CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_id)); int64_t device_id = CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_id)); diff --git a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp index 8b743fa0899..3b158a00560 100644 --- a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp +++ b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp @@ -58,12 +58,7 @@ class NcclSendRecvBoxingKernel final : public Kernel { device_set.emplace(std::make_pair(machine_id, device_id)); } EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Singleton::Get()); - ncclComm_t comm; - if (has_independent_stream_) { - comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - } else { - comm = comm_mgr->GetCommForDevice(device_set); - } + ncclComm_t comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); comm_.reset(new Comm(comm)); } @@ -75,7 +70,6 @@ class NcclSendRecvBoxingKernel final : public Kernel { void VirtualKernelInit(KernelContext* ctx) override; void ForwardDataContent(KernelContext* ctx) const override; - bool has_independent_stream_; std::string stream_name_; ParallelConf parallel_conf_; mutable std::unique_ptr comm_; @@ -187,8 +181,11 @@ void NcclSendRecvBoxingKernel::ForwardDataContent(KernelContext* ctx) const { void NcclSendRecvBoxingKernel::VirtualKernelInit(KernelContext* ctx) { const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); - has_independent_stream_ = this->op_conf().has_stream_name_hint(); - if (has_independent_stream_) { stream_name_ = this->op_conf().stream_name_hint(); } + if (this->op_conf().has_stream_name_hint()) { + stream_name_ = this->op_conf().stream_name_hint(); + } else { + stream_name_ = EagerNcclCommMgr::kDefaultStreamName; + } parallel_conf_ = conf.parallel_conf(); const int64_t parallel_id = this->kernel_conf().parallel_ctx().parallel_id(); ParallelDesc parallel_desc(parallel_conf_); From d837d7386344bba6c48c814fa5f2295c54949514 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 8 Jul 2022 18:51:26 +0800 Subject: [PATCH 31/45] Reuse streams for different jobs --- .../hierarchical_sub_task_graph_builder_impl.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index 71afa8e5576..fc7e5ff39a5 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -30,6 +30,7 @@ limitations under the License. 
#include "oneflow/core/graph/nccl_send_recv_boxing_task_node.h" #include "oneflow/core/job/nd_sbp_util.h" #include "oneflow/core/graph/task_stream_id.h" +#include "oneflow/core/job/job_desc.h" namespace oneflow { @@ -70,6 +71,16 @@ void MergeParallelConf(const ParallelDesc& parallel_desc_0, const ParallelDesc& } } +inline std::string NewUniqueIdGbc() { + static std::atomic counter(0); + static std::atomic curr_job_id(0); + if (curr_job_id != GlobalJobDesc().job_id()) { + curr_job_id = GlobalJobDesc().job_id(); + counter = 0; + } + return std::to_string(counter.fetch_add(1, std::memory_order_relaxed)); +} + } // namespace class FlatSubTskGphBuilder final : public HierarchicalSubTskGphBuilder { @@ -127,7 +138,7 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB ParallelDesc merged_parallel_desc(merged_parallel_conf); TaskNode* first_in_node = sorted_in_tasks.front(); sorted_ctrl_tasks->resize(out_parallel_desc.parallel_num()); - std::string stream_name = "NCCL_SEND_RECV_BOXING" + NewUniqueId(); + std::string stream_name = "NCCL_SEND_RECV_BOXING" + NewUniqueIdGbc(); FOR_RANGE(int64_t, id, 0, merged_parallel_desc.parallel_num()) { NcclSendRecvBoxingTaskNode* node = ctx->task_graph()->NewNode(); const int64_t machine_id = JUST(merged_parallel_desc.MachineId4ParallelId(id)); From c3c0074c29d501ff63f03fe2c64dcb5fda832316 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 8 Jul 2022 19:32:08 +0800 Subject: [PATCH 32/45] Rename and of format --- oneflow/core/job/eager_nccl_comm_manager.cpp | 2 +- .../kernel/nccl_send_recv_boxing_kernel.cpp | 4 +- python/oneflow/test/graph/test_gbc2d.py | 19 +++- python/oneflow/test/modules/test_gbc1to2d.py | 4 +- python/oneflow/test/modules/test_gbc2to1d.py | 5 +- python/oneflow/test/modules/test_gbc2to2d.py | 3 +- .../modules/test_nccl_send_recv_boxing.py | 103 ------------------ 7 files changed, 30 insertions(+), 110 deletions(-) delete mode 100644 python/oneflow/test/modules/test_nccl_send_recv_boxing.py diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index 7fdf06f0226..846986b6012 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -170,7 +170,7 @@ void EagerNcclCommMgr::CreateCommFromPlan(const Plan& plan) { } const auto& op_conf = op_attr->op_conf(); ParallelConf parallel_conf; - if(op_conf.has_nccl_send_recv_boxing_conf()) { + if (op_conf.has_nccl_send_recv_boxing_conf()) { parallel_conf = op_conf.nccl_send_recv_boxing_conf().parallel_conf(); } else { if (!op_conf.has_user_conf()) { continue; } diff --git a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp index 3b158a00560..c7b954bd9f9 100644 --- a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp +++ b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp @@ -181,8 +181,8 @@ void NcclSendRecvBoxingKernel::ForwardDataContent(KernelContext* ctx) const { void NcclSendRecvBoxingKernel::VirtualKernelInit(KernelContext* ctx) { const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); - if (this->op_conf().has_stream_name_hint()) { - stream_name_ = this->op_conf().stream_name_hint(); + if (this->op_conf().has_stream_name_hint()) { + stream_name_ = this->op_conf().stream_name_hint(); } else { stream_name_ = EagerNcclCommMgr::kDefaultStreamName; } diff --git a/python/oneflow/test/graph/test_gbc2d.py b/python/oneflow/test/graph/test_gbc2d.py index e511a5c498e..efb564697fb 100644 --- 
a/python/oneflow/test/graph/test_gbc2d.py +++ b/python/oneflow/test/graph/test_gbc2d.py @@ -35,6 +35,22 @@ def _test_general_basic_communication_same_placement(test_case, src_nd_sbp, dst_ if flow.sbp.partial_sum() in dst_nd_sbp: return + # skip src == dst + if src_nd_sbp == dst_nd_sbp: + return + + # in this case, use intra group boxing + if src_nd_sbp[0] == dst_nd_sbp[0]: + return + + # in this case, use inter group boxing + if ( + src_nd_sbp[1] == dst_nd_sbp[1] + and src_nd_sbp[0] != src_nd_sbp[1] + and dst_nd_sbp[0] != dst_nd_sbp[1] + ): + return + # input placement = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) local_np = np.arange(12 * 12).reshape(12, 12) @@ -75,10 +91,11 @@ def gen_nd_sbp(): nd_sbp_list.append([sbp0, sbp1]) return nd_sbp_list + @flow.unittest.skip_unless_1n4d() @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") class TestGeneralBasicCommunication(flow.unittest.TestCase): - def test_nccl_logical_send_recv(test_case): + def test_general_basic_communication(test_case): arg_dict = OrderedDict() arg_dict["src_nd_sbp"] = gen_nd_sbp() arg_dict["dst_nd_sbp"] = gen_nd_sbp() diff --git a/python/oneflow/test/modules/test_gbc1to2d.py b/python/oneflow/test/modules/test_gbc1to2d.py index 73856380f72..4025b81e69b 100644 --- a/python/oneflow/test/modules/test_gbc1to2d.py +++ b/python/oneflow/test/modules/test_gbc1to2d.py @@ -72,6 +72,7 @@ def gen_nd_sbp_1d(): ] return sbp_list + def gen_nd_sbp_2d(): nd_sbp_list = [] for sbp0 in gen_nd_sbp_1d(): @@ -79,10 +80,11 @@ def gen_nd_sbp_2d(): nd_sbp_list.append([sbp0, sbp1]) return nd_sbp_list + @flow.unittest.skip_unless_2n4d() @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") class TestGeneralBasicCommunication(flow.unittest.TestCase): - def test_nccl_logical_send_recv(test_case): + def test_general_basic_communication(test_case): arg_dict = OrderedDict() arg_dict["src_nd_sbp"] = gen_nd_sbp_1d() arg_dict["dst_nd_sbp"] = gen_nd_sbp_2d() diff --git a/python/oneflow/test/modules/test_gbc2to1d.py b/python/oneflow/test/modules/test_gbc2to1d.py index 628d67f753f..62903100dc0 100644 --- a/python/oneflow/test/modules/test_gbc2to1d.py +++ b/python/oneflow/test/modules/test_gbc2to1d.py @@ -62,6 +62,7 @@ def build(self, x): in_np = x.numpy() test_case.assertTrue(np.array_equal(out_np, in_np)) + def gen_nd_sbp_1d(): sbp_list = [ flow.sbp.partial_sum(), @@ -71,6 +72,7 @@ def gen_nd_sbp_1d(): ] return sbp_list + def gen_nd_sbp_2d(): nd_sbp_list = [] for sbp0 in gen_nd_sbp_1d(): @@ -78,10 +80,11 @@ def gen_nd_sbp_2d(): nd_sbp_list.append([sbp0, sbp1]) return nd_sbp_list + @flow.unittest.skip_unless_2n4d() @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") class TestGeneralBasicCommunication(flow.unittest.TestCase): - def test_nccl_logical_send_recv(test_case): + def test_general_basic_communication(test_case): arg_dict = OrderedDict() arg_dict["src_nd_sbp"] = gen_nd_sbp_2d() arg_dict["dst_nd_sbp"] = gen_nd_sbp_1d() diff --git a/python/oneflow/test/modules/test_gbc2to2d.py b/python/oneflow/test/modules/test_gbc2to2d.py index 632d42884ac..4a978c6e1bd 100644 --- a/python/oneflow/test/modules/test_gbc2to2d.py +++ b/python/oneflow/test/modules/test_gbc2to2d.py @@ -76,10 +76,11 @@ def gen_nd_sbp(): nd_sbp_list.append([sbp0, sbp1]) return nd_sbp_list + @flow.unittest.skip_unless_2n4d() @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") class TestGeneralBasicCommunication(flow.unittest.TestCase): - def test_nccl_logical_send_recv(test_case): + def 
test_general_basic_communication(test_case): arg_dict = OrderedDict() arg_dict["src_nd_sbp"] = gen_nd_sbp() arg_dict["dst_nd_sbp"] = gen_nd_sbp() diff --git a/python/oneflow/test/modules/test_nccl_send_recv_boxing.py b/python/oneflow/test/modules/test_nccl_send_recv_boxing.py deleted file mode 100644 index 20c8d09f4ed..00000000000 --- a/python/oneflow/test/modules/test_nccl_send_recv_boxing.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict -import oneflow -import numpy as np -import oneflow as flow -import oneflow.unittest -from oneflow.test_utils.test_util import GenArgList - -import time -import os - -os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "1" - - -def _test_nccl_send_recv_boxing( - test_case, src_nd_sbp, dst_nd_sbp, src_ranks, dst_ranks -): - # can not process p in dst - if flow.sbp.partial_sum() in dst_nd_sbp: - return - # skip src == dst - if src_nd_sbp == dst_nd_sbp: - return - # in this case, use intra group boxing - if src_nd_sbp[0] == dst_nd_sbp[0]: - return - # in this case, use inter group boxing - if ( - src_nd_sbp[1] == dst_nd_sbp[1] - and src_nd_sbp[0] != src_nd_sbp[1] - and src_nd_sbp[0] != src_nd_sbp[1] - ): - return - # in this case, use 1d boxing - if src_nd_sbp[0] == src_nd_sbp[1] and dst_nd_sbp[0] == dst_nd_sbp[1]: - return - src_placement = flow.placement("cuda", ranks=src_ranks) - dst_placement = flow.placement("cuda", ranks=dst_ranks) - - class TestGraph(flow.nn.Graph): - def __init__(self): - super().__init__() - - def build(self, x): - y = x.to_global(sbp=dst_nd_sbp, placement=dst_placement) - return y - - x = flow.tensor( - np.arange(12 * 16 * 16).reshape(12, 16, 16), - sbp=src_nd_sbp, - placement=src_placement, - ) - graph = TestGraph() - y = graph(x) - test_case.assertTrue(np.array_equal(y.numpy(), x.numpy())) - - -def gen_nd_sbp(): - sbp_list = [ - flow.sbp.partial_sum(), - flow.sbp.broadcast(), - flow.sbp.split(0), - flow.sbp.split(1), - flow.sbp.split(2), - ] - nd_sbp_list = [] - for sbp0 in sbp_list: - for sbp1 in sbp_list: - nd_sbp_list.append([sbp0, sbp1]) - return nd_sbp_list - - -@flow.unittest.skip_unless_1n4d() -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -class TestNcclSendRecvBoxing(flow.unittest.TestCase): - def test_nccl_send_recv_boxing(test_case): - arg_dict = OrderedDict() - arg_dict["src_nd_sbp"] = gen_nd_sbp() - arg_dict["dst_nd_sbp"] = gen_nd_sbp() - arg_dict["src_ranks"] = [[[0, 1], [2, 3]], [[0, 1]]] - arg_dict["dst_ranks"] = [[[0, 1], [2, 3]], [[2, 3]]] - for arg in GenArgList(arg_dict): - _test_nccl_send_recv_boxing(test_case, *arg) - - -if __name__ == "__main__": - unittest.main() From 0076c1db4a007fbc44416507e2119972661d7606 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Mon, 11 Jul 2022 10:45:28 +0800 Subject: [PATCH 33/45] Skip general basic communication for transfer between cpu and gpu --- oneflow/core/auto_parallel/boxing_collector.cpp | 5 ++++- 
oneflow/core/framework/sbp_infer_util.cpp | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 656b4c699f0..532d66f9ce4 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -18,6 +18,7 @@ limitations under the License. #include #include "oneflow/core/auto_parallel/boxing_collector.h" #include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/device_type.pb.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/job/global_for.h" @@ -563,7 +564,9 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const Singleton::Get()->nccl_use_compute_stream() || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if (enable_general_basic_communication && (!NdSbpHasPartialParallel(sbp_consumer))) { + if (enable_general_basic_communication && (!NdSbpHasPartialParallel(sbp_consumer)) + && producer_parallel_desc.device_type() == DeviceType::kCUDA + && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { // (?, P, ?)->(Si, Sj)->(?, B, ?), two-step transfer JUST(AskSbpCombination4GeneralBasicCommunication( diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index dc94177acc6..80a08b1fb49 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -17,6 +17,7 @@ limitations under the License. #include "oneflow/core/framework/sbp_infer_util.h" #include "oneflow/core/auto_parallel/boxing_collector.h" #include "oneflow/core/boxing/eager_boxing_interpreter_mgr.h" +#include "oneflow/core/common/device_type.pb.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/common/util.h" #include "oneflow/core/job/global_for.h" @@ -526,7 +527,9 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel Singleton::Get()->nccl_use_compute_stream() || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { + if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel)) + && producer_parallel_desc.device_type() == DeviceType::kCUDA + && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, logical_blob_desc, producer_parallel_desc, consumer_parallel_desc) @@ -687,7 +690,9 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, Singleton::Get()->nccl_use_compute_stream() || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel))) { + if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel)) + && producer_parallel_desc.device_type() == DeviceType::kCUDA + && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, 
logical_blob_desc, producer_parallel_desc, consumer_parallel_desc) From f09dc85f4fa8204f92983b0df278bc7f6804fc9c Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 14 Jul 2022 17:57:06 +0800 Subject: [PATCH 34/45] Address suggestion --- .../core/auto_parallel/boxing_collector.cpp | 97 +++++++++++-------- oneflow/core/framework/sbp_infer_util.cpp | 14 +-- .../test/{modules => graph}/test_gbc1to2d.py | 0 .../test/{modules => graph}/test_gbc2to1d.py | 0 .../test/{modules => graph}/test_gbc2to2d.py | 0 5 files changed, 62 insertions(+), 49 deletions(-) rename python/oneflow/test/{modules => graph}/test_gbc1to2d.py (100%) rename python/oneflow/test/{modules => graph}/test_gbc2to1d.py (100%) rename python/oneflow/test/{modules => graph}/test_gbc2to2d.py (100%) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 532d66f9ce4..567de6efc46 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -82,6 +82,50 @@ int32_t TotalNumSplit(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc) { return total_num_split; } +// Dealing with 1D sbp to 1D sbp +// Specifically, S -> P. +Maybe AskSbpCombinationFor1DSbp(const NdSbp& sbp_producer, const NdSbp& sbp_consumer, + const ParallelDesc& producer_parallel_desc, + const ParallelDesc& consumer_parallel_desc, + std::vector& middle_sbps, int32_t* diag_node_pos) { + if (sbp_consumer.sbp_parallel(0).has_partial_sum_parallel()) { + // Support [4]: P <--> [2, 2]: (P, P) + // Support {0, 1, 2, 3}: P <--> {2, 0, 6, 7}: (P, P) + if (producer_parallel_desc.parallel_num() == consumer_parallel_desc.parallel_num() + && sbp_producer.sbp_parallel(0).has_partial_sum_parallel()) { + return Maybe::Ok(); + } + + if (!sbp_producer.sbp_parallel(0).has_broadcast_parallel()) { + // S -> B -> P (Large cost!) + // TODO: Please implement S -> P directly. + // We do not support [3]: P <--> [2, 2]: (P, P) as well. + + int32_t hierarchy_size = 0; + if (producer_parallel_desc.hierarchy()->elem_cnt() + < consumer_parallel_desc.hierarchy()->elem_cnt()) { + // The diagonal node uses the parallel description from producer + // (S, S) -> (B, B) -> P/(P, P) or S -> B -> P/(P, P) + *diag_node_pos = 1; + hierarchy_size = producer_parallel_desc.hierarchy()->NumAxes(); + } else { + // The diagonal node uses the parallel description from consumer + // S/(S, S) -> B -> P or S/(S, S) -> (B, B) -> (P, P) + *diag_node_pos = 0; + hierarchy_size = consumer_parallel_desc.hierarchy()->NumAxes(); + } + + NdSbp broadcast_nd; + for (int32_t i = 0; i < hierarchy_size; i++) { + broadcast_nd.add_sbp_parallel(); + broadcast_nd.mutable_sbp_parallel(i)->mutable_broadcast_parallel(); + } + middle_sbps.emplace_back(broadcast_nd); + } + } + return Maybe::Ok(); +} + } // namespace // A constructor with init, designed for uncustomized boxing collector @@ -513,58 +557,25 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const if (ParseBooleanFromEnv("ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK", false)) { return Maybe::Ok(); } + if (producer_parallel_desc == consumer_parallel_desc && sbp_producer == sbp_consumer) { + return Maybe::Ok(); + } // Dealing with 1D sbp to 1D sbp - // Specifically, S -> P. 
if (Is1dSbp(sbp_producer) && Is1dSbp(sbp_consumer)) { - if (sbp_consumer.sbp_parallel(0).has_partial_sum_parallel()) { - // Support [4]: P <--> [2, 2]: (P, P) - // Support {0, 1, 2, 3}: P <--> {2, 0, 6, 7}: (P, P) - if (producer_parallel_desc.parallel_num() == consumer_parallel_desc.parallel_num() - && sbp_producer.sbp_parallel(0).has_partial_sum_parallel()) { - return Maybe::Ok(); - } - - if (!sbp_producer.sbp_parallel(0).has_broadcast_parallel()) { - // S -> B -> P (Large cost!) - // TODO: Please implement S -> P directly. - // We do not support [3]: P <--> [2, 2]: (P, P) as well. - - int32_t hierarchy_size = 0; - if (producer_parallel_desc.hierarchy()->elem_cnt() - < consumer_parallel_desc.hierarchy()->elem_cnt()) { - // The diagonal node uses the parallel description from producer - // (S, S) -> (B, B) -> P/(P, P) or S -> B -> P/(P, P) - *diag_node_pos = 1; - hierarchy_size = producer_parallel_desc.hierarchy()->NumAxes(); - } else { - // The diagonal node uses the parallel description from consumer - // S/(S, S) -> B -> P or S/(S, S) -> (B, B) -> (P, P) - *diag_node_pos = 0; - hierarchy_size = consumer_parallel_desc.hierarchy()->NumAxes(); - } - - NdSbp broadcast_nd; - for (int32_t i = 0; i < hierarchy_size; i++) { - broadcast_nd.add_sbp_parallel(); - broadcast_nd.mutable_sbp_parallel(i)->mutable_broadcast_parallel(); - } - middle_sbps.emplace_back(broadcast_nd); - } - } - // No middle nodes for another 1d-sbp combinations + JUST(AskSbpCombinationFor1DSbp(sbp_producer, sbp_consumer, producer_parallel_desc, + consumer_parallel_desc, middle_sbps, diag_node_pos)); + // No middle nodes for the other 1d-sbp combinations return Maybe::Ok(); } #ifdef WITH_CUDA - if (producer_parallel_desc == consumer_parallel_desc && sbp_producer == sbp_consumer) { - return Maybe::Ok(); - } static const bool enable_general_basic_communication = - Singleton::Get()->nccl_use_compute_stream() - || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); + ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if (enable_general_basic_communication && (!NdSbpHasPartialParallel(sbp_consumer)) + if ((Singleton::Get()->nccl_use_compute_stream() + || enable_general_basic_communication) + && (!NdSbpHasPartialParallel(sbp_consumer)) && producer_parallel_desc.device_type() == DeviceType::kCUDA && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 80a08b1fb49..3712bcce7a9 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -524,10 +524,11 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel #ifdef WITH_CUDA static const bool enable_general_basic_communication = - Singleton::Get()->nccl_use_compute_stream() - || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); + ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel)) + if (((Singleton::Get()->nccl_use_compute_stream() + || enable_general_basic_communication) + && !NdSbpHasPartialParallel(consumer_sbp_parallel)) && producer_parallel_desc.device_type() == DeviceType::kCUDA && 
consumer_parallel_desc.device_type() == DeviceType::kCUDA) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, @@ -687,10 +688,11 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, } #ifdef WITH_CUDA static const bool enable_general_basic_communication = - Singleton::Get()->nccl_use_compute_stream() - || ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); + ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if ((enable_general_basic_communication && !NdSbpHasPartialParallel(consumer_sbp_parallel)) + if (((Singleton::Get()->nccl_use_compute_stream() + || enable_general_basic_communication) + && !NdSbpHasPartialParallel(consumer_sbp_parallel)) && producer_parallel_desc.device_type() == DeviceType::kCUDA && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, diff --git a/python/oneflow/test/modules/test_gbc1to2d.py b/python/oneflow/test/graph/test_gbc1to2d.py similarity index 100% rename from python/oneflow/test/modules/test_gbc1to2d.py rename to python/oneflow/test/graph/test_gbc1to2d.py diff --git a/python/oneflow/test/modules/test_gbc2to1d.py b/python/oneflow/test/graph/test_gbc2to1d.py similarity index 100% rename from python/oneflow/test/modules/test_gbc2to1d.py rename to python/oneflow/test/graph/test_gbc2to1d.py diff --git a/python/oneflow/test/modules/test_gbc2to2d.py b/python/oneflow/test/graph/test_gbc2to2d.py similarity index 100% rename from python/oneflow/test/modules/test_gbc2to2d.py rename to python/oneflow/test/graph/test_gbc2to2d.py From f4ea3c22c4a41202a77c9176627631a807e84f5b Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Mon, 18 Jul 2022 17:33:18 +0800 Subject: [PATCH 35/45] Use the more powerful GetRankSendRecvIntersection --- .../core/auto_parallel/boxing_collector.cpp | 2 + oneflow/core/job/nd_sbp_util.cpp | 83 ------------------- oneflow/core/job/nd_sbp_util.h | 6 -- .../kernels/nccl_logical_send_recv_kernel.cpp | 10 ++- 4 files changed, 10 insertions(+), 91 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 567de6efc46..55e555801f1 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -580,6 +580,8 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { if (NdSbpHasPartialParallel(sbp_producer) && NdSbpHasBroadcastParallel(sbp_consumer)) { // (?, P, ?)->(Si, Sj)->(?, B, ?), two-step transfer + // Directly applying general basic communication would have O(n^2) time complexity for P->B + // Using two-step transfer would reduce it to a linear cost JUST(AskSbpCombination4GeneralBasicCommunication( sbp_producer, sbp_consumer, logical_blob_desc, producer_parallel_desc, consumer_parallel_desc, middle_sbps, diag_node_pos)); diff --git a/oneflow/core/job/nd_sbp_util.cpp b/oneflow/core/job/nd_sbp_util.cpp index 9726e5e902b..934b30b2480 100644 --- a/oneflow/core/job/nd_sbp_util.cpp +++ b/oneflow/core/job/nd_sbp_util.cpp @@ -19,48 +19,6 @@ limitations under the License. #include "oneflow/core/common/nd_index_offset_helper.h" namespace oneflow { -namespace { -// Go through all the ranks while transfer between two nd sbps with no PartialSum under the same -// placement. 
-// NOTE: We need to make sure no partial sums in the sbps of the producer and consumer. -void DfsTraverseRanks4NdSbp( - int32_t depth, std::vector& in_parallel_ids, - const std::vector& out_parallel_ids, const Shape& parallel_hierarchy, - const NdIndexOffsetHelper& hierarchy_index_helper, - const NdSbp& in_nd_sbp, const std::function& visit) { - if (depth >= parallel_hierarchy.NumAxes()) { - visit(hierarchy_index_helper.NdIndexToOffset(in_parallel_ids.data(), - parallel_hierarchy.NumAxes())); - return; - } - if (in_nd_sbp.sbp_parallel(depth).has_broadcast_parallel()) { - // If Broadcast in the sbp of the producer, only visit those ranks with the same id as the - // current rank along the depth-dimension. - in_parallel_ids[depth] = out_parallel_ids[depth]; - DfsTraverseRanks4NdSbp(depth + 1, in_parallel_ids, out_parallel_ids, parallel_hierarchy, - hierarchy_index_helper, in_nd_sbp, visit); - } else { - // If Split or PartialSum, go through all the ranks along the depth-dimension. - for (int64_t i = 0; i < parallel_hierarchy.dim_vec().at(depth); i++) { - in_parallel_ids[depth] = i; - DfsTraverseRanks4NdSbp(depth + 1, in_parallel_ids, out_parallel_ids, parallel_hierarchy, - hierarchy_index_helper, in_nd_sbp, visit); - } - } -} - -void DfsTraverse4NdSbp(int64_t recv_id, const std::shared_ptr& parallel_hierarchy, - const NdSbp& in_nd_sbp, const std::function& visit) { - int32_t hierarchy_dimension = parallel_hierarchy->NumAxes(); - const NdIndexOffsetHelper hierarchy_index_helper( - parallel_hierarchy->dim_vec().data(), hierarchy_dimension); - std::vector in_parallel_ids(hierarchy_dimension); - std::vector out_parallel_ids(hierarchy_dimension); - hierarchy_index_helper.OffsetToNdIndex(recv_id, out_parallel_ids.data(), hierarchy_dimension); - DfsTraverseRanks4NdSbp(0, in_parallel_ids, out_parallel_ids, *parallel_hierarchy, - hierarchy_index_helper, in_nd_sbp, visit); -} -} // namespace std::vector GetTensorSliceView(const int64_t parallel_num, const SbpParallel& sbp_parallel, @@ -203,45 +161,4 @@ bool NdSbpIsAllSplit(const NdSbp& nd_sbp, int64_t axis) { return true; } -void GetRankSendRecvIntersection(int64_t parallel_id, - const std::shared_ptr& parallel_hierarchy, - const NdSbp& src_nd_sbp, const NdSbp& dst_nd_sbp, - const Shape& logical_shape, - std::vector* send_intersections, - std::vector* recv_intersections) { - CHECK(parallel_hierarchy != nullptr); - const int64_t parallel_num = parallel_hierarchy->elem_cnt(); - CHECK_LT(parallel_id, parallel_num); - - const std::vector& in_slices = - GetTensorSliceView(*parallel_hierarchy, src_nd_sbp, logical_shape); - const std::vector& out_slices = - GetTensorSliceView(*parallel_hierarchy, dst_nd_sbp, logical_shape); - - // cur rank recv from - recv_intersections->resize(parallel_num); - const TensorSliceView& cur_rank_out_slice = out_slices.at(parallel_id); - const auto& add_to_recv_intersections = [&](int32_t send_id) { - const TensorSliceView& in_slice = in_slices.at(send_id); - const TensorSliceView& intersection = cur_rank_out_slice.Intersect(in_slice); - if (intersection.IsEmpty()) { return; } - recv_intersections->at(send_id) = intersection; - }; - DfsTraverse4NdSbp(parallel_id, parallel_hierarchy, src_nd_sbp, add_to_recv_intersections); - - // cur rank send to - send_intersections->resize(parallel_num); - const TensorSliceView& cur_rank_in_slice = in_slices.at(parallel_id); - for (int64_t recv_i = 0; recv_i < parallel_num; ++recv_i) { - const auto& add_to_send_intersections = [&](int32_t send_id) { - if (send_id != parallel_id) { return; } - 
const TensorSliceView& out_slice = out_slices.at(recv_i); - const TensorSliceView& intersection = out_slice.Intersect(cur_rank_in_slice); - if (intersection.IsEmpty()) { return; } - send_intersections->at(recv_i) = intersection; - }; - DfsTraverse4NdSbp(recv_i, parallel_hierarchy, src_nd_sbp, add_to_send_intersections); - } -} - } // namespace oneflow diff --git a/oneflow/core/job/nd_sbp_util.h b/oneflow/core/job/nd_sbp_util.h index 7eac44a52fc..be8b72c7746 100644 --- a/oneflow/core/job/nd_sbp_util.h +++ b/oneflow/core/job/nd_sbp_util.h @@ -39,12 +39,6 @@ bool NdSbpIsAllSplit(const NdSbp& nd_sbp, int64_t axis); bool NdSbpHasPartialParallel(const NdSbp& nd_sbp); bool NdSbpHasBroadcastParallel(const NdSbp& nd_sbp); -void GetRankSendRecvIntersection(int64_t parallel_id, - const std::shared_ptr& parallel_hierarchy, - const NdSbp& src_nd_sbp, const NdSbp& dst_nd_sbp, - const Shape& logical_shape, - std::vector* send_intersections, - std::vector* recv_intersections); } // namespace oneflow #endif // ONEFLOW_CORE_JOB_SBP_PARALLEL_H_ diff --git a/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp b/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp index 6148e952101..51b946a8f9e 100644 --- a/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp +++ b/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp @@ -26,6 +26,7 @@ limitations under the License. #include "oneflow/core/register/tensor_slice_copier.h" #include "oneflow/core/ep/include/primitive/memset.h" #include "oneflow/core/ep/include/primitive/add.h" +#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h" #if defined(WITH_CUDA) && NCCL_VERSION_CODE > 2700 @@ -87,7 +88,9 @@ NcclLogicalSendRecvState::NcclLogicalSendRecvState(user_op::KernelInitContext* c std::vector src_send_intersections; std::vector dst_recv_intersections; - GetRankSendRecvIntersection(parallel_id, parallel_desc_->hierarchy(), src_nd_sbp, dst_nd_sbp, + GetRankSendRecvIntersection(parallel_id, /*merge_parallel_desc=*/*parallel_desc_, + /*in_parallel_desc=*/*parallel_desc_, + /*out_parallel_desc=*/*parallel_desc_, src_nd_sbp, dst_nd_sbp, logical_shape, &src_send_intersections, &dst_recv_intersections); CHECK_EQ(src_send_intersections.size(), parallel_num); @@ -264,7 +267,10 @@ size_t InferTmpBufferSize(user_op::InferContext* ctx) { std::vector src_send_intersections; std::vector dst_recv_intersections; - GetRankSendRecvIntersection(parallel_id, ctx->parallel_desc().hierarchy(), src_nd_sbp, dst_nd_sbp, + const auto& parallel_desc = ctx->parallel_desc(); + GetRankSendRecvIntersection(parallel_id, /*merge_parallel_desc=*/parallel_desc, + /*in_parallel_desc=*/parallel_desc, + /*out_parallel_desc=*/parallel_desc, src_nd_sbp, dst_nd_sbp, logical_shape, &src_send_intersections, &dst_recv_intersections); int64_t buf_count = 0; CHECK_EQ(src_send_intersections.size(), parallel_num); From e426c52dd04209e3242180c1abbc2045930310a1 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Mon, 18 Jul 2022 19:40:07 +0800 Subject: [PATCH 36/45] Register nccl send recv op for comm init before graph build Co-author-by: Wenxiao --- .../graph/nccl_send_recv_boxing_task_node.cpp | 1 + oneflow/core/job/eager_nccl_comm_manager.cpp | 29 ++++++++++--------- oneflow/core/job/eager_nccl_comm_manager.h | 5 ++++ .../kernel/nccl_send_recv_boxing_kernel.cpp | 2 ++ 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp index d00e3a2cacd..fdd318dcfcb 100644 --- 
a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp +++ b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp @@ -75,6 +75,7 @@ void NcclSendRecvBoxingTaskNode::BuildExecGphAndRegst() { nccl_send_recv_boxing_conf->set_has_output(has_output_); std::shared_ptr sole_op = CHECK_JUST(ConstructOp(op_conf)); node->mut_op() = sole_op; + sole_op->FillOpParallelDesc(parallel_conf_); if (has_input_) { node->BindBnWithRegst(sole_op->SoleIbn(), GetSoleConsumedRegst("in")); } if (has_output_) { std::shared_ptr out_regst = GetProducedRegst("out"); diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index 846986b6012..a7646e2fcd6 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -14,12 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include "oneflow/core/control/ctrl_client.h" #include "oneflow/core/control/global_process_ctx.h" #include "oneflow/core/job/eager_nccl_comm_manager.h" #include "oneflow/core/device/nccl_util.h" #include "oneflow/core/job/id_manager.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/operator/op_conf.pb.h" #include "oneflow/core/vm/vm_util.h" #ifdef WITH_CUDA @@ -78,8 +80,15 @@ void CreateNcclComm(ncclComm_t* comm, const int dev, const std::string& key, << ", key = {" << key << "}\n"; } -bool NeedUnifiedNcclCommInit(const std::string& op_type_name) { - return UserKernelUnifiedNcclCommInitRegistry::Instance().IsRegistered(op_type_name); +bool NeedUnifiedNcclCommInit(const OperatorConf& op_conf) { + if (op_conf.has_user_conf()) { + return UserKernelUnifiedNcclCommInitRegistry::Instance().IsRegistered( + op_conf.user_conf().op_type_name()); + } else { + // Please check the .h file for hard-coding of the name + return UserKernelUnifiedNcclCommInitRegistry::Instance().IsRegistered( + "sys_op_" + std::to_string(op_conf.op_type_case())); + } } } // namespace @@ -169,18 +178,12 @@ void EagerNcclCommMgr::CreateCommFromPlan(const Plan& plan) { continue; } const auto& op_conf = op_attr->op_conf(); - ParallelConf parallel_conf; - if (op_conf.has_nccl_send_recv_boxing_conf()) { - parallel_conf = op_conf.nccl_send_recv_boxing_conf().parallel_conf(); - } else { - if (!op_conf.has_user_conf()) { continue; } - if (!NeedUnifiedNcclCommInit(op_conf.user_conf().op_type_name())) { continue; } - if (!op_attr->has_parallel_conf_signature()) { continue; } - if (!op_attr->parallel_conf_signature().has_op_parallel_conf()) { continue; } - parallel_conf = op_attr->parallel_conf_signature().op_parallel_conf(); - } + if (!NeedUnifiedNcclCommInit(op_conf)) { continue; } + if (!op_attr->has_parallel_conf_signature()) { continue; } + if (!op_attr->parallel_conf_signature().has_op_parallel_conf()) { continue; } + std::vector> device_vec; - ParallelDesc parallel_desc(parallel_conf); + ParallelDesc parallel_desc(op_attr->parallel_conf_signature().op_parallel_conf()); for (int64_t parallel_id = 0; parallel_id < parallel_desc.parallel_num(); ++parallel_id) { int64_t machine_id = CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_id)); int64_t device_id = CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_id)); diff --git a/oneflow/core/job/eager_nccl_comm_manager.h b/oneflow/core/job/eager_nccl_comm_manager.h index b57a2cd92fe..ac6e572424c 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.h +++ b/oneflow/core/job/eager_nccl_comm_manager.h @@ -89,6 +89,11 @@ class 
UserKernelUnifiedNcclCommInitRegistry final { static auto OF_PP_CAT(g_nccl_comm_reg_, __COUNTER__) = \ ::oneflow::UserKernelUnifiedNcclCommInitRegistry::Trigger(op_type_name) +#define REGISTER_SYSTEM_OP_KERNEL_UNIFIED_NCCL_COMM_INIT(op_type_case) \ + static auto OF_PP_CAT(g_nccl_comm_reg_, __COUNTER__) = \ + ::oneflow::UserKernelUnifiedNcclCommInitRegistry::Trigger("sys_op_" \ + + std::to_string(op_type_case)) + #endif // WITH_CUDA #endif // ONEFLOW_CORE_JOB_EAGER_NCCL_COMM_MANAGER_H_ diff --git a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp index c7b954bd9f9..6bb52bedbd6 100644 --- a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp +++ b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp @@ -249,6 +249,8 @@ void NcclSendRecvBoxingKernel::VirtualKernelInit(KernelContext* ctx) { REGISTER_KERNEL(OperatorConf::kNcclSendRecvBoxingConf, NcclSendRecvBoxingKernel); +REGISTER_SYSTEM_OP_KERNEL_UNIFIED_NCCL_COMM_INIT(OperatorConf::kNcclSendRecvBoxingConf); + } // namespace oneflow #endif // WITH_CUDA && NCCL_VERSION_CODE > 2700 From 9d2808e50f0ade28747f42476a4d021666556ad8 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Wed, 20 Jul 2022 05:25:53 +0000 Subject: [PATCH 37/45] Remove irrelevant scripts --- oneflow/ir/test/Frontend/test_iree_resnet.py | 107 ------------------- oneflow/ir/test/Frontend/test_iree_runner.py | 71 ------------ 2 files changed, 178 deletions(-) delete mode 100644 oneflow/ir/test/Frontend/test_iree_resnet.py delete mode 100644 oneflow/ir/test/Frontend/test_iree_runner.py diff --git a/oneflow/ir/test/Frontend/test_iree_resnet.py b/oneflow/ir/test/Frontend/test_iree_resnet.py deleted file mode 100644 index 885291f4251..00000000000 --- a/oneflow/ir/test/Frontend/test_iree_resnet.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -# RUN: python3 %s - -from oneflow_iree.compiler import Runner -from flowvision.models import resnet50 -import oneflow as flow -import oneflow.unittest -import unittest -import os -import numpy as np -import time - -os.environ["ONEFLOW_MLIR_ENABLE_ROUND_TRIP"] = "1" -os.environ["ONEFLOW_MLIR_ENABLE_CODEGEN_FUSERS"] = "1" - - -def _test_iree_resnet_cpu(test_case): - model = resnet50(pretrained=True) - model.eval() - - class GraphModuleForIree(flow.nn.Graph): - def __init__(self): - super().__init__() - self.model = model - - def build(self, x): - return self.model(x) - - class GraphModuleForOFMLIR(flow.nn.Graph): - def __init__(self): - super().__init__() - self.model = model - - def build(self, x): - return self.model(x) - - func = Runner(GraphModuleForIree, return_numpy=True) - input = flow.ones([1, 3, 224, 224]) - f = GraphModuleForOFMLIR() - for iter in range(2): - iree_output = func(input) - graph_output = f(input) - graph_output = graph_output.cpu().detach().numpy() - # the rtol accumulate layer by layer - test_case.assertTrue( - np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) - ) - - -def _test_iree_resnet_cuda(test_case): - model = resnet50(pretrained=True).cuda() - model.eval() - - class GraphModuleForIree(flow.nn.Graph): - def __init__(self): - super().__init__() - self.model = model - - def build(self, x): - return self.model(x) - - class GraphModuleForOFMLIR(flow.nn.Graph): - def __init__(self): - super().__init__() - self.model = model - - def build(self, x): - return self.model(x) - - func = Runner(GraphModuleForIree, return_numpy=True) - input = flow.ones([1, 3, 224, 224]).cuda() - f = GraphModuleForOFMLIR() - for iter in range(2): - iree_output = func(input) - graph_output = f(input) - graph_output = graph_output.cpu().detach().numpy() - # the rtol accumulate layer by layer - test_case.assertTrue( - np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) - ) - - -@flow.unittest.skip_unless_1n1d() -class TestIreeResnet(oneflow.unittest.TestCase): - def test_iree_resnet_cpu(test_case): - _test_iree_resnet_cpu(test_case) - - def test_iree_resnet_cuda(test_case): - _test_iree_resnet_cuda(test_case) - - -if __name__ == "__main__": - unittest.main() diff --git a/oneflow/ir/test/Frontend/test_iree_runner.py b/oneflow/ir/test/Frontend/test_iree_runner.py deleted file mode 100644 index a0caa90fecd..00000000000 --- a/oneflow/ir/test/Frontend/test_iree_runner.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -# RUN: python3 %s - -from oneflow_iree.compiler import Runner -import oneflow as flow -import oneflow.unittest -import unittest -import numpy as np - - -class RELU(flow.nn.Module): - def __init__(self): - super().__init__() - self.relu = flow.nn.ReLU() - - def forward(self, x): - return self.relu(x) - - -class GraphModule(flow.nn.Graph): - def __init__(self): - super().__init__() - self.fw = RELU() - - def build(self, x): - return self.fw(x) - - -def _test_check_iree_runner(test_case): - func = Runner(GraphModule, return_numpy=True).cuda() - # run on iree cuda backend - input = flow.Tensor([-1.0, 1.0]) - output = func(input) - test_case.assertTrue(np.allclose(output, [0.0, 1.0])) - # change input shape - input = flow.Tensor([-1.0, 1.0, -1]) - output = func(input) - test_case.assertTrue(np.allclose(output, [0.0, 1.0, 0.0])) - # change on iree cpu backend - func = func.cpu() - input = flow.Tensor([-1.0, 0.0, 1.0]) - output = func(input) - test_case.assertTrue(np.allclose(output, [0.0, 0.0, 1.0])) - # change input shape - input = flow.Tensor([-1, 1.0]) - output = func(input) - test_case.assertTrue(np.allclose(output, [0.0, 1.0])) - - -@flow.unittest.skip_unless_1n1d() -class TestCheckIreeRunner(oneflow.unittest.TestCase): - def test_check_iree_runner(test_case): - _test_check_iree_runner(test_case) - - -if __name__ == "__main__": - unittest.main() From b32c1332f1f34fbab61c668ad035c2440c9cf155 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 21 Jul 2022 14:17:40 +0800 Subject: [PATCH 38/45] Address suggestion and of format --- oneflow/core/framework/sbp_infer_util.cpp | 22 ++++++++++---------- oneflow/core/framework/sbp_infer_util.h | 10 ++++----- oneflow/core/job/eager_nccl_comm_manager.cpp | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 48f70316a86..c7a7b0e87ce 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -531,9 +531,9 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel && !NdSbpHasPartialParallel(consumer_sbp_parallel)) && producer_parallel_desc.device_type() == DeviceType::kCUDA && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { - return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, - logical_blob_desc, producer_parallel_desc, - consumer_parallel_desc) + return Cost4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, + logical_blob_desc, producer_parallel_desc, + consumer_parallel_desc) + GetTransferCost(); } #endif // WITH_CUDA @@ -695,9 +695,9 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, && !NdSbpHasPartialParallel(consumer_sbp_parallel)) && producer_parallel_desc.device_type() == DeviceType::kCUDA && consumer_parallel_desc.device_type() == DeviceType::kCUDA) { - return Ratio4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, - logical_blob_desc, producer_parallel_desc, - consumer_parallel_desc) + return Cost4GeneralBasicCommunication(producer_sbp_parallel, consumer_sbp_parallel, + logical_blob_desc, producer_parallel_desc, + consumer_parallel_desc) + GetTransferCost(); } #endif // WITH_CUDA @@ -804,11 +804,11 @@ double ComputeSbpInferPriority(const NdSbp& producer_nd_sbp, const NdSbp& consum // Cost = ratio * data amount // When we get the this function, either producer_sbp_parallel != consumer_sbp_parallel // or producer_parallel_desc != consumer_parallel_desc 
-double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, - const NdSbp& consumer_sbp_parallel, - const BlobDesc& logical_blob_desc, - const ParallelDesc& producer_parallel_desc, - const ParallelDesc& consumer_parallel_desc) { +double Cost4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, + const NdSbp& consumer_sbp_parallel, + const BlobDesc& logical_blob_desc, + const ParallelDesc& producer_parallel_desc, + const ParallelDesc& consumer_parallel_desc) { // The upper bound of the amount of the transferred data int32_t producer_partial_ratio = PartialRatio4Producer(producer_sbp_parallel, producer_parallel_desc); diff --git a/oneflow/core/framework/sbp_infer_util.h b/oneflow/core/framework/sbp_infer_util.h index d74ff8b9bf9..21d7da6ae90 100644 --- a/oneflow/core/framework/sbp_infer_util.h +++ b/oneflow/core/framework/sbp_infer_util.h @@ -108,11 +108,11 @@ double ComputeSbpInferPriority(const NdSbp& producer_sbp_parallel, // The transfer ratio for general basic communication // Cost = ratio * data amount -double Ratio4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, - const NdSbp& consumer_sbp_parallel, - const BlobDesc& logical_blob_desc, - const ParallelDesc& producer_parallel_desc, - const ParallelDesc& consumer_parallel_desc); +double Cost4GeneralBasicCommunication(const NdSbp& producer_sbp_parallel, + const NdSbp& consumer_sbp_parallel, + const BlobDesc& logical_blob_desc, + const ParallelDesc& producer_parallel_desc, + const ParallelDesc& consumer_parallel_desc); } // namespace oneflow diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index a7646e2fcd6..679e2d37f9f 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -181,7 +181,7 @@ void EagerNcclCommMgr::CreateCommFromPlan(const Plan& plan) { if (!NeedUnifiedNcclCommInit(op_conf)) { continue; } if (!op_attr->has_parallel_conf_signature()) { continue; } if (!op_attr->parallel_conf_signature().has_op_parallel_conf()) { continue; } - + std::vector> device_vec; ParallelDesc parallel_desc(op_attr->parallel_conf_signature().op_parallel_conf()); for (int64_t parallel_id = 0; parallel_id < parallel_desc.parallel_num(); ++parallel_id) { From d65ae0d7bd1d356fe5b6dd551143ef41a4392c9c Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 21 Jul 2022 18:56:56 +0800 Subject: [PATCH 39/45] Address suggestion --- oneflow/core/job/eager_nccl_comm_manager.cpp | 2 +- oneflow/core/job/eager_nccl_comm_manager.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index 679e2d37f9f..00ffc0bbb74 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -87,7 +87,7 @@ bool NeedUnifiedNcclCommInit(const OperatorConf& op_conf) { } else { // Please check the .h file for hard-coding of the name return UserKernelUnifiedNcclCommInitRegistry::Instance().IsRegistered( - "sys_op_" + std::to_string(op_conf.op_type_case())); + kSystemOpPrefix + std::to_string(op_conf.op_type_case())); } } diff --git a/oneflow/core/job/eager_nccl_comm_manager.h b/oneflow/core/job/eager_nccl_comm_manager.h index ac6e572424c..33b27e930a8 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.h +++ b/oneflow/core/job/eager_nccl_comm_manager.h @@ -83,15 +83,17 @@ class UserKernelUnifiedNcclCommInitRegistry final { std::set reg_set_; }; +static const std::string 
kSystemOpPrefix = "sys_op_"; + } // namespace oneflow #define REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT(op_type_name) \ static auto OF_PP_CAT(g_nccl_comm_reg_, __COUNTER__) = \ ::oneflow::UserKernelUnifiedNcclCommInitRegistry::Trigger(op_type_name) -#define REGISTER_SYSTEM_OP_KERNEL_UNIFIED_NCCL_COMM_INIT(op_type_case) \ - static auto OF_PP_CAT(g_nccl_comm_reg_, __COUNTER__) = \ - ::oneflow::UserKernelUnifiedNcclCommInitRegistry::Trigger("sys_op_" \ +#define REGISTER_SYSTEM_OP_KERNEL_UNIFIED_NCCL_COMM_INIT(op_type_case) \ + static auto OF_PP_CAT(g_nccl_comm_reg_, __COUNTER__) = \ + ::oneflow::UserKernelUnifiedNcclCommInitRegistry::Trigger(::oneflow::kSystemOpPrefix \ + std::to_string(op_type_case)) #endif // WITH_CUDA From 5869aa5d4f298dbfd4b90c125a17fe3c302b709d Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Thu, 21 Jul 2022 22:02:34 +0800 Subject: [PATCH 40/45] Static analysis --- .../core/operator/nccl_send_recv_boxing_op.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp index 2fdf31101d9..d0d3417c413 100644 --- a/oneflow/core/operator/nccl_send_recv_boxing_op.cpp +++ b/oneflow/core/operator/nccl_send_recv_boxing_op.cpp @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/common/container_util.h" #include "oneflow/core/operator/operator.h" #include "oneflow/core/common/protobuf.h" #include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h" @@ -75,7 +76,7 @@ Maybe NcclSendRecvBoxingOp::InferInternalBlobDescs( buf->set_data_type(in->data_type()); CHECK_EQ(src_send_intersections.size(), parallel_num); for (int64_t i = 0; i < parallel_num; ++i) { - const TensorSliceView& intersection = src_send_intersections.at(i); + const TensorSliceView& intersection = JUST(VectorAt(src_send_intersections, i)); if (!intersection.IsEmpty()) { buf_count += intersection.shape().elem_cnt(); } } } @@ -83,7 +84,7 @@ Maybe NcclSendRecvBoxingOp::InferInternalBlobDescs( const BlobDesc* out = GetBlobDesc4BnInOp("out"); buf->set_data_type(out->data_type()); for (int64_t i = 0; i < parallel_num; ++i) { - const TensorSliceView& intersection = dst_recv_intersections.at(i); + const TensorSliceView& intersection = JUST(VectorAt(dst_recv_intersections, i)); if (!intersection.IsEmpty()) { buf_count += intersection.shape().elem_cnt(); } } if (NdSbpHasPartialParallel(src_nd_sbp)) { @@ -109,26 +110,25 @@ Maybe NcclSendRecvBoxingOp::InferOutBlobDescs( const NcclSendRecvBoxingOpConf& conf = this->op_conf().nccl_send_recv_boxing_conf(); const Shape& logical_shape = Shape(conf.logical_shape()); const ParallelDesc& parallel_desc = ParallelDesc(conf.parallel_conf()); - const int64_t machine_id = - CHECK_JUST(parallel_desc.MachineId4ParallelId(parallel_ctx->parallel_id())); - const int64_t device_index = - CHECK_JUST(parallel_desc.DeviceId4ParallelId(parallel_ctx->parallel_id())); + const int64_t machine_id = JUST(parallel_desc.MachineId4ParallelId(parallel_ctx->parallel_id())); + const int64_t device_index = JUST(parallel_desc.DeviceId4ParallelId(parallel_ctx->parallel_id())); if (conf.has_input()) { const BlobDesc* in_blob_desc = GetBlobDesc4BnInOp("in"); const NdSbp& src_nd_sbp = conf.src_nd_sbp(); const ParallelDesc& src_parallel_desc = ParallelDesc(conf.src_parallel_conf()); int64_t src_parallel_id = - 
CHECK_JUST(src_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + JUST(src_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); std::shared_ptr in_shape = JUST(GetPhysicalShape(logical_shape, src_nd_sbp, src_parallel_desc, src_parallel_id)); - CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()); + CHECK_EQ_OR_RETURN(*in_shape, in_blob_desc->shape()) + << "Non-matching shape of blobs for pieces of nccl send recv"; } if (conf.has_output()) { BlobDesc* out_blob_desc = GetBlobDesc4BnInOp("out"); const NdSbp& dst_nd_sbp = conf.dst_nd_sbp(); const ParallelDesc& dst_parallel_desc = ParallelDesc(conf.dst_parallel_conf()); int64_t dst_parallel_id = - CHECK_JUST(dst_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); + JUST(dst_parallel_desc.ParallelId4MachineDeviceId(machine_id, device_index)); std::shared_ptr out_shape = JUST(GetPhysicalShape(logical_shape, dst_nd_sbp, dst_parallel_desc, dst_parallel_id)); out_blob_desc->mut_shape() = *out_shape; From 983ae756ac790f630c2ad0088613f8155c3d2c5c Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 22 Jul 2022 03:57:18 +0800 Subject: [PATCH 41/45] Static analysis. Still have another one --- oneflow/core/framework/sbp_infer_util.cpp | 2 +- oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index c7a7b0e87ce..639f3a05a26 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -62,7 +62,7 @@ double Penalty4PartialInConsumer(double logical_blob_size, int32_t producer_para } int32_t Ratio4Sbp(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc, - std::function classifier) { + const std::function& classifier) { int32_t ratio = 1; for (int32_t sbp_id = 0; sbp_id < nd_sbp.sbp_parallel_size(); sbp_id++) { if (classifier(nd_sbp.sbp_parallel(sbp_id))) { ratio *= parallel_desc.hierarchy()->At(sbp_id); } diff --git a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp index fdd318dcfcb..e6ab2530c36 100644 --- a/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp +++ b/oneflow/core/graph/nccl_send_recv_boxing_task_node.cpp @@ -75,7 +75,7 @@ void NcclSendRecvBoxingTaskNode::BuildExecGphAndRegst() { nccl_send_recv_boxing_conf->set_has_output(has_output_); std::shared_ptr sole_op = CHECK_JUST(ConstructOp(op_conf)); node->mut_op() = sole_op; - sole_op->FillOpParallelDesc(parallel_conf_); + CHECK_JUST(sole_op->FillOpParallelDesc(parallel_conf_)); if (has_input_) { node->BindBnWithRegst(sole_op->SoleIbn(), GetSoleConsumedRegst("in")); } if (has_output_) { std::shared_ptr out_regst = GetProducedRegst("out"); From 5b01f68166c0c92e710a936ce612d4c335625bb6 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Fri, 22 Jul 2022 11:28:39 +0800 Subject: [PATCH 42/45] Static analysis --- .../boxing/hierarchical_sub_task_graph_builder_impl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp index fc7e5ff39a5..7592e50c9f2 100644 --- a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp +++ b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp @@ -166,10 +166,11 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB } return 
BuildSubTskGphBuilderStatus("NDNcclSendRecvBoxingSubTskGphBuilder", ""); #else - return Error::BoxingNotSupportedError(); + return Error::BoxingNotSupportedError() << "No CUDA or low NCCL version"; #endif } else { - return Error::BoxingNotSupportedError(); + return Error::BoxingNotSupportedError() + << "Partial SBP in the consumer or not running on CUDA"; } } }; From cb446176f616e566c32852d79595a889add7c4dc Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Sat, 23 Jul 2022 00:22:46 +0800 Subject: [PATCH 43/45] Alleviate on test time --- python/oneflow/test/graph/test_comb1to2d.py | 3 --- python/oneflow/test/graph/test_comb2d.py | 5 +++-- python/oneflow/test/graph/test_gbc2d.py | 2 +- python/oneflow/test/graph/test_gbc2to1d.py | 2 +- python/oneflow/test/graph/test_gbc2to2d.py | 3 +++ 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/oneflow/test/graph/test_comb1to2d.py b/python/oneflow/test/graph/test_comb1to2d.py index eae8c04ec1d..2a38899c6c4 100644 --- a/python/oneflow/test/graph/test_comb1to2d.py +++ b/python/oneflow/test/graph/test_comb1to2d.py @@ -32,7 +32,6 @@ def forward(self, x): flow.sbp.split(0), flow.sbp.split(1), flow.sbp.split(2), - flow.sbp.split(3), ] for sbp1 in sbp_1ds: @@ -63,7 +62,6 @@ def forward(self, x): flow.sbp.split(0), flow.sbp.split(1), flow.sbp.split(2), - flow.sbp.split(3), ] for sbp1 in sbp_1ds: @@ -106,7 +104,6 @@ def test_lazy_boxing_2d_all_combination(test_case): 4, 12, 4, - 12, sbp=[flow.sbp.broadcast, flow.sbp.broadcast], placement=flow.placement( type="cuda", ranks=np.array(range(4)).reshape(2, 2) diff --git a/python/oneflow/test/graph/test_comb2d.py b/python/oneflow/test/graph/test_comb2d.py index 7b746017bdb..74f19d38bcc 100644 --- a/python/oneflow/test/graph/test_comb2d.py +++ b/python/oneflow/test/graph/test_comb2d.py @@ -32,7 +32,6 @@ def forward(self, x): flow.sbp.split(0), flow.sbp.split(1), flow.sbp.split(2), - flow.sbp.split(3), ] y = x @@ -40,6 +39,9 @@ def forward(self, x): for sbp2 in sbp_1ds: for sbp3 in sbp_1ds: + # in this case, use intra group boxing + if sbp1 == sbp3: + continue for sbp4 in sbp_1ds: # (2, 2) -> (2, 2) x = x.to_global(sbp=[sbp1, sbp2]) @@ -69,7 +71,6 @@ def test_lazy_boxing_2d_all_combination(test_case): 4, 4, 4, - 4, sbp=[flow.sbp.broadcast, flow.sbp.broadcast], placement=flow.placement( type="cuda", ranks=np.array(range(4)).reshape(2, 2) diff --git a/python/oneflow/test/graph/test_gbc2d.py b/python/oneflow/test/graph/test_gbc2d.py index efb564697fb..d08ce287d17 100644 --- a/python/oneflow/test/graph/test_gbc2d.py +++ b/python/oneflow/test/graph/test_gbc2d.py @@ -53,7 +53,7 @@ def _test_general_basic_communication_same_placement(test_case, src_nd_sbp, dst_ # input placement = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) - local_np = np.arange(12 * 12).reshape(12, 12) + local_np = np.arange(4 * 4).reshape(4, 4) x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement) # check eager boxing diff --git a/python/oneflow/test/graph/test_gbc2to1d.py b/python/oneflow/test/graph/test_gbc2to1d.py index 62903100dc0..95f74f97661 100644 --- a/python/oneflow/test/graph/test_gbc2to1d.py +++ b/python/oneflow/test/graph/test_gbc2to1d.py @@ -38,7 +38,7 @@ def _test_general_basic_communication_2d_to_1d(test_case, src_nd_sbp, dst_nd_sbp # input placement_x = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) placement_y = flow.placement("cuda", ranks=[0, 3, 4]) - local_np = np.arange(12 * 12).reshape(12, 12) + local_np = np.arange(12 * 4).reshape(12, 4) x = flow.tensor(local_np, sbp=src_nd_sbp, placement=placement_x) # 
check eager boxing diff --git a/python/oneflow/test/graph/test_gbc2to2d.py b/python/oneflow/test/graph/test_gbc2to2d.py index 4a978c6e1bd..5a2d00809e8 100644 --- a/python/oneflow/test/graph/test_gbc2to2d.py +++ b/python/oneflow/test/graph/test_gbc2to2d.py @@ -35,6 +35,9 @@ def _test_general_basic_communication_2d_to_2d(test_case, src_nd_sbp, dst_nd_sbp if flow.sbp.partial_sum() in dst_nd_sbp: return + if dst_nd_sbp[0] == dst_nd_sbp[1] and src_nd_sbp[0] == src_nd_sbp[1]: + return + # input placement_x = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) placement_y = flow.placement("cuda", ranks=[[0, 3, 4], [2, 5, 6]]) From 1deb764af492d83ef25826720ab81e72971f0cc6 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Sun, 24 Jul 2022 12:49:20 +0800 Subject: [PATCH 44/45] nccl logical send recv do not support different hierarchy --- oneflow/core/auto_parallel/boxing_collector.cpp | 3 ++- oneflow/core/framework/sbp_infer_util.cpp | 6 ++++-- python/oneflow/test/graph/test_comb1to2d.py | 6 ++++++ python/oneflow/test/graph/test_comb2d.py | 6 ++++++ python/oneflow/test/modules/test_comb2to2d.py | 6 ++++++ 5 files changed, 24 insertions(+), 3 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 55e555801f1..9a601fc0aeb 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -573,7 +573,8 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const static const bool enable_general_basic_communication = ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if ((Singleton::Get()->nccl_use_compute_stream() + if (((Singleton::Get()->nccl_use_compute_stream() + && producer_parallel_desc == consumer_parallel_desc) || enable_general_basic_communication) && (!NdSbpHasPartialParallel(sbp_consumer)) && producer_parallel_desc.device_type() == DeviceType::kCUDA diff --git a/oneflow/core/framework/sbp_infer_util.cpp b/oneflow/core/framework/sbp_infer_util.cpp index 639f3a05a26..2687433c9ef 100644 --- a/oneflow/core/framework/sbp_infer_util.cpp +++ b/oneflow/core/framework/sbp_infer_util.cpp @@ -526,7 +526,8 @@ Maybe ComputeLazyCopyCostBetweenNdSbp(const NdSbp& producer_sbp_parallel static const bool enable_general_basic_communication = ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if (((Singleton::Get()->nccl_use_compute_stream() + if ((((Singleton::Get()->nccl_use_compute_stream() + && producer_parallel_desc == consumer_parallel_desc) || enable_general_basic_communication) && !NdSbpHasPartialParallel(consumer_sbp_parallel)) && producer_parallel_desc.device_type() == DeviceType::kCUDA @@ -690,7 +691,8 @@ Maybe ComputeCopyCostWithMiddleNodes(const NdSbp& producer_sbp_parallel, static const bool enable_general_basic_communication = ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer - if (((Singleton::Get()->nccl_use_compute_stream() + if ((((Singleton::Get()->nccl_use_compute_stream() + && producer_parallel_desc == consumer_parallel_desc) || enable_general_basic_communication) && !NdSbpHasPartialParallel(consumer_sbp_parallel)) && producer_parallel_desc.device_type() == DeviceType::kCUDA diff --git a/python/oneflow/test/graph/test_comb1to2d.py b/python/oneflow/test/graph/test_comb1to2d.py index 
2a38899c6c4..bd9db7c69a0 100644 --- a/python/oneflow/test/graph/test_comb1to2d.py +++ b/python/oneflow/test/graph/test_comb1to2d.py @@ -24,6 +24,12 @@ import oneflow.unittest +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" +os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "0" + +flow.boxing.nccl.enable_use_compute_stream(False) + + class _TestModuleDiffHierarchy(nn.Module): def forward(self, x): sbp_1ds = [ diff --git a/python/oneflow/test/graph/test_comb2d.py b/python/oneflow/test/graph/test_comb2d.py index 74f19d38bcc..f4ea5fa2d37 100644 --- a/python/oneflow/test/graph/test_comb2d.py +++ b/python/oneflow/test/graph/test_comb2d.py @@ -24,6 +24,12 @@ import oneflow.unittest +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" +os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "0" + +flow.boxing.nccl.enable_use_compute_stream(False) + + class _TestModule(nn.Module): def forward(self, x): sbp_1ds = [ diff --git a/python/oneflow/test/modules/test_comb2to2d.py b/python/oneflow/test/modules/test_comb2to2d.py index dc05016242a..670f20885c4 100644 --- a/python/oneflow/test/modules/test_comb2to2d.py +++ b/python/oneflow/test/modules/test_comb2to2d.py @@ -24,6 +24,12 @@ import oneflow.unittest +os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" +os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "0" + +flow.boxing.nccl.enable_use_compute_stream(False) + + class _TestModuleDiffHierarchy(nn.Module): def forward(self, x): sbp_1ds = [ From 2abf19406b367ecbf29efd38968a8577b6638993 Mon Sep 17 00:00:00 2001 From: Yipeng Li Date: Tue, 26 Jul 2022 00:59:26 +0800 Subject: [PATCH 45/45] Init boxing collector when asked --- .../core/auto_parallel/boxing_collector.cpp | 29 ++++++++++++++- oneflow/core/auto_parallel/boxing_collector.h | 5 +++ .../job_rewriter/boxing_with_middle_nodes.cpp | 37 ------------------- python/oneflow/test/graph/test_comb1to2d.py | 4 +- 4 files changed, 34 insertions(+), 41 deletions(-) diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp index 9a601fc0aeb..a0c2f44b21e 100644 --- a/oneflow/core/auto_parallel/boxing_collector.cpp +++ b/oneflow/core/auto_parallel/boxing_collector.cpp @@ -147,6 +147,8 @@ Maybe BoxingCollector::Init(int32_t max_axis) { JUST(GenerateCombination4SamePlacement(3)); JUST(GenerateCombination4DiffHierarchy(this, this)); JUST(GenerateCombination4DiffPlacement(this, this)); + init_type_ = int32_t(enable_general_basic_communication + || Singleton::Get()->nccl_use_compute_stream()); return Maybe::Ok(); } @@ -161,6 +163,8 @@ Maybe BoxingCollector::Init(const BlobDesc& logical_blob_desc, // Get copy cost in lazy mode LazyMode::Guard enable_lazy_mode(true); JUST(GenerateCombination4SamePlacement(5, logical_blob_desc, parallel_desc)); + init_type_ = int32_t(enable_general_basic_communication + || Singleton::Get()->nccl_use_compute_stream()); return Maybe::Ok(); } @@ -228,6 +232,7 @@ void BoxingCollector::GenerateMap1d2nd() { // Generate the id Map from 1d sbp to nd sbp NdSbp nd_sbp; for (int32_t dim_sbp = 0; dim_sbp < hierarchy_num_; dim_sbp++) { nd_sbp.add_sbp_parallel(); } + id_1d_2_nd_.clear(); id_1d_2_nd_.resize(m, -1); for (int32_t id_1d = 0; id_1d < m; id_1d++) { for (int32_t dim_sbp = 0; dim_sbp < hierarchy_num_; dim_sbp++) { @@ -262,7 +267,9 @@ Maybe BoxingCollector::GenerateCombination4SamePlacement(int32_t max_middl const ParallelDesc& parallel_desc) { // Store the origin transfer cost information int32_t n = 
nd_sbp_lists_.size(); + minimum_copy_cost_.clear(); minimum_copy_cost_.resize(n); + middle_nodes_.clear(); middle_nodes_.resize(n); for (int32_t i = 0; i < n; i++) { minimum_copy_cost_[i].resize(n); @@ -349,6 +356,7 @@ Maybe BoxingCollector::GenerateCombination4DiffHierarchy( // Search the path that contains one of the diagonal sbp int32_t n = nd_sbp_lists_.size(); + diag_node_diff_hierarchy_.clear(); diag_node_diff_hierarchy_.resize(n); for (int32_t i = 0; i < n; i++) { diag_node_diff_hierarchy_[i].resize(n); @@ -395,6 +403,7 @@ Maybe BoxingCollector::ComputeCostFor1DSbpDiffPlacement( // Number of 1d sbp int32_t m = id2sbp_parallel_.size(); // Compute the cost while transferring a 1D sbp between different placements + cost_4_diff_placement.clear(); cost_4_diff_placement.resize(m); for (int32_t id_1d_producer = 0; id_1d_producer < m; id_1d_producer++) { cost_4_diff_placement[id_1d_producer].resize(m, GetMaxVal()); @@ -425,6 +434,7 @@ Maybe BoxingCollector::GenerateCombination4DiffPlacement( // Search the path that contains two of the diagonal sbp int32_t n = nd_sbp_lists_.size(); + diag_node_diff_placement_.clear(); diag_node_diff_placement_.resize(n); for (int32_t i = 0; i < n; i++) { diag_node_diff_placement_[i].resize(n); @@ -570,8 +580,6 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const } #ifdef WITH_CUDA - static const bool enable_general_basic_communication = - ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); // Use a general basic communication if no P in the consumer if (((Singleton::Get()->nccl_use_compute_stream() && producer_parallel_desc == consumer_parallel_desc) @@ -592,6 +600,23 @@ Maybe BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const } #endif // WITH_CUDA + if (JUST(ComputeLazyCopyCostBetweenNdSbp(sbp_producer, sbp_consumer, logical_blob_desc, + producer_parallel_desc, consumer_parallel_desc, + /*requires_same_sbp=*/false)) + < GetValidMaxCopyCost()) { + return Maybe::Ok(); + } else { + int32_t require_init_type = + int32_t(enable_general_basic_communication + || Singleton::Get()->nccl_use_compute_stream()); + if (init_type_ != require_init_type) { + // We assemble the boxing table from S(0) to S(5). + // Those splitting in higher axes are considered in the customized boxing. 
+ constexpr int32_t kRegularMaxSplitAxes = 6; + JUST(Init(kRegularMaxSplitAxes)); + } + } + // Middle nodes algorithm supports transfer for different machines or devices or hierarchies if (producer_parallel_desc != consumer_parallel_desc) { JUST(AskSbpCombination4DiffPlacement(sbp_producer, sbp_consumer, logical_blob_desc, diff --git a/oneflow/core/auto_parallel/boxing_collector.h b/oneflow/core/auto_parallel/boxing_collector.h index c0fda578dd3..4661d6feb32 100644 --- a/oneflow/core/auto_parallel/boxing_collector.h +++ b/oneflow/core/auto_parallel/boxing_collector.h @@ -163,6 +163,11 @@ class BoxingCollector final { std::vector id_1d_2_nd_; // The sbp size in the combination table int32_t hierarchy_num_; + // How the boxing collector is initialized + int32_t init_type_ = -1; + // Enable general basic communication or not + const bool enable_general_basic_communication = + ParseBooleanFromEnv("ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION", false); }; // class BoxingCollector } // namespace oneflow diff --git a/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp b/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp index 459d87f2ef6..79fb1fb429d 100644 --- a/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp +++ b/oneflow/core/job_rewriter/boxing_with_middle_nodes.cpp @@ -25,37 +25,6 @@ limitations under the License. namespace oneflow { -namespace { -bool NeedBoxingCollector(const OpGraph& op_graph) { - bool need_boxing_collector = false; - op_graph.ForEachNode([&](const OpNode* node) { - if (need_boxing_collector) { return; } - OperatorConf::OpTypeCase op_type_case = node->op().op_conf().op_type_case(); - if (IsClassRegistered(op_type_case)) { return; } - for (const std::string& ibn : node->op().input_bns()) { - const LogicalBlobId& lbi = node->op().BnInOp2Lbi(ibn); - const OpNode& producer = node->ProducerOpNode4Lbi(lbi); - const NdSbp& producer_nd_sbp = producer.NdSbp4Lbi(lbi); - const NdSbp& consumer_nd_sbp = node->NdSbp4BnInOp(ibn); - // If dealing with different placement - if (producer.parallel_desc().parallel_num() != 1 - || node->parallel_desc().parallel_num() != 1) { - const auto& logical_blob_desc = producer.LogicalBlobDesc4Lbi(lbi); - if (CHECK_JUST(ComputeLazyCopyCostBetweenNdSbp(producer_nd_sbp, consumer_nd_sbp, - logical_blob_desc, producer.parallel_desc(), - node->parallel_desc(), - /*requires_same_sbp=*/false)) - > GetValidMaxCopyCost()) { - need_boxing_collector = true; - return; - } - } - } - }); - return need_boxing_collector; -} -} // namespace - Maybe BoxingWithMiddleNodes(const OpGraph& op_graph, JobBuilder* job_builder) { // Not allowed two-step boxing and disable checking for debugging if (ParseBooleanFromEnv("ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK", false)) { @@ -63,12 +32,6 @@ Maybe BoxingWithMiddleNodes(const OpGraph& op_graph, JobBuilder* job_build } // Initialize boxing collector BoxingCollector boxing_collector; - if (NeedBoxingCollector(op_graph)) { - // We assemble the boxing table from S(0) to S(5). - // Those splitting in higher axes are considered in the customized boxing. 
- constexpr int32_t kRegularMaxSplitAxes = 6; - JUST(boxing_collector.Init(kRegularMaxSplitAxes)); - } std::vector middle_sbps; HashMap op_node2op_conf; // Fill other unsupported combinations diff --git a/python/oneflow/test/graph/test_comb1to2d.py b/python/oneflow/test/graph/test_comb1to2d.py index bd9db7c69a0..cce4d3292de 100644 --- a/python/oneflow/test/graph/test_comb1to2d.py +++ b/python/oneflow/test/graph/test_comb1to2d.py @@ -27,8 +27,6 @@ os.environ["ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK"] = "0" os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "0" -flow.boxing.nccl.enable_use_compute_stream(False) - class _TestModuleDiffHierarchy(nn.Module): def forward(self, x): @@ -116,6 +114,8 @@ def test_lazy_boxing_2d_all_combination(test_case): ), ) + flow.boxing.nccl.enable_use_compute_stream(False) + model_diff_hierarchy = _TestModuleDiffHierarchy() graph_diff_hierarchy = _TestGraph(model_diff_hierarchy) y = graph_diff_hierarchy(x)
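
For reference, the general basic communication path introduced across these patches is gated in C++ by the ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION environment variable (parsed with ParseBooleanFromEnv, default false) or, when producer and consumer share the same placement, by nccl_use_compute_stream(); it is only taken on CUDA and when the consumer has no partial-sum SBP. A minimal opt-in sketch follows, using only the flag names and the Python API that appear in the diffs above; since the C++ side caches the env flag in a static const on first use, it has to be set before the graph is built.

    import os

    # Opt in to the general basic communication boxing path
    # (read once by ParseBooleanFromEnv on the C++ side).
    os.environ["ONEFLOW_BOXING_ENABLE_GENERAL_BASIC_COMMUNICATION"] = "1"

    import oneflow as flow

    # Alternative trigger checked in sbp_infer_util.cpp / boxing_collector.cpp;
    # with this series it only applies when producer and consumer placements match.
    flow.boxing.nccl.enable_use_compute_stream(True)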