Remove some welford specific logic. #1864

Merged (4 commits) · Jul 25, 2022
Changes from 2 commits
6 changes: 0 additions & 6 deletions torch/csrc/jit/codegen/cuda/arith.cpp
@@ -1382,12 +1382,6 @@ WelfordResult::WelfordResult(
TORCH_INTERNAL_ASSERT(avg->definition()->sameAs(n->definition()));
}

WelfordResult WelfordResult::rFactor(const std::vector<int>& axes) {
auto o_tv = avg->definition()->as<WelfordOp>()->out()->as<TensorView>();
auto rf_tvs = o_tv->rFactor(axes, std::vector<TensorView*>{avg, var_sum, n});
return WelfordResult{rf_tvs.at(0), rf_tvs.at(1), rf_tvs.at(2)};
}

// COMPOUND OPERATIONS

// add_alpha
2 changes: 0 additions & 2 deletions torch/csrc/jit/codegen/cuda/arith.h
@@ -107,8 +107,6 @@ class TORCH_CUDA_CU_API WelfordResult {
TensorView* in_avg,
TensorView* in_var_sum,
TensorView* in_n);

WelfordResult rFactor(const std::vector<int>& axes);
};

//! Welford operator on specified axes. This is currently the only scan op with
9 changes: 0 additions & 9 deletions torch/csrc/jit/codegen/cuda/fusion.cpp
@@ -230,15 +230,6 @@ void Fusion::addOutput(Val* output) {
all_tv_uses_valid_ = false;
}

void Fusion::addOutput(WelfordResult& wr) {
// Want to always make sure the avg gets added last
// since avg will be the out() value of welfordOp,
// and want to make it the top of the computeAt chain
addOutput(wr.var_sum);
addOutput(wr.n);
addOutput(wr.avg);
}

void Fusion::removeInput(Val* input) {
auto find_input = std::find(inputs_.begin(), inputs_.end(), input);
if (find_input != inputs_.end()) {
3 changes: 0 additions & 3 deletions torch/csrc/jit/codegen/cuda/fusion.h
@@ -110,9 +110,6 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
//! Register output as an output of the fusion
void addOutput(Val* output);

//! Register output as an output of the fusion
void addOutput(WelfordResult& output);

//! Deregister input as an input of the fusion
void removeInput(Val* input);

41 changes: 26 additions & 15 deletions torch/csrc/jit/codegen/cuda/ir_utils.cpp
@@ -3,6 +3,7 @@
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower_utils.h>

#include <set>

@@ -473,25 +474,23 @@ TensorView* rfactorHelper(
TensorView* reduction_tv,
const std::vector<int>& axes) {
TORCH_INTERNAL_ASSERT(reduction_tv->definition() != nullptr);
const bool is_welford = reduction_tv->definition()->isA<WelfordOp>();
if (!is_welford) {
const bool has_multiple_tvs = reduction_tv->definition()->inputs().size() > 1;
if (!has_multiple_tvs) {
return reduction_tv->rFactor(axes);
}
auto welford = reduction_tv->definition()->as<WelfordOp>();
auto w_avg = welford->outAvg()->as<TensorView>();
auto w_var = welford->outVar()->as<TensorView>();
auto w_n = welford->outN()->as<TensorView>();

auto rtvs =
reduction_tv->rFactor(axes, std::vector<TensorView*>{w_avg, w_var, w_n});
std::vector<TensorView*> out_tvs;
std::transform(
reduction_tv->definition()->outputs().begin(),
reduction_tv->definition()->outputs().end(),
std::back_inserter(out_tvs),
[](Val* val) { return val->as<TensorView>(); });

if (reduction_tv == w_n) {
return rtvs.at(2);
} else if (reduction_tv == w_var) {
return rtvs.at(1);
} else {
return rtvs.at(0);
}
auto rf_tvs = reduction_tv->rFactor(axes, out_tvs);

return rf_tvs.at(std::distance(
out_tvs.begin(),
std::find(out_tvs.begin(), out_tvs.end(), reduction_tv)));
}

namespace {
@@ -809,6 +808,18 @@ Val* getReductionInitValOf(TensorView* tv) {
return init;
}

// TODO: Should mma be in here? Should we return true if it's a trivial
// reduction?
bool isReductionOp(const Expr* expr) {
// Note that GridReduction inherits ReductionOp
return expr->isA<ReductionOp>() || expr->isA<GroupedReductionOp>() ||
expr->isA<WelfordOp>() || expr->isA<kir::GridWelford>();
}

bool isReductionTvOp(const Expr* expr) {
return ir_utils::isTvOp(expr) && isReductionOp(expr);
}

namespace {

struct ReplaceValInIndexVal : public OptInDispatch {
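A minimal standalone sketch of the lookup pattern used by the generalized rfactorHelper above: the rfactored tensors come back positionally aligned with the definition's outputs, so the index of reduction_tv among the outputs is also the index of its rfactor counterpart. The Tensor struct here is a hypothetical stand-in for nvFuser's TensorView, purely for illustration.

// Sketch only: Tensor stands in for TensorView; not nvFuser code.
#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

struct Tensor {
  int id;
};

// Return the rfactored tensor corresponding to a given output of a
// multi-output reduction, relying on positional alignment of the two lists.
Tensor* matchingRFactorOutput(
    const std::vector<Tensor*>& def_outputs, // e.g. {avg, var_sum, n}
    const std::vector<Tensor*>& rf_outputs,  // rfactor results, same order
    Tensor* reduction_tv) {
  auto it = std::find(def_outputs.begin(), def_outputs.end(), reduction_tv);
  assert(it != def_outputs.end() && "reduction_tv must be one of the outputs");
  return rf_outputs.at(std::distance(def_outputs.begin(), it));
}

int main() {
  Tensor avg{0}, var_sum{1}, n{2};
  Tensor rf_avg{10}, rf_var{11}, rf_n{12};
  std::vector<Tensor*> outs{&avg, &var_sum, &n};
  std::vector<Tensor*> rfs{&rf_avg, &rf_var, &rf_n};
  // Asking for var_sum yields the rfactored var_sum, mirroring the new
  // std::distance/std::find logic in rfactorHelper.
  assert(matchingRFactorOutput(outs, rfs, &var_sum) == &rf_var);
  return 0;
}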
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/ir_utils.h
@@ -307,6 +307,12 @@ TORCH_CUDA_CU_API std::vector<Expr*> getReductionOps(
// Returns the initialization value of tv or nullptr if not initialized.
TORCH_CUDA_CU_API Val* getReductionInitValOf(TensorView* tv);

// Returns if Expr is a reduction op
TORCH_CUDA_CU_API bool isReductionOp(const Expr*);

// Returns if Expr is a reduction op with TensorView or TensorIndex
TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*);

template <typename T>
std::string toString(const T& nodes) {
std::stringstream ss;
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
@@ -213,6 +213,7 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) {
// Change for welford Op, we want the users of all outputs of welfordOp
// to use a single predicate name.
if (auto tv_def = tv_inp->definition()) {
// TODO: Do we need to do anything for grouped reduction here?
Collaborator: Is this necessary for WelfordOp? The maps of ThreadPredicateMap have mappings for all outputs: https://github.com/csarofeen/pytorch/blob/devel/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp#L281-L285

Collaborator: Commented out this part, and nothing seems to fail.

Owner Author: Why are your comments not showing up inline in the files page? Strange.

Owner Author: Comes from:
https://github.com/csarofeen/pytorch/pull/561/files#diff-48ec14efa321f9f6f479de4d2c9e377c847067825513a7231d94200d8ea60efaR141-R149

It doesn't seem to be related to correctness; the intent is just to use one predicate for all outputs. It moves tv_inp from something like WelfordResult::var_sum to WelfordResult::avg so that tv_inp is consistent when you hit:

const auto& pred_info = at(tv_inp);

If tv_inp is the result of a multi-output expression, the same pred_info comes up for all those siblings.

Owner Author (@csarofeen, Jul 24, 2022): I'm going to update this logic, but once we clean up predicate handling based on the ID graph we can remove this type of logic.

if (auto wop = dynamic_cast<WelfordOp*>(tv_def)) {
tv_inp = wop->out()->as<TensorView>();
}
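A rough, self-contained sketch (not nvFuser code; all names here are hypothetical) of the behavior discussed in the thread above: if every sibling output of a multi-output expression is normalized to one representative value before the predicate-map lookup, all siblings read the same predicate entry, which is what redirecting tv_inp to the op's out() achieves.

// Conceptual sketch only; Val/Expr are hypothetical placeholders.
#include <cassert>
#include <string>
#include <unordered_map>

struct Expr;

struct Val {
  Expr* definition = nullptr;
};

struct Expr {
  Val* canonical_out = nullptr; // e.g. the avg output of a Welford-like op
};

// Normalize any sibling output to the representative output of its definition.
const Val* representative(const Val* v) {
  if (v->definition != nullptr && v->definition->canonical_out != nullptr) {
    return v->definition->canonical_out;
  }
  return v;
}

int main() {
  Expr welford;
  Val avg, var_sum, n;
  avg.definition = var_sum.definition = n.definition = &welford;
  welford.canonical_out = &avg;

  std::unordered_map<const Val*, std::string> pred_map;
  pred_map[representative(&avg)] = "one predicate shared by all siblings";

  // Looking up any sibling resolves to the same map entry.
  assert(pred_map.at(representative(&var_sum)) ==
         pred_map.at(representative(&n)));
  return 0;
}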
41 changes: 29 additions & 12 deletions torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp
@@ -23,29 +23,46 @@ bool traverseToRFactorTensor(TensorView* tv, IterDomain* root_id) {
TORCH_INTERNAL_ASSERT(
root_id->definition() == nullptr, "Not root IterDomain: ", root_id);

if (tv->definition() == nullptr) {
auto def = tv->definition();

if (def == nullptr) {
// This is an input tensor, so no rfactor tensor to traverse.
return false;
}

const auto& inputs = tv->definition()->inputs();

// Check the reduction expression that produces tv
if (inputs.size() != 1 || !inputs[0]->isA<TensorView>() ||
(tv->definition()->getExprType() != ExprType::ReductionOp &&
tv->definition()->getExprType() != ExprType::WelfordOp)) {
// No rfactor producer found
if (!ir_utils::isReductionOp(def)) {
return false;
}

auto producer = inputs[0]->as<TensorView>();
// Find the corresponding input TV. Note that the reduction expr may
// have multiple inputs.
auto producer = def->inputs().at(std::distance(
def->outputs().begin(),
std::find(def->outputs().begin(), def->outputs().end(), tv)));

auto producer_tv = dynamic_cast<TensorView*>(producer);

// WelfordOp may have an Int input. Traverse to the avg input
if (def->isA<WelfordOp>() && producer_tv == nullptr) {
Owner Author: Why do we need to grab the "right" producer? Can't we just take the first TV input? They should have to be aligned to be siblings.

Collaborator: That should be fine with WelfordOp, but in GroupedReductionOp, in theory, the input tensors just need to have the same shape. It should be fine for some of them to have rfactor domains, although the current validation may not be flexible enough to accept such a case. So, picking the right input could be important.

Owner Author: How would some reductions have rfactor and others not with grouped reduction? I assume you'd have to have some interesting view op in the DAG?

Owner Author: Will revisit this again.

Collaborator: groupReductions can group an arbitrary set of ReductionOp exprs as long as they have the same input shape. I don't know if it could ever happen in practice, but it is in theory possible to group a reduction of a post-view tensor and another reduction of a tensor that has the same shape as the post-view tensor.

TORCH_INTERNAL_ASSERT(
producer == def->as<WelfordOp>()->inVar() ||
producer == def->as<WelfordOp>()->inN(),
"Invalid expr: ",
def->toString(),
", out TV: ",
tv->toString());
producer_tv = def->as<WelfordOp>()->inAvg()->as<TensorView>();
}

TORCH_INTERNAL_ASSERT(producer_tv != nullptr);

if (!producer->hasRFactor()) {
if (!producer_tv->hasRFactor()) {
return false;
}

auto c2p = PairwiseRootDomainMap(producer, tv)
.mapConsumerToProducer(tv->domain(), producer->domain());
auto c2p = PairwiseRootDomainMap(producer_tv, tv)
.mapConsumerToProducer(tv->domain(), producer_tv->domain());

auto producer_id_it = c2p.find(root_id);
if (producer_id_it == c2p.end()) {
@@ -55,7 +72,7 @@ bool traverseToRFactorTensor(TensorView* tv, IterDomain* root_id) {

auto producer_root_id = producer_id_it->second;

return analyzeIfDerivedFromTrivialReduction(producer, producer_root_id);
return analyzeIfDerivedFromTrivialReduction(producer_tv, producer_root_id);
}

bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id) {
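A simplified sketch of the producer selection added to traverseToRFactorTensor above: the producer input is taken at the same position as the consumer output, and a Welford-like op whose var/N inputs may be scalars falls back to traversing through the avg input. ReductionExpr and Input are made-up placeholder types, not nvFuser IR.

// Sketch only: placeholder types, not nvFuser IR classes.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iterator>
#include <vector>

struct TensorView {};

struct Input {
  TensorView* tv = nullptr; // non-null if this input is a TensorView
};

struct ReductionExpr {
  std::vector<TensorView*> outputs; // e.g. {out_avg, out_var, out_N}
  std::vector<Input> inputs;        // positionally aligned with outputs
  bool is_welford = false;
  std::size_t avg_input_index = 0;
};

TensorView* producerTvOf(const ReductionExpr& def, TensorView* out_tv) {
  // Pick the input sitting at the same position as the output we came from.
  auto pos = std::distance(
      def.outputs.begin(),
      std::find(def.outputs.begin(), def.outputs.end(), out_tv));
  TensorView* producer_tv = def.inputs.at(pos).tv;
  // The Welford var/N inputs may be scalars; traverse through avg instead.
  if (def.is_welford && producer_tv == nullptr) {
    producer_tv = def.inputs.at(def.avg_input_index).tv;
  }
  assert(producer_tv != nullptr);
  return producer_tv;
}

int main() {
  TensorView in_avg, out_avg, out_var, out_n;
  ReductionExpr welford;
  welford.outputs = {&out_avg, &out_var, &out_n};
  welford.inputs = {{&in_avg}, {nullptr}, {nullptr}}; // var/N inputs are scalars
  welford.is_welford = true;
  // Traversal from the N output still lands on the avg input tensor.
  assert(producerTvOf(welford, &out_n) == &in_avg);
  return 0;
}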
10 changes: 0 additions & 10 deletions torch/csrc/jit/codegen/cuda/lower_utils.cpp
@@ -183,16 +183,6 @@ TensorView* getTvOutput(const Expr* expr) {
return nullptr;
}

bool isReductionOp(const Expr* expr) {
// Note that GridReduction inherits ReductionOp
return expr->isA<ReductionOp>() || expr->isA<GroupedReductionOp>() ||
expr->isA<WelfordOp>() || expr->isA<kir::GridWelford>();
}

bool isReductionTvOp(const Expr* expr) {
return isTvOp(expr) && isReductionOp(expr);
}

bool isScalarOp(const Expr* expr) {
for (auto out : expr->outputs())
if (!out->isScalar())
6 changes: 0 additions & 6 deletions torch/csrc/jit/codegen/cuda/lower_utils.h
@@ -79,12 +79,6 @@ TORCH_CUDA_CU_API bool isTvOp(const Expr*);
// Returns the first output of Expr that is a TensorView
TORCH_CUDA_CU_API TensorView* getTvOutput(const Expr*);

// Returns if Expr is a reduction op
TORCH_CUDA_CU_API bool isReductionOp(const Expr*);

// Returns if Expr is a reduction op with TensorView or TensorIndex
TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*);

bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map);

//! Returns the iterdomain that maps to the thread dimension grouped
3 changes: 1 addition & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
@@ -822,8 +822,7 @@ TORCH_CUDA_CU_API c10::optional<ReductionParams> getPersistentHeuristics(

TORCH_INTERNAL_ASSERT(
red_expr->getExprType() != c10::nullopt &&
(red_expr->getExprType().value() == ExprType::ReductionOp ||
red_expr->getExprType().value() == ExprType::WelfordOp),
ir_utils::isReductionOp(red_expr),
"TensorView doesn't have a reduction.");

auto tv_inps = ir_utils::filterByType<TensorView>(fusion->inputs());
3 changes: 1 addition & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
@@ -908,8 +908,7 @@ TORCH_CUDA_CU_API c10::optional<ReductionParams> getReductionHeuristics(

TORCH_INTERNAL_ASSERT(
red_expr->getExprType() != c10::nullopt &&
(red_expr->getExprType().value() == ExprType::ReductionOp ||
red_expr->getExprType().value() == ExprType::WelfordOp),
ir_utils::isReductionOp(red_expr),
"TensorView doesn't have a reduction.");

auto properties =
29 changes: 9 additions & 20 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -239,8 +239,8 @@ class SchedulerTopologyChecker {
static bool hasPostReductionBCast(Fusion* fusion) {
auto all_vals = fusion->usedMathVals();
for (auto tv : ir_utils::filterByType<TensorView>(all_vals)) {
// Welford can have 2 outputs, so do this on all found reduction tensor
// views
// Reductions can have multiple outputs, so do this on all found reduction
// tensor views
if (tv->hasReduction() && !tv->isFusionInput()) {
auto tv_chains = tvChains(DependencyCheck::getAllUseChains(tv));
// Propagate forward from reduction through all uses of the reduction
@@ -301,18 +301,17 @@

// When checking post reduction vals, we need to make sure
// we are really checking paths starting from all outputs
// of multi-output reductions, i.e. welford. The reduction_tv
// vector is assumed to only have one of them.
// of multi-output reductions, i.e. welford/grouped reduction. The
// reduction_tv vector is assumed to only have one of them.
std::unordered_set<Val*> reduction_tv_set(
reduction_tvs.begin(), reduction_tvs.end());

for (auto red : reduction_tvs) {
if (red->definition()) {
if (auto wop = dynamic_cast<WelfordOp*>(red->definition())) {
for (auto wop_output : wop->outputs()) {
if (wop_output->isA<TensorView>()) {
reduction_tv_set.insert(wop_output);
}
if (ir_utils::isReductionOp(red->definition())) {
auto outs = red->definition()->outputs();
for (auto out_tv : ir_utils::filterByType<TensorView>(outs)) {
reduction_tv_set.insert(out_tv);
}
}
}
@@ -1000,9 +999,8 @@ class PointWiseScheduler : public SchedulerEntry {

auto reduction_ops =
ir_utils::getReductionOps(fusion, true /* ignore_trivial */);
auto welford_ops = ir_utils::filterByType<WelfordOp>(reduction_ops);

if (!reduction_ops.empty() || !welford_ops.empty()) {
if (!reduction_ops.empty()) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::PointWise, "no support for reduction ops");
return false;
@@ -1065,15 +1063,6 @@ class PersistentKernelScheduler : public SchedulerEntry {

auto reduction_ops =
ir_utils::getReductionOps(fusion, false /* ignore_trivial */);
auto welford_ops = ir_utils::filterByType<WelfordOp>(reduction_ops);
// For persistent schedule we want welford translated to average and
// standard deviation reductions.
if (welford_ops.begin() != welford_ops.end()) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::Persistent,
"no support for un-translated welford");
return false;
}

auto view_tvs = scheduler_utils::getViewTVs(fusion);
if (view_tvs.size() > 0) {
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/scheduler/utils.h
@@ -194,7 +194,6 @@ std::pair<bool, bool> canonicalDimReduction(

// Return a list of tensor views that are outputs of reduction operations. If
// multiple outputs of an expression are found, only include one in the list
// (WelfordOp)
TORCH_CUDA_CU_API std::vector<TensorView*> getReductionTvs(
Fusion* fusion,
bool ignore_trivial = true);
20 changes: 9 additions & 11 deletions torch/csrc/jit/codegen/cuda/tensor_view.cpp
@@ -735,11 +735,10 @@ TensorView* TensorView::multiOutputRfactorHelper(
!container()->isA<kir::Kernel>(),
"Function invalid for kernel container.");
// Hack:
// Semantically we should always keep the outputs of welfordOp scheduled
// the same but the user end cannot guarantee that.
// In order to guarantee that the rFactor is defined meaningfully the
// scheduling of the output TV that got the rfactor call is force replayed
// towards the other two
// Semantically we should always keep the outputs of multi reduction ops
// scheduled the same but the user end cannot guarantee that. In order to
// guarantee that the rFactor is defined meaningfully the scheduling of the
// output TV that got the rfactor call is force replayed towards the other two

if (!sameAs(tv)) {
auto root = tv->getRootDomain();
@@ -758,7 +757,7 @@
std::vector<IterDomain*> new_id;
for (auto id : domain()->domain()) {
TORCH_INTERNAL_ASSERT(
replay.getReplay().count(id), "Welford Replay Failed");
replay.getReplay().count(id), "Multi-output reduction replay failed");
new_id.push_back(replay.getReplay().at(id));
}

@@ -795,12 +794,11 @@ std::vector<TensorView*> TensorView::rFactor(
TORCH_CHECK(nDims() > 0, "Tried to rFactor a 0-dim TensorView");
FusionGuard fg(fusion());
TORCH_CHECK(
definition() != nullptr &&
(definition()->getExprType() == ExprType::GroupedReductionOp ||
definition()->getExprType() == ExprType::WelfordOp),
"Error rfactoring welford ",
definition() != nullptr && ir_utils::isReductionOp(definition()),
"Error rfactoring multi-output reduction op ",
this,
" its definition is either a nullptr or not a GroupedReductionOp or a WelfordOp.");
" its definition is either a nullptr or not a GroupedReductionOp or a multi-output reduction op.");

TORCH_CHECK(
!domain()->hasRFactor(), "Cannot call rfactor on the same view twice.");
