Rework HaloInfo interface #1987

Merged 1 commit on Sep 16, 2022
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/compute_at_map.cpp
@@ -665,7 +665,7 @@ void ComputeAtMap::allocateIndexVariables() {
// Halo extended parallel loops currently are handled
// differently and an index variable would still
// be allocated in this case.
-        (GpuLower::current()->haloInfo().getExtent(id) == nullptr)) {
+        (GpuLower::current()->haloInfo()->getExtent(id) == nullptr)) {
ptype = id->getParallelType();
return true;
}
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/contiguity.cpp
@@ -51,7 +51,7 @@ ContigIDs::ContigIDs(
(ignore_halo_constraint ||
!GpuLower::current()
->haloInfo()
-             .getRootAxisInfo(root_domain_i)
+             ->getRootAxisInfo(root_domain_i)
.hasHalo())) {
contig_ids_.emplace(root_domain_i);
is_contig_root_[root_domain_i] = true;
23 changes: 12 additions & 11 deletions torch/csrc/jit/codegen/cuda/index_compute.cpp
@@ -51,8 +51,8 @@ int getProducerHaloOffset(
IterDomain* consumer_id = it->second;

const auto& halo_map = GpuLower::current()->haloInfo();
- const auto p_pad = halo_map.getRootAxisInfo(producer_id).width(0);
- const auto c_pad = halo_map.getRootAxisInfo(consumer_id).width(0);
+ const auto p_pad = halo_map->getRootAxisInfo(producer_id).width(0);
+ const auto c_pad = halo_map->getRootAxisInfo(consumer_id).width(0);

auto offset = p_pad - c_pad;

@@ -985,7 +985,7 @@ Val* getHaloExtentOfRootAxis(IterDomain* id, Val* normal_extent = nullptr) {
normal_extent = id->extent();
}

- const auto& halo = GpuLower::current()->haloInfo().getRootAxisInfo(id);
+ const auto& halo = GpuLower::current()->haloInfo()->getRootAxisInfo(id);
if (halo.hasHalo()) {
auto halo_extent = SimplifyingIrBuilder::addExpr(
normal_extent, SimplifyingIrBuilder::create<Int>(halo.width()));
@@ -2351,7 +2351,7 @@ std::vector<PredicateDomainInfo> getPredicateContigIds(
std::unordered_set<IterDomain*> excluded_ids;

for (auto consumer_root_id : consumer_root_domain) {
-   if (gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id).hasHalo()) {
+   if (gpu_lower->haloInfo()->getRootAxisInfo(consumer_root_id).hasHalo()) {
excluded_ids.insert(consumer_root_id);
continue;
}
@@ -2487,7 +2487,7 @@ int getUnswitchStopOffset(
const auto gpu_lower = GpuLower::current();

AxisHaloInfo halo_info =
-     gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id);
+     gpu_lower->haloInfo()->getRootAxisInfo(consumer_root_id);

// If the consumer root domain to predicate does not have halo, no
// adjustment is required.
@@ -2511,7 +2511,7 @@
unswitch_it,
consumer_tv->domain()->domain().end(),
[&gpu_lower, &consumer_root_id](auto leaf_id) {
-           return gpu_lower->haloInfo().isHaloInherited(
+           return gpu_lower->haloInfo()->isHaloInherited(
consumer_root_id, leaf_id);
})) {
return halo_info.width();
@@ -2669,7 +2669,8 @@ std::pair<Val*, Val*> getStartAndStopLimitOffsets(
Val* stop_limit = SimplifyingIrBuilder::negExpr(consumer_id->stopOffset());

if (!non_divisible_pred) {
-   AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_id);
+   AxisHaloInfo halo_info =
+       gpu_lower->haloInfo()->getRootAxisInfo(consumer_id);

// Below, "left" and "right" halo mean halo at offset zero and
// axis extent, respectively.
@@ -2693,8 +2694,8 @@
// that it is less than the extent of the predicated ID +
// halo. Note that getRootAxisInfo doesn't work since consumer_id
// isn't a root domain.
-     if (gpu_lower->haloInfo().hasHaloWidth(consumer_id)) {
-       auto halo = gpu_lower->haloInfo().getHaloWidth(consumer_id);
+     if (gpu_lower->haloInfo()->hasHaloWidth(consumer_id)) {
+       auto halo = gpu_lower->haloInfo()->getHaloWidth(consumer_id);
stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo);
}
}
@@ -2841,8 +2842,8 @@ bool canOmitStopPredicate(
// to be predicated, not its merged contig id even if it exists. So,
// if contig_id does not have root axis info, contig_id is
// guaranteed to have no halo.
-   auto halo_ext = gpu_lower->haloInfo().hasRootAxisInfo(contig_id)
-       ? gpu_lower->haloInfo().getRootAxisInfo(contig_id).width()
+   auto halo_ext = gpu_lower->haloInfo()->hasRootAxisInfo(contig_id)
+       ? gpu_lower->haloInfo()->getRootAxisInfo(contig_id).width()
: 0;

if (halo_ext + stop_offset_val.value() > 0) {
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/lower2device.cpp
@@ -248,7 +248,7 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
// mappings of all iteration domains across the fusion. There are three types
// of mappings Permissive, Exact, and Loop, see compute_at_map.h/cpp for more
// information.
-   compute_at_map_ = std::make_unique<ComputeAtMap>(fusion_);
+   compute_at_map_ = std::make_shared<ComputeAtMap>(fusion_);

if (isDebugDumpEnabled(DebugDumpOption::ComputeAtMap)) {
std::cout << compute_at_map_->toString() << std::endl;
@@ -281,7 +281,7 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {

// Scan the whole fusion and build mappings about halo extensions of
// all IterDomains
-   haloInfo().build(fusion_);
+   halo_info_ = std::make_shared<HaloInfo>(fusion_, compute_at_map_);

// Want to run this after parallel map and halo info map are
// created. vectorized_accesses_ and vectorized_set_info_ are filled.
16 changes: 6 additions & 10 deletions torch/csrc/jit/codegen/cuda/lower2device.h
@@ -76,20 +76,16 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
return thread_pred_map_;
}

-   const std::unique_ptr<ComputeAtMap>& caMap() const {
-     return compute_at_map_;
+   std::shared_ptr<const ComputeAtMap> caMap() const {
+     return std::const_pointer_cast<const ComputeAtMap>(compute_at_map_);
}

const TrivialReductionInfo& trivialReductionInfo() const {
return trivial_reduction_info_;
}

-   const HaloInfo& haloInfo() const {
-     return halo_info_;
-   }
-
-   HaloInfo& haloInfo() {
-     return halo_info_;
+   std::shared_ptr<const HaloInfo> haloInfo() const {
+     return std::const_pointer_cast<const HaloInfo>(halo_info_);
}

const ParallelDimensionMap& parallelDimensionMap() const {
@@ -201,9 +197,9 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
ConcretizedBroadcastDomains concretized_broadcast_domains_;
ThreadPredicateMap thread_pred_map_;
PredicateElimination pred_elimination_;
-   std::unique_ptr<ComputeAtMap> compute_at_map_;
+   std::shared_ptr<ComputeAtMap> compute_at_map_;
TrivialReductionInfo trivial_reduction_info_;
-   HaloInfo halo_info_;
+   std::shared_ptr<HaloInfo> halo_info_;
LocalAllocationInfoMap local_allocation_info_map_;
WarpPaddedParallelInfo warp_pad_info_;
ParallelDimensionMap parallel_dimension_map_;
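The accessor change above follows a common pattern: the class keeps a mutable `std::shared_ptr` member internally but hands out a `shared_ptr`-to-const, so callers share ownership without being able to mutate the analysis. A minimal sketch of the pattern with hypothetical names (`Analysis`, `Lower` are illustrative, not the real API):

```cpp
#include <memory>

// Stand-in for an analysis object such as HaloInfo or ComputeAtMap.
struct Analysis {
  int width = 3;
  int getWidth() const { return width; }
};

class Lower {
 public:
  Lower() : analysis_(std::make_shared<Analysis>()) {}

  // Mirrors the new haloInfo()/caMap() signatures: the returned pointer
  // keeps the analysis alive even past the Lower object's lifetime, but
  // exposes only the const interface. The const_pointer_cast matches the
  // PR; the implicit shared_ptr<T> -> shared_ptr<const T> conversion
  // would also work here.
  std::shared_ptr<const Analysis> analysis() const {
    return std::const_pointer_cast<const Analysis>(analysis_);
  }

 private:
  std::shared_ptr<Analysis> analysis_;  // stays mutable internally
};
```

This is why every call site in the diff changes from `haloInfo().foo()` to `haloInfo()->foo()`: the accessor now returns a pointer rather than a reference.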
8 changes: 4 additions & 4 deletions torch/csrc/jit/codegen/cuda/lower_allocation.cpp
@@ -131,7 +131,7 @@ class AllocationInserter : public kir::ExprMutator {
++init_loop_it) {
auto id = *init_loop_it;
kir::ForLoop* new_loop = nullptr;
-       auto extent_with_halo = gpu_lower->haloInfo().getExtent(id);
+       auto extent_with_halo = gpu_lower->haloInfo()->getExtent(id);
if (extent_with_halo) {
new_loop = IrBuilder::create<kir::ForLoop>(
id,
@@ -166,7 +166,7 @@
}
auto extent = id->extent();
// Use halo-extended extent if found
-     auto halo_extent = gpu_lower->haloInfo().getRootAxisInfo(id);
+     auto halo_extent = gpu_lower->haloInfo()->getRootAxisInfo(id);
if (halo_extent.hasHalo()) {
extent = IrBuilder::addExpr(
extent, IrBuilder::create<Int>(halo_extent.width()));
@@ -213,7 +213,7 @@

// Get the halo extent if found
auto getExtent = [this](IterDomain* id) {
-     auto extent = gpu_lower->haloInfo().getExtent(id);
+     auto extent = gpu_lower->haloInfo()->getExtent(id);
if (extent == nullptr) {
extent = id->extent();
}
@@ -368,7 +368,7 @@

auto extent = concrete_id->extent();

-     if (gpu_lower->haloInfo().getExtent(info.buffer->axis(axis_i)) !=
+     if (gpu_lower->haloInfo()->getExtent(info.buffer->axis(axis_i)) !=
nullptr) {
has_halo = true;
}
9 changes: 6 additions & 3 deletions torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
@@ -125,7 +125,8 @@ IndexingParameters getLinearIndexParameters(

// Derive the halo extents from the loop indexing result.
index_parameters.concrete_id_to_halo_extent =
-     GpuLower::current()->haloInfo().buildConcreteHaloExtentMap(loop_indexing);
+     GpuLower::current()->haloInfo()->buildConcreteHaloExtentMap(
+         loop_indexing);

protectNonPredicateIndexWithMagicZero(
loops,
@@ -233,7 +234,8 @@ IndexingParameters getNonGlobalInitialIndexParameters(

// Derive the halo extents from the loop indexing result.
index_parameters.concrete_id_to_halo_extent =
-     GpuLower::current()->haloInfo().buildConcreteHaloExtentMap(loop_indexing);
+     GpuLower::current()->haloInfo()->buildConcreteHaloExtentMap(
+         loop_indexing);

return index_parameters;
}
@@ -408,7 +410,8 @@ IndexingParameters getPredicateInitialIndexParameters(

// Derive the halo extents from the loop indexing result.
index_parameters.concrete_id_to_halo_extent =
-     GpuLower::current()->haloInfo().buildConcreteHaloExtentMap(loop_indexing);
+     GpuLower::current()->haloInfo()->buildConcreteHaloExtentMap(
+         loop_indexing);

return index_parameters;
}
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/lower_loops.cpp
@@ -33,7 +33,7 @@ LoopNestGenerator::LoopNestGenerator(const std::vector<Expr*>& exprs) {
namespace {

kir::ForLoop* openForHelper(kir::ForLoop* scope, IterDomain* id) {
- auto extent_with_halo = GpuLower::current()->haloInfo().getExtent(id);
+ auto extent_with_halo = GpuLower::current()->haloInfo()->getExtent(id);
kir::ForLoop* new_scope = nullptr;
if (extent_with_halo) {
// When an axis is extended with halo, unrolling and vectorization
6 changes: 3 additions & 3 deletions torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
@@ -303,12 +303,12 @@ class PredicateChcker : public IterVisitor {

// Shift is not supported yet.
bool predicateShift(Expr* expr) const {
-     auto& halo_info = GpuLower::current()->haloInfo();
+     auto halo_info = GpuLower::current()->haloInfo();
auto input_tvs = ir_utils::filterByType<TensorView>(expr->inputs());
-     return halo_info.needsShiftPredicate(expr) ||
+     return halo_info->needsShiftPredicate(expr) ||
std::any_of(input_tvs.begin(), input_tvs.end(), [&](auto input_tv) {
return input_tv->definition() != nullptr &&
-               halo_info.needsShiftPredicate(input_tv->definition());
+               halo_info->needsShiftPredicate(input_tv->definition());
});
}

54 changes: 25 additions & 29 deletions torch/csrc/jit/codegen/cuda/lower_shift.cpp
@@ -28,7 +28,7 @@ void ShiftPredicateInserter::insert(
TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output");

const bool needs_shift_predicate =
-     gpu_lower->haloInfo().needsShiftPredicate(out_tv->definition());
+     gpu_lower->haloInfo()->needsShiftPredicate(out_tv->definition());
if (!needs_shift_predicate) {
return;
}
@@ -145,13 +145,6 @@ const AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) const {
return it->second;
}

- AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) {
-   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-   return const_cast<AxisHaloInfo&>(
-       // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-       const_cast<const HaloInfo*>(this)->getRootAxisInfo(id));
- }

void HaloInfo::setRootAxisInfo(
IterDomain* id,
const AxisHaloInfo& root_axis_info) {
@@ -161,7 +154,9 @@ void HaloInfo::setRootAxisInfo(
return;
}

- void HaloInfo::build(Fusion* fusion) {
+ HaloInfo::HaloInfo(Fusion* fusion, std::shared_ptr<const ComputeAtMap> ca_map)
+     // Make a copy of the permissive map for extent comparators
+     : permissive_map_(ca_map->idGraph().permissiveNodes()) {
Collaborator:

Are you trying to decouple the states held by GpuLower from each other? I like it as that seems like a better and more robust design.

Owner Author:

Yes, I'm trying to pull the dependencies apart a bit as I was thinking of building a HaloInfo structure in contiguity which is also used outside GpuLower.
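The decoupling discussed in this thread amounts to constructor injection: rather than reaching into the `GpuLower` singleton, the analysis receives the mapping structure it depends on when it is built. A minimal sketch under that reading, with invented names (`IdMap`, `HaloAnalysis` are illustrative stand-ins, not the real classes):

```cpp
#include <memory>
#include <string>
#include <utility>

// Trivial stand-in for ComputeAtMap's mapped-ID query.
struct IdMap {
  bool areMapped(const std::string& a, const std::string& b) const {
    return a == b;  // placeholder logic for the sketch
  }
};

class HaloAnalysis {
 public:
  // The dependency is injected and captured with shared ownership, so the
  // analysis can be constructed and used outside any GpuLower instance.
  explicit HaloAnalysis(std::shared_ptr<const IdMap> map)
      : map_(std::move(map)) {}

  bool sameAxis(const std::string& a, const std::string& b) const {
    return map_->areMapped(a, b);
  }

 private:
  std::shared_ptr<const IdMap> map_;
};
```

This mirrors the new `HaloInfo(Fusion*, std::shared_ptr<const ComputeAtMap>)` constructor: the shared pointer keeps the map alive for as long as the analysis needs it, independent of the lowering object's lifetime.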

const auto vals = fusion->usedMathVals();
auto tvs = ir_utils::filterByType<TensorView>(vals);

@@ -202,7 +197,7 @@ void HaloInfo::build(Fusion* fusion) {

// Note that validation requires consumer halo info
for (auto tv : tvs) {
-     validate(tv);
+     validate(tv, ca_map);
}
}

@@ -474,12 +469,13 @@ void HaloInfo::build(TensorDomain* td) {
//! Other types of parallelization should be supported except for
//! vectorization. Vectorization should be eventually supported but
//! needs further work.
- void HaloInfo::validate(TensorView* tv) const {
+ void HaloInfo::validate(
+     TensorView* tv,
+     std::shared_ptr<const ComputeAtMap> ca_map) const {
const auto mem_type = tv->getMemoryType();

for (auto axis : tv->domain()->domain()) {
-     auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
-         axis, IdMappingMode::LOOP);
+     auto concrete_id = ca_map->getConcreteMappedID(axis, IdMappingMode::LOOP);

// The extent is assumed to be the same
TORCH_INTERNAL_ASSERT(
@@ -526,7 +522,7 @@ void HaloInfo::validate(
consumer->domain()->domain().begin(),
consumer->domain()->domain().end(),
[&](IterDomain* consumer_axis) {
-           return GpuLower::current()->caMap()->areMapped(
+           return ca_map->areMapped(
axis, consumer_axis, IdMappingMode::PERMISSIVE);
});
if (it == consumer->domain()->domain().end()) {
@@ -626,11 +622,10 @@ bool extentCompare(
const HaloInfo& halo_map,
IterDomain* id1,
IterDomain* id2,
-     Cmp cmp) {
-   auto gpu_lower = GpuLower::current();
+     Cmp cmp,
+     const DisjointSets<IterDomain*>& permissive_map) {
  TORCH_INTERNAL_ASSERT(
-       gpu_lower->caMap()->areMapped(id1, id2, IdMappingMode::PERMISSIVE),
-       "Invalid axes to compare");
+       permissive_map.strictAreMapped(id1, id2), "Invalid axes to compare");

// It's invalid to compare two axes and when only either of them has
// halo.
@@ -652,10 +647,10 @@ bool extentCompare(
auto merge2 = dynamic_cast<Merge*>(id2->definition());
TORCH_INTERNAL_ASSERT(
merge2 != nullptr, "Invalid comparison: ", id1, " and ", id2);
-     auto inner_le =
-         extentCompare(halo_map, merge1->inner(), merge2->inner(), cmp);
-     auto outer_le =
-         extentCompare(halo_map, merge1->outer(), merge2->outer(), cmp);
+     auto inner_le = extentCompare(
+         halo_map, merge1->inner(), merge2->inner(), cmp, permissive_map);
+     auto outer_le = extentCompare(
+         halo_map, merge1->outer(), merge2->outer(), cmp, permissive_map);
return inner_le && outer_le;
} else {
// This is not considered. Should never reach here.
@@ -667,11 +662,11 @@ bool extentCompare(
} // namespace

bool HaloInfo::extentLessEqual(IterDomain* id1, IterDomain* id2) const {
-   return extentCompare(*this, id1, id2, std::less_equal<>());
+   return extentCompare(*this, id1, id2, std::less_equal<>(), permissive_map_);
}

bool HaloInfo::extentEqual(IterDomain* id1, IterDomain* id2) const {
-   return extentCompare(*this, id1, id2, std::equal_to<>());
+   return extentCompare(*this, id1, id2, std::equal_to<>(), permissive_map_);
}

std::string HaloInfo::toString() const {
@@ -722,19 +717,19 @@ bool HaloInfo::needsShiftPredicate(Expr* expr) const {
}

std::unordered_map<IterDomain*, Val*> HaloInfo::buildConcreteHaloExtentMap(
-     const LoopIndexing& loop_indexing) {
+     const LoopIndexing& loop_indexing) const {
// Use a local workspace to avoid re-defining halo info.
-   HaloInfo local_halo_info;
+   HaloInfo local_halo_info = *GpuLower::current()->haloInfo();

-   auto& global_halo_info = GpuLower::current()->haloInfo();
+   auto global_halo_info = GpuLower::current()->haloInfo();

// Setup root:
for (auto consumer_root_id : loop_indexing.consumerTv()->getRootDomain()) {
auto consumer_index_concrete_id =
ir_utils::caMapExactConcreteId(consumer_root_id);
local_halo_info.setRootAxisInfo(
consumer_index_concrete_id,
-         global_halo_info.getRootAxisInfo(consumer_root_id));
+         global_halo_info->getRootAxisInfo(consumer_root_id));
}

// Track IDs that are generated by merging halo-extended IDs
@@ -801,7 +796,8 @@ std::unordered_map<IterDomain*, Val*> HaloInfo::buildConcreteHaloExtentMap(
merged_shifted_ids.insert(ir_utils::caMapExactConcreteId(merge->out()));
// Note that halo_width_map_ is not updated
} else {
-         setHaloWidth(ir_utils::caMapExactConcreteId(merge->out()), 0);
+         local_halo_info.setHaloWidth(
+             ir_utils::caMapExactConcreteId(merge->out()), 0);
Collaborator:

Regarding the decoupling, here we still use the CA map held by GpuLower. Also, I just realized it's a little counter-intuitive that ir_utils implicitly uses GpuLower. It doesn't seem trivial to remove this interface as it's used everywhere.

Owner Author:

Is making this local_halo_info right?

Owner Author:

Will look again at the inter-class dependencies.

Owner Author:

Oh, this class is really hard set to the lowering process as it's taking indexing in. That's why I didn't bother looking more closely.

Owner Author:

ir_utils::caMapExactConcreteId seems like a weak alias, will get rid of that either before this is merged or in another PR.

Collaborator:

Creating local_halo_info is right. It was part of Shiming's indexing refactoring. Previously, reference domains were added to the singleton instance of HaloInfo, but as we don't create reference IterDomains anymore, the properties generated for reference IterDomains are instead generated for concrete IterDomains. A complexity about halo is that even exactly mapped IterDomains may have different halo sizes, so it's not possible to associate a unique halo attribute with a group of mapped IterDomains. As the halo size information depends on the loop nest where an expression appears, what Shiming did was to clone the pre-built HaloInfo and modify the halo information for the concrete IterDomains used at that loop nest.

That's my understanding from my reading of his refactoring PR. Tagging @shmsong as he may want to chime in.

Owner Author:

Yeah, I was assuming it was just a typo.
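The clone-and-modify workspace pattern described in this thread can be sketched in miniature: copy the globally built table by value, then override entries for the current loop nest without touching the shared state. All names here (`HaloTable`, `makeLocalTable`) are hypothetical stand-ins for `HaloInfo` and `buildConcreteHaloExtentMap`:

```cpp
#include <string>
#include <unordered_map>

// Stand-in for the globally built halo-width table.
struct HaloTable {
  std::unordered_map<std::string, int> halo_width;

  int widthOf(const std::string& id) const {
    auto it = halo_width.find(id);
    return it == halo_width.end() ? 0 : it->second;
  }
};

// Per-context workspace: a value copy of the global table with a
// context-specific override, analogous to `local_halo_info` above.
HaloTable makeLocalTable(
    const HaloTable& global, const std::string& id, int local_width) {
  HaloTable local = global;            // value copy; global stays untouched
  local.halo_width[id] = local_width;  // override for this loop nest
  return local;
}
```

The value copy is the key point: because exactly mapped IterDomains can carry different halo sizes per loop nest, the per-nest overrides must not leak back into the shared table.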

}
} else if (auto swizzle_2d = dynamic_cast<Swizzle2D*>(expr)) {
// Swizzle with halo not yet supported, just set the width