Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion torch/csrc/jit/codegen/cuda/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,16 @@ LaunchParams FusionExecutor::computeLaunchParams(
});
auto& parallel_iter_extents = parallel_iter_extent_entry.get();

auto simplified_parallel_iter_extent_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::SimplifiedParallelIterExtentMap>(
data_cache, [&parallel_binding_ids, &lower]() {
return executor_utils::getSimplifiedParallelIterExtents(
lower, parallel_binding_ids);
});
auto& simplified_parallel_iter_extents =
simplified_parallel_iter_extent_entry.get();

auto warp_padded_parallel_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::WarpPaddedParallelExtents>(
Expand Down Expand Up @@ -409,7 +419,7 @@ LaunchParams FusionExecutor::computeLaunchParams(
}

// Run through the rest of the parallel IterDomains and infer their size
for (auto& entry : parallel_iter_extents) {
for (auto& entry : simplified_parallel_iter_extents) {
FUSER_PERF_SCOPE("FusionExecutor::ParallelBindingResolution");
auto p_type = entry.first;
auto parallel_extents = entry.second;
Expand Down
52 changes: 44 additions & 8 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -966,6 +966,7 @@ ExecutorCompileTimeEntry<EntryClass>::ExecutorCompileTimeEntry(
// Template instantiation
template class ExecutorCompileTimeEntry<ParallelBindingIterDomains>;
template class ExecutorCompileTimeEntry<ParallelIterExtentMap>;
template class ExecutorCompileTimeEntry<SimplifiedParallelIterExtentMap>;
template class ExecutorCompileTimeEntry<WarpPaddedParallelExtents>;
template class ExecutorCompileTimeEntry<VectorizedTensorValidation>;
template class ExecutorCompileTimeEntry<InputAliasIndices>;
Expand All @@ -986,20 +987,55 @@ std::vector<IterDomain*> getParallelBindingsIterDomains(
return parallel_ids;
}

//! Lowers the extent of `binding_id` to kernel IR and appends it to the
//! list of extents recorded for its parallel type in
//! `*parallel_iter_extents_ptr`.
//!
//! \param lower active GpuLower used to lower the fusion-IR extent.
//! \param binding_id parallelized IterDomain whose extent is recorded.
//! \param parallel_iter_extents_ptr map being populated; must be non-null.
void insertParallelExtent(
    GpuLower& lower,
    IterDomain* binding_id,
    const std::unique_ptr<ParallelExtentMap>& parallel_iter_extents_ptr) {
  auto kir_extent = lower.lowerValue(binding_id->extent());
  // operator[] value-initializes an empty vector for a first-seen parallel
  // type, so a single append replaces the previous explicit find/insert
  // branch with identical behavior.
  (*parallel_iter_extents_ptr)[binding_id->getParallelType()].push_back(
      kir_extent);
}

//! Collects the lowered extent of every parallelized IterDomain in
//! `parallel_binding_ids`, grouped by parallel type, so the executor can
//! later bind concrete launch dimensions for each parallel type.
//!
//! \param lower active GpuLower used to lower each extent.
//! \param parallel_binding_ids IterDomains bound to parallel types.
//! \return map from parallel type to the extents bound to it.
std::unique_ptr<ParallelExtentMap> getParallelIterExtents(
    GpuLower& lower,
    std::vector<IterDomain*>& parallel_binding_ids) {
  auto parallel_iter_extents_ptr = std::make_unique<ParallelExtentMap>();
  for (auto id : parallel_binding_ids) {
    // TODO(kir): we should rewrite this logic based on the Kernel object
    insertParallelExtent(lower, id, parallel_iter_extents_ptr);
  }

  return parallel_iter_extents_ptr;
}

std::unique_ptr<ParallelExtentMap> getSimplifiedParallelIterExtents(
GpuLower& lower,
std::vector<IterDomain*>& parallel_binding_ids) {
auto parallel_iter_extents_ptr = std::make_unique<ParallelExtentMap>();
auto& parallel_map = lower.caParallelMap();
std::vector<IterDomain*> mapped;
bool is_tidx_warp_padded = lower.getWarpPaddedParallelInfo().is_tidx_padded;

for (auto id : parallel_binding_ids) {
if (std::any_of(
mapped.begin(),
mapped.end(),
[id, &parallel_map](IterDomain* mapped_id) {
return parallel_map.areMapped(mapped_id, id);
})) {
if (id->getParallelType() != ParallelType::TIDx || !is_tidx_warp_padded) {
continue;
}
}

insertParallelExtent(
lower, parallel_map.getConcreteMappedID(id), parallel_iter_extents_ptr);
mapped.push_back(id);
}

return parallel_iter_extents_ptr;
Expand Down
28 changes: 28 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ namespace caching {
enum class CompileTimeEntryType {
PARALLEL_BINDING_ITERDOMAINS,
PARALLEL_ITER_EXTENT_MAP,
SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP,
WARP_PADDED_PARALLEL_EXTENTS,
VECTORIZED_TENSOR_VALIDATION,
INPUT_ALIAS_INDICES,
Expand Down Expand Up @@ -114,6 +115,27 @@ class ParallelIterExtentMap {
CompileTimeEntryType::PARALLEL_ITER_EXTENT_MAP;
};

//! Compile-time info to be cached in each FusionExecutor:
//! SimplifiedParallelIterExtentMap
//! This entry type is a simplified version of ParallelIterExtentMap.
//!
//! For launch parameter binding we only need the most concrete iterdomain
//! in each disjoint set stored in CaParallelMap. This entry stores the
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@naoyam are we certain this doesn't have to be index map?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually think the parallel map is fine here since concrete domains are selected.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This actually makes me think again about ParallelDimensionMap as well. For example:

 auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv2 = broadcast(tv0, {true, false});
  auto tv3 = add(tv2, tv1);
  fusion.addOutput(tv4);

  tv3->merge(0, 1);
  tv0->computeAt(tv3, -1);
  tv1->computeAt(tv3, -1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
Inputs:
  T0_g[ iS0{i1} ], float
  T1_g[ iS9{( i4 * i7 )} ], float
Outputs:
  T3_g[ iblockIdx.x7{( i4 * i1 )} ] produce_pos( 1), float

%kernel_math {
T2_l[ iS8{( 1 * i1 )} ] ca_pos( 1 ) = broadcast( T0_g[ iS0{i1} ] )
T3_g[ iblockIdx.x7{( i4 * i1 )} ] produce_pos( 1)
   = T2_l[ iS8{( 1 * i1 )} ] ca_pos( 1 )
   + T1_g[ iS9{( i4 * i7 )} ];
}

With this fusion, the parallel dimension map looks like:

blockIdx.x: gridDim.x, non-exact
blockIdx.y: unused
blockIdx.z: unused
threadIdx.x: unused
threadIdx.y: unused
threadIdx.z: unused

Note that BIDx is marked as non-exact, even though it is actually exact. This is because ParallelDimensionMap uses the index map so the merged axis of T3 is not mapped with the axis of T0, but both of them are mapped with each other in the parallel map and are parallelized by BIDx.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought this should be not exact since iS8{( 1 * i1 )} != iblockIdx.x7{( i4 * i1 )}

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought that was the behavior we're interested in.

Copy link
Collaborator

@naoyam naoyam Sep 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you meant iS0{i1} != iblockIdx.x7{( i4 * i1 )}. iS8{( 1 * i1 )} and iblockIdx.x7{( i4 * i1 )} are mapped in the index map. The difference comes from the forwarding, so iS0{i1} and iblockIdx.x7{( i4 * i1 )} are mapped in the parallel map but not in the index map.

Since i1 != i4 * i1, they appear to make BIDx non-exact. However, with the computeAt, indexing T0 is always done with i4 * i1. In this particular case, while iS0{i1} is marked as parallelized with BIDx, its indexing is blockDim.x % i1, so it never goes beyond i1. So, in that sense, BIDx is still exact, in other words, we don't need a predicate like blockDim.x < i1.

This seems related to the issue we discussed before about the indexing change with and without computeAt.

I feel this is too much intricated. I'm spending too much time to think about which maps to use. Would be great if we could come up with a single map, but not sure if that's reasonably possible.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

iS8{( 1 * i1 )} != iblockIdx.x7{( i4 * i1 )} should now be true in the index map, it was true that it used to map, but I explicitly changed that because it shouldn't.

This makes sense though. I forgot we don't map to the right of the compute at point in the parallel map. So really this is what parallel type is being used to index into the tensor. In that instance then yes, the parallel map is enough to get the unique entries of parallelization bound into the problem.

//! remaining list of extents for binding after this simplification.
//!
//! We still need ParallelIterExtentMap since we want to bind the concrete
//! values to the extents of all parallelized iterdomains. We would be
//! able to save these bindings if the integer machine has a notion of
//! equality and could be configured compile time. But that'd be a longer
//! term target.
//! Cache-entry tag class: carries no data of its own, only the stored type
//! (DataType) and the enum key (EntryType) used by
//! ExecutorCompileTimeEntry to cache the simplified per-parallel-type
//! extent lists.
class SimplifiedParallelIterExtentMap {
public:
// One list of lowered kernel-IR extents per parallel type.
using DataType =
std::unordered_map<ParallelType, std::vector<const kir::Val*>, TypeHash>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP;
};

//! WarpPaddedExtentsInfo:
//! Auxiliary data type for entry class WarpPaddedParallelExtents
struct WarpPaddedExtentsInfo {
Expand Down Expand Up @@ -269,6 +291,12 @@ std::unique_ptr<ParallelExtentMap> getParallelIterExtents(
GpuLower& lower,
std::vector<IterDomain*>& parallel_binding_ids);

//! Returns the simplified set of extents necessary for launch parameter
//! binding.
std::unique_ptr<ParallelExtentMap> getSimplifiedParallelIterExtents(
GpuLower& lower,
std::vector<IterDomain*>& parallel_binding_ids);

//! Returns the symbolic or constant extents of warp padded parallel
//! iterdomains in the given vector.
std::unique_ptr<caching::WarpPaddedExtentsInfo> getWarpPaddedExtentsInfo(
Expand Down
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ c10::optional<Int::ScalarType> ExpressionEvaluator::evaluate(Val* value) {
if (evaluator_precomputed_integers_ != nullptr) {
return evaluator_precomputed_integers_->getMaybeValueFor(value);
} else {
FUSER_PERF_SCOPE("ExpressionEvaluator::evaluate");
auto maybe_concrete_value = getValue(value);
if (!maybe_concrete_value.has_value()) {
if (value->definition() != nullptr) {
Expand Down
Loading