Heuristic cache PR (#275)
Switched to scheduleReduction instead of the naive scheduleFusion for
reduction fusions; updated FusionExecutorCache to reuse kernels via
ReductionParamsHash.
Note:
This is failing a CI test due to #273, but luckily the other PR that disabled
broadcasting has already been merged, so CI is green.
jjsjann123 authored Aug 11, 2020
1 parent d851ecd commit eccbf78
Showing 4 changed files with 67 additions and 26 deletions.
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -58,6 +58,7 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) {
       structured_code,
       (kernelNamespace() + "::" + kernelName()).c_str(),
       fusion_id);
+  compiled_ = true;
 }
 
 namespace {
10 changes: 9 additions & 1 deletion torch/csrc/jit/codegen/cuda/executor.h
@@ -35,6 +35,12 @@ class TORCH_CUDA_API FusionExecutor {
     return runFusion(inputs, {}, launch_constraints);
   }
 
+  // function to query whether a `FusionExecutor` has a compiled kernel to
+  // execute
+  bool compiled() const {
+    return compiled_;
+  };
+
  private:
   std::string kernelName() const {
     std::stringstream ss;
@@ -59,6 +65,8 @@
   std::vector<at::Tensor> allocOutputs(EvaluationContext& ec);
 
  private:
+  bool compiled_ = false;
+
   Fusion fusion_;
 
   CompileOptions options_;
@@ -78,4 +86,4 @@
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
-} // namespace torch
\ No newline at end of file
+} // namespace torch
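
The new compiled_ flag makes lazy, compile-once behavior queryable: a FusionExecutor can now be default-constructed empty (for example, as a fresh entry in the reduction cache introduced in kernel_cache.cpp below) and compiled only when first needed. A minimal standalone sketch of that pattern, with a simplified stand-in type (Executor, compile, and run here are illustrative, not the real FusionExecutor API):

#include <cassert>
#include <iostream>
#include <string>

// Stand-in for FusionExecutor: compilation happens lazily and exactly once;
// compiled() reports whether this instance already holds a kernel.
class Executor {
 public:
  bool compiled() const {
    return compiled_;
  }

  void compile(const std::string& source) {
    // A real implementation would JIT-compile `source` here; we only record
    // that compilation happened.
    kernel_ = "compiled(" + source + ")";
    compiled_ = true;
  }

  std::string run() const {
    // Running before compiling is a logic error that the flag makes checkable.
    assert(compiled_);
    return kernel_;
  }

 private:
  bool compiled_ = false;
  std::string kernel_;
};

int main() {
  Executor e;                                          // default-constructed
  std::cout << std::boolalpha << e.compiled() << "\n"; // false
  e.compile("kernel_0");
  std::cout << e.compiled() << "\n"; // true
  std::cout << e.run() << "\n";      // compiled(kernel_0)
}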
55 changes: 36 additions & 19 deletions torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -193,29 +193,46 @@ FusionExecutorCache::FusionExecutorCache(
 // TODO: dummy cache
 std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
     const at::ArrayRef<IValue>& inputs) {
-  if (fusion_executor_cache_.empty()) {
-    // TODO: enable Kevin's scheduleReduction, right now it's breaking CI tests
-    // if (fusion_->hasReduction()) {
-    if (false) {
-      TensorView* red_tv = nullptr;
-      FusionGuard fg(fusion_.get());
-      for (auto expr : fusion_->exprs()) {
-        if (expr->getExprType().has_value() &&
-            expr->getExprType().value() == ExprType::ReductionOp) {
-          red_tv = expr->outputs()[0]->as<TensorView>();
-          break;
-        }
-      }
-      scheduleReduction(fusion_.get(), inputs, red_tv);
-    } else {
-      scheduleFusion(fusion_.get(), inputs);
-    }
-    fusion_executor_cache_.emplace_back(std::make_unique<FusionExecutor>());
-    CompileOptions options;
-    options.device = device_;
-    fusion_executor_cache_.back()->compileFusion(fusion_.get(), options);
-  }
-  return fusion_executor_cache_.back()->runFusion(inputs);
+  // caching strategy is different for pw-fusion and reduction-fusion.
+  if (fusion_->hasReduction()) {
+    // copy the fusion, since each FusionExecutor needs to manipulate the fusion
+    // in order to generate kernel.
+    Fusion fusion = *fusion_;
+    FusionGuard fg(&fusion);
+    TensorView* red_tv = nullptr;
+    for (auto expr : fusion.exprs()) {
+      if (expr->getExprType().has_value() &&
+          expr->getExprType().value() == ExprType::ReductionOp) {
+        red_tv = expr->outputs()[0]->as<TensorView>();
+        break;
+      }
+    }
+    auto reduction_params = scheduleReduction(&fusion, inputs, red_tv);
+    TORCH_INTERNAL_ASSERT(
+        reduction_params.has_value(),
+        "reduction schedule failed in `scheduleReduction`");
+    auto& fusion_executor =
+        red_fusion_executor_cache_[reduction_params.value()];
+    if (!fusion_executor.compiled()) {
+      // This means we have not found a previously generated kernel that's
+      // compatible with the new reduction params. We need to finish codegen.
+      CompileOptions options;
+      options.device = device_;
+      fusion_executor.compileFusion(&fusion, options);
+    }
+    return fusion_executor.runFusion(inputs);
+  } else {
+    if (!pw_fusion_executor_cache_) {
+      pw_fusion_executor_cache_ = std::make_unique<FusionExecutor>();
+      CompileOptions options;
+      options.device = device_;
+      // no need to copy fusion_, as we are not generating more than 1 kernel
+      // for PW.
+      scheduleFusion(fusion_.get(), inputs);
+      pw_fusion_executor_cache_->compileFusion(fusion_.get(), options);
+    }
+    return pw_fusion_executor_cache_->runFusion(inputs);
+  }
 }
 
 GraphCache::InputsRequirement::InputsRequirement(
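
The reduction branch above leans on two properties: unordered_map::operator[] default-constructs a FusionExecutor on a cache miss, and compiled() tells that miss apart from a hit, which is why scheduling must run first to produce the ReductionParams key at all. A minimal standalone sketch of the same lookup flow, with simplified stand-in types (Params, ParamsHash, Executor, and schedule are illustrative, not the PR's API):

#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_map>

// Stand-ins for ReductionParams / ReductionParamsHash / FusionExecutor.
struct Params {
  int block_dim = 128;
  int unroll = 1;
  bool operator==(const Params& o) const {
    return block_dim == o.block_dim && unroll == o.unroll;
  }
};

struct ParamsHash {
  std::size_t operator()(const Params& p) const {
    return std::hash<int>()(p.block_dim) ^ (std::hash<int>()(p.unroll) << 1);
  }
};

class Executor {
 public:
  bool compiled() const { return compiled_; }
  void compile() { compiled_ = true; std::cout << "codegen\n"; }
  void run() { std::cout << "launch\n"; }
 private:
  bool compiled_ = false;
};

// Stand-in for scheduleReduction: the heuristic maps input sizes to launch
// params, so inputs must be scheduled before the cache can be queried.
Params schedule(int reduction_size) {
  return Params{reduction_size >= 4096 ? 256 : 128, 1};
}

std::unordered_map<Params, Executor, ParamsHash> cache;

void runWithInputs(int reduction_size) {
  Params p = schedule(reduction_size);
  // operator[] default-constructs an Executor the first time these params are
  // seen; compiled() then distinguishes that miss from a hit.
  Executor& ex = cache[p];
  if (!ex.compiled()) {
    ex.compile(); // cache miss: no compatible kernel yet, finish codegen
  }
  ex.run();
}

int main() {
  runWithInputs(1024); // miss: compiles a kernel
  runWithInputs(2048); // same params, so a hit: reuses that kernel
  runWithInputs(8192); // different params: compiles a second kernel
}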
27 changes: 21 additions & 6 deletions torch/csrc/jit/codegen/cuda/kernel_cache.h
@@ -2,6 +2,7 @@
 
 #include <torch/csrc/jit/codegen/cuda/executor.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler.h>
 
 #include <c10/util/ArrayRef.h>
 #include <torch/csrc/WindowsTorchApiMacro.h>
@@ -36,9 +37,10 @@ namespace cuda {
 //   the GraphCache instance (We push back to both `input_stacks_` and
 //   `fe_cache_`, fusion executor cache.
 // b. FusionExecutorCache
-//    - holds a vector of `FusionExecutor` to handle dynamic shape (varying
+//    - holds a group of `FusionExecutor` to handle dynamic shape (varying
 //      tensor sizes)
-//    - currently this is only a dummy implementation;
+//    - currently this is a dummy implementation and has branching to handle
+//      different schedulers for point-wise fusion and reduction fusion;
 //
 // * note computational graph
 // In theory, computational graph should refer to only the computational nodes
@@ -54,8 +56,6 @@ namespace cuda {
 // information now by generating an entry in GraphCache with the given profiling
 // record.
 
-// TODO: FusionExecutorCache is only a place holder here. It's populated in a
-// later PR.
 class FusionExecutorCache {
  public:
   // create new fusion executor cache at a given device to handle kernel
@@ -75,8 +75,23 @@ class FusionExecutorCache {
   // original un-scheduled `Fusion`;
   std::unique_ptr<Fusion> fusion_;
 
-  // TODO: placeholder that will be updated;
-  std::vector<std::unique_ptr<FusionExecutor>> fusion_executor_cache_;
+  // TODO: ugly logic for now. We should integrate the hashing of the cache
+  // for different kernels (alternatively, we could do so in the scheduler).
+  // The ugly bits now:
+  // the fact that we have heuristics only for reduction, but use a general
+  // kernel for all point-wise fusions, ended up with this:
+  // 1. for point-wise fusions, we have a single `FusionExecutor` in
+  //    `pw_fusion_executor_cache_`;
+  // 2. for reduction fusions, we have a hash table with `ReductionParams` as
+  //    the key, pointing to the actual `FusionExecutor` in
+  //    `red_fusion_executor_cache_`.
+  //
+  // Unfortunately, at run-time, in order to find a compatible `FusionExecutor`
+  // we have to call `scheduleReduction` to get an instance of
+  // `ReductionParams` for indexing. This is not very efficient. Hence the
+  // TODO: add a direct cache from input shapes to `FusionExecutor` entries.
+  std::unique_ptr<FusionExecutor> pw_fusion_executor_cache_;
+  std::unordered_map<ReductionParams, FusionExecutor, ReductionParamsHash>
+      red_fusion_executor_cache_;
 };
 
 class GraphCache {
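
scheduler.h, which defines ReductionParams and ReductionParamsHash, is not part of this diff, but the map declaration above pins down their contract: ReductionParams must be copyable and equality-comparable, ReductionParamsHash must expose size_t operator()(const ReductionParams&) const, and keys that compare equal must hash equally. A hedged illustration with hypothetical fields (the real members are not shown on this page):

#include <cstddef>
#include <unordered_map>

// Hypothetical ReductionParams: field names are illustrative only; the real
// definition lives in torch/csrc/jit/codegen/cuda/scheduler.h.
struct ReductionParams {
  bool fastest_dim = true;       // reduce along the fastest-varying dimension?
  bool cross_block = false;      // cooperate across threads of a block?
  bool cross_grid = false;       // cooperate across blocks of the grid?
  bool mul_reds_per_blk = false; // multiple reductions per block?

  // Keys that compare equal must hash equal, so operator== and the hash
  // functor below must inspect exactly the same fields.
  bool operator==(const ReductionParams& other) const {
    return fastest_dim == other.fastest_dim &&
        cross_block == other.cross_block && cross_grid == other.cross_grid &&
        mul_reds_per_blk == other.mul_reds_per_blk;
  }
};

// Hypothetical ReductionParamsHash: packs the flags into distinct bits so
// distinct parameter combinations land in distinct buckets.
struct ReductionParamsHash {
  std::size_t operator()(const ReductionParams& p) const {
    return (static_cast<std::size_t>(p.fastest_dim) << 0) |
        (static_cast<std::size_t>(p.cross_block) << 1) |
        (static_cast<std::size_t>(p.cross_grid) << 2) |
        (static_cast<std::size_t>(p.mul_reds_per_blk) << 3);
  }
};

int main() {
  // Same shape of map as red_fusion_executor_cache_, with int standing in for
  // FusionExecutor.
  std::unordered_map<ReductionParams, int, ReductionParamsHash> cache;
  ReductionParams a, b; // default params compare equal...
  cache[a] = 1;
  return cache[b] == 1 ? 0 : 1; // ...so they share one cache entry.
}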
