Heuristic cache PR (#275)
Switched to scheduleReduction instead of the naive scheduleFusion for
reduction fusions; updated FusionExecutorCache to reuse kernels via
ReductionParamsHash.
Note:
This is failing a CI test due to #273, but luckily the other PR that disabled
broadcasting has already been merged, so CI is green.
jjsjann123 authored Aug 11, 2020
1 parent d851ecd commit eccbf78
Showing 4 changed files with 67 additions and 26 deletions.
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -58,6 +58,7 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) {
       structured_code,
       (kernelNamespace() + "::" + kernelName()).c_str(),
       fusion_id);
+  compiled_ = true;
 }
 
 namespace {
10 changes: 9 additions & 1 deletion torch/csrc/jit/codegen/cuda/executor.h
@@ -35,6 +35,12 @@ class TORCH_CUDA_API FusionExecutor {
     return runFusion(inputs, {}, launch_constraints);
   }
 
+  // function to query whether a `FusionExecutor` has a compiled kernel to
+  // execute
+  bool compiled() const {
+    return compiled_;
+  };
+
  private:
   std::string kernelName() const {
     std::stringstream ss;
@@ -59,6 +65,8 @@
   std::vector<at::Tensor> allocOutputs(EvaluationContext& ec);
 
  private:
+  bool compiled_ = false;
+
   Fusion fusion_;
 
   CompileOptions options_;
@@ -78,4 +86,4 @@
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
-} // namespace torch
\ No newline at end of file
+} // namespace torch
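
The new compiled_ flag makes lazy, compile-once behavior queryable: a FusionExecutor can now be default-constructed empty (for example, as a fresh entry in the reduction cache introduced in kernel_cache.cpp below) and compiled only when first needed. A minimal standalone sketch of that pattern, with a simplified stand-in type (Executor, compile, and run here are illustrative, not the real FusionExecutor API):

#include <cassert>
#include <iostream>
#include <string>

// Stand-in for FusionExecutor: compilation happens lazily and exactly once;
// compiled() reports whether this instance already holds a kernel.
class Executor {
 public:
  bool compiled() const {
    return compiled_;
  }

  void compile(const std::string& source) {
    // A real implementation would JIT-compile `source` here; we only record
    // that compilation happened.
    kernel_ = "compiled(" + source + ")";
    compiled_ = true;
  }

  std::string run() const {
    // Running before compiling is a logic error that the flag makes checkable.
    assert(compiled_);
    return kernel_;
  }

 private:
  bool compiled_ = false;
  std::string kernel_;
};

int main() {
  Executor e;                                          // default-constructed
  std::cout << std::boolalpha << e.compiled() << "\n"; // false
  e.compile("kernel_0");
  std::cout << e.compiled() << "\n"; // true
  std::cout << e.run() << "\n";      // compiled(kernel_0)
}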
55 changes: 36 additions & 19 deletions torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -193,29 +193,46 @@ FusionExecutorCache::FusionExecutorCache(
 // TODO: dummy cache
 std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
     const at::ArrayRef<IValue>& inputs) {
-  if (fusion_executor_cache_.empty()) {
-    // TODO: enable Kevin's scheduleReduction, right now it's breaking CI tests
-    // if (fusion_->hasReduction()) {
-    if (false) {
-      TensorView* red_tv = nullptr;
-      FusionGuard fg(fusion_.get());
-      for (auto expr : fusion_->exprs()) {
-        if (expr->getExprType().has_value() &&
-            expr->getExprType().value() == ExprType::ReductionOp) {
-          red_tv = expr->outputs()[0]->as<TensorView>();
-          break;
-        }
-      }
-      scheduleReduction(fusion_.get(), inputs, red_tv);
-    } else {
-      scheduleFusion(fusion_.get(), inputs);
-    }
-    fusion_executor_cache_.emplace_back(std::make_unique<FusionExecutor>());
-    CompileOptions options;
-    options.device = device_;
-    fusion_executor_cache_.back()->compileFusion(fusion_.get(), options);
-  }
-  return fusion_executor_cache_.back()->runFusion(inputs);
+  // caching strategy is different for pw-fusion and reduction-fusion.
+  if (fusion_->hasReduction()) {
+    // copy the fusion, since each FusionExecutor needs to manipulate the fusion
+    // in order to generate kernel.
+    Fusion fusion = *fusion_;
+    FusionGuard fg(&fusion);
+    TensorView* red_tv = nullptr;
+    for (auto expr : fusion.exprs()) {
+      if (expr->getExprType().has_value() &&
+          expr->getExprType().value() == ExprType::ReductionOp) {
+        red_tv = expr->outputs()[0]->as<TensorView>();
+        break;
+      }
+    }
+    auto reduction_params = scheduleReduction(&fusion, inputs, red_tv);
+    TORCH_INTERNAL_ASSERT(
+        reduction_params.has_value(),
+        "reduction schedule failed in `scheduleReduction`");
+    auto& fusion_executor =
+        red_fusion_executor_cache_[reduction_params.value()];
+    if (!fusion_executor.compiled()) {
+      // This means we have not found a previously generated kernel that's
+      // compatible with the new reduction params. We need to finish codegen.
+      CompileOptions options;
+      options.device = device_;
+      fusion_executor.compileFusion(&fusion, options);
+    }
+    return fusion_executor.runFusion(inputs);
+  } else {
+    if (!pw_fusion_executor_cache_) {
+      pw_fusion_executor_cache_ = std::make_unique<FusionExecutor>();
+      CompileOptions options;
+      options.device = device_;
+      // no need to copy fusion_, as we are not generating more than 1 kernel
+      // for PW.
+      scheduleFusion(fusion_.get(), inputs);
+      pw_fusion_executor_cache_->compileFusion(fusion_.get(), options);
+    }
+    return pw_fusion_executor_cache_->runFusion(inputs);
+  }
 }
 
 GraphCache::InputsRequirement::InputsRequirement(
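
The reduction branch above leans on two properties: unordered_map::operator[] default-constructs a FusionExecutor on a cache miss, and compiled() tells that miss apart from a hit, which is why scheduling must run first to produce the ReductionParams key at all. A minimal standalone sketch of the same lookup flow, with simplified stand-in types (Params, ParamsHash, Executor, and schedule are illustrative, not the PR's API):

#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_map>

// Stand-ins for ReductionParams / ReductionParamsHash / FusionExecutor.
struct Params {
  int block_dim = 128;
  int unroll = 1;
  bool operator==(const Params& o) const {
    return block_dim == o.block_dim && unroll == o.unroll;
  }
};

struct ParamsHash {
  std::size_t operator()(const Params& p) const {
    return std::hash<int>()(p.block_dim) ^ (std::hash<int>()(p.unroll) << 1);
  }
};

class Executor {
 public:
  bool compiled() const { return compiled_; }
  void compile() { compiled_ = true; std::cout << "codegen\n"; }
  void run() { std::cout << "launch\n"; }
 private:
  bool compiled_ = false;
};

// Stand-in for scheduleReduction: the heuristic maps input sizes to launch
// params, so inputs must be scheduled before the cache can be queried.
Params schedule(int reduction_size) {
  return Params{reduction_size >= 4096 ? 256 : 128, 1};
}

std::unordered_map<Params, Executor, ParamsHash> cache;

void runWithInputs(int reduction_size) {
  Params p = schedule(reduction_size);
  // operator[] default-constructs an Executor the first time these params are
  // seen; compiled() then distinguishes that miss from a hit.
  Executor& ex = cache[p];
  if (!ex.compiled()) {
    ex.compile(); // cache miss: no compatible kernel yet, finish codegen
  }
  ex.run();
}

int main() {
  runWithInputs(1024); // miss: compiles a kernel
  runWithInputs(2048); // same params, so a hit: reuses that kernel
  runWithInputs(8192); // different params: compiles a second kernel
}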
27 changes: 21 additions & 6 deletions torch/csrc/jit/codegen/cuda/kernel_cache.h
@@ -2,6 +2,7 @@
 
 #include <torch/csrc/jit/codegen/cuda/executor.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler.h>
 
 #include <c10/util/ArrayRef.h>
 #include <torch/csrc/WindowsTorchApiMacro.h>
@@ -36,9 +37,10 @@ namespace cuda {
 //   the GraphCache instance (We push back to both `input_stacks_` and
 //   `fe_cache_`, fusion executor cache.
 // b. FusionExecutorCache
-//    - holds a vector of `FusionExecutor` to handle dynamic shape (varying
+//    - holds a group of `FusionExecutor` to handle dynamic shape (varying
 //      tensor sizes)
-//    - currently this is only a dummy implementation;
+//    - currently this is a dummy implementation and has branching to handle
+//      different schedulers for point-wise fusion and reduction fusion;
 //
 // * note computational graph
 // In theory, computational graph should refer to only the computational nodes
@@ -54,8 +56,6 @@ namespace cuda {
 // information now by generating an entry in GraphCache with the given profiling
 // record.
 
-// TODO: FusionExecutorCache is only a place holder here. It's populated in a
-// later PR.
 class FusionExecutorCache {
  public:
   // create new fusion executor cache at a given device to handle kernel
@@ -75,8 +75,23 @@ class FusionExecutorCache {
   // original un-scheduled `Fusion`;
   std::unique_ptr<Fusion> fusion_;
 
-  // TODO: placeholder that will be updated;
-  std::vector<std::unique_ptr<FusionExecutor>> fusion_executor_cache_;
+  // TODO: ugly logic for now. We should integrate the hashing of the cache
+  // for different kernels (alternatively, we could do so in the scheduler).
+  // The ugly bits now:
+  // the fact that we have heuristics only for reduction, but use a general
+  // kernel for all point-wise fusions, ended up with this:
+  // 1. for point-wise fusions, we have a single `FusionExecutor` in
+  //    `pw_fusion_executor_cache_`;
+  // 2. for reduction fusions, we have a hash table with `ReductionParams` as
+  //    the key, pointing to the actual `FusionExecutor` in
+  //    `red_fusion_executor_cache_`.
+  //
+  // Unfortunately, at run-time, in order to find a compatible `FusionExecutor`
+  // we have to call `scheduleReduction` to get an instance of
+  // `ReductionParams` for indexing. This is not very efficient. Hence the
+  // TODO: add a direct cache from input shapes to `FusionExecutor` entries.
+  std::unique_ptr<FusionExecutor> pw_fusion_executor_cache_;
+  std::unordered_map<ReductionParams, FusionExecutor, ReductionParamsHash>
+      red_fusion_executor_cache_;
 };
 
 class GraphCache {
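
scheduler.h, which defines ReductionParams and ReductionParamsHash, is not part of this diff, but the map declaration above pins down their contract: ReductionParams must be copyable and equality-comparable, ReductionParamsHash must expose size_t operator()(const ReductionParams&) const, and keys that compare equal must hash equally. A hedged illustration with hypothetical fields (the real members are not shown on this page):

#include <cstddef>
#include <unordered_map>

// Hypothetical ReductionParams: field names are illustrative only; the real
// definition lives in torch/csrc/jit/codegen/cuda/scheduler.h.
struct ReductionParams {
  bool fastest_dim = true;       // reduce along the fastest-varying dimension?
  bool cross_block = false;      // cooperate across threads of a block?
  bool cross_grid = false;       // cooperate across blocks of the grid?
  bool mul_reds_per_blk = false; // multiple reductions per block?

  // Keys that compare equal must hash equal, so operator== and the hash
  // functor below must inspect exactly the same fields.
  bool operator==(const ReductionParams& other) const {
    return fastest_dim == other.fastest_dim &&
        cross_block == other.cross_block && cross_grid == other.cross_grid &&
        mul_reds_per_blk == other.mul_reds_per_blk;
  }
};

// Hypothetical ReductionParamsHash: packs the flags into distinct bits so
// distinct parameter combinations land in distinct buckets.
struct ReductionParamsHash {
  std::size_t operator()(const ReductionParams& p) const {
    return (static_cast<std::size_t>(p.fastest_dim) << 0) |
        (static_cast<std::size_t>(p.cross_block) << 1) |
        (static_cast<std::size_t>(p.cross_grid) << 2) |
        (static_cast<std::size_t>(p.mul_reds_per_blk) << 3);
  }
};

int main() {
  // Same shape of map as red_fusion_executor_cache_, with int standing in for
  // FusionExecutor.
  std::unordered_map<ReductionParams, int, ReductionParamsHash> cache;
  ReductionParams a, b; // default params compare equal...
  cache[a] = 1;
  return cache[b] == 1 ? 0 : 1; // ...so they share one cache entry.
}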
