Add support for some empty fusion (#1981)
zasdfgbnm committed Sep 14, 2022
1 parent eabe8d8 commit 634820c
Showing 4 changed files with 133 additions and 7 deletions.
5 changes: 0 additions & 5 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -825,11 +825,6 @@ void bindInputForExprEvaluation(

const auto value =
root_domain[dim]->hasExpandedExtent() ? 1 : tensor_arg_size;
if (value == 0 && cg_tensor->uses().empty()) {
// If there are no uses, ignore the size-0 dimension.
continue;
}
TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions");
bool should_bind = true;
if (check_consistency) {
const auto prev_value = expr_eval.evaluate(extent);
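
The deleted block above special-cased size-0 dimensions: unused ones were skipped and every other one hit a hard assert. With the rest of this commit in place, a zero extent now binds like any ordinary value. Below is a toy sketch of that binding behavior; bind_extent and the string-keyed map are hypothetical stand-ins for nvfuser's expression-evaluator binding, not the real API.

// Illustrative only: a toy extent binder showing the new behavior, where a
// size-0 dimension binds like any other value instead of tripping an assert.
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

void bind_extent(
    std::map<std::string, int64_t>& bindings,
    const std::string& extent_name,
    int64_t value) {
  // Old behavior (removed above): reject with "Cannot handle size-0 dimensions".
  // New behavior: zero extents are accepted, so empty tensors can flow through.
  auto [it, inserted] = bindings.emplace(extent_name, value);
  if (!inserted) {
    // Re-binding must be consistent, mirroring the check_consistency path.
    assert(it->second == value && "inconsistent extent binding");
  }
}

int main() {
  std::map<std::string, int64_t> bindings;
  bind_extent(bindings, "T0.size[0]", 0); // empty dimension now binds fine
  bind_extent(bindings, "T0.size[2]", 9223372036854775807LL); // huge extent, zero elements overall
  return 0;
}
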
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -663,6 +663,12 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
const auto iter = output_holder.find(output);
if (iter != output_holder.end()) {
fusion_outputs.push_back(iter->second);
} else if (output->isFusionInput()) {
const auto iter = tensor_map.find(output);
TORCH_INTERNAL_ASSERT(
iter != tensor_map.end(), "Cannot find output as aliased input");
auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
fusion_outputs.push_back(arg->getTensor());
} else {
bool empty_type_check = output->getDataType().has_value() &&
output->getDataType().value() == DataType::Float;
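
The new isFusionInput() branch covers fusions whose outputs alias inputs directly (exercised by the FusionEmpty_CUDA test below): such an output is produced by no kernel segment, so it is looked up in the input tensor map instead. A minimal sketch of the resulting three-way resolution order follows; the Tensor struct and resolveOutput are hypothetical stand-ins, not the real nvfuser types.

// Sketch only: resolve a requested output from (1) computed segment results,
// (2) the fusion input it aliases, or (3) an empty placeholder otherwise.
#include <stdexcept>
#include <string>
#include <unordered_map>

struct Tensor {}; // stand-in for at::Tensor

Tensor resolveOutput(
    const std::string& output,
    const std::unordered_map<std::string, Tensor>& computed,
    const std::unordered_map<std::string, Tensor>& inputs,
    bool output_is_fusion_input) {
  if (auto it = computed.find(output); it != computed.end()) {
    return it->second; // produced by a kernel segment
  }
  if (output_is_fusion_input) {
    // Mirrors the new branch: the output is just an input handed back.
    auto it = inputs.find(output);
    if (it == inputs.end()) {
      throw std::runtime_error("Cannot find output as aliased input");
    }
    return it->second;
  }
  return Tensor{}; // fall through: allocate an empty output
}
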
20 changes: 18 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -866,8 +866,22 @@ class NoOpScheduler : public SchedulerEntry {
//! Check if the no-op heuristics apply in given fusion
static bool canScheduleCompileTime(Fusion* fusion) {
// Check that any non-trivial reduction ops reduce only zero-element tensors.
if (!ir_utils::getReductionOps(fusion, true /* ignore_trivial */).empty()) {
return false;
for (auto reduction :
ir_utils::getReductionOps(fusion, true /* ignore_trivial */)) {
for (auto input :
ir_utils::filterByType<TensorView>(reduction->inputs())) {
auto root_dom = input->getRootDomain();
auto all_nonzero =
std::none_of(root_dom.begin(), root_dom.end(), [](IterDomain* id) {
return id->extent()->isZeroInt();
});
if (all_nonzero) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::NoOp,
"reduction of non-zero elements is not supported");
return false;
}
}
}

// Check that all outputs are either broadcast or ignored reduction.
@@ -893,6 +907,8 @@ class NoOpScheduler : public SchedulerEntry {
[](IterDomain* id) { return id->extent()->isZeroInt(); })) {
// We have found an out_tv with a dimension that the NoOp scheduler cannot
// handle, so we reject this fusion.
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::NoOp, "output has a concrete dimension");
return false;
}
}
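
Taken together, these two hunks relax the NoOp scheduler: instead of rejecting every fusion containing a non-trivial reduction, it now accepts reductions whose input has at least one zero-sized root dimension (there is nothing to reduce), and both rejection paths now log a reason. A standalone sketch of the acceptance rule on plain extents, mirroring the std::none_of check above (reductionIsNoOp is an illustrative name, not nvfuser API):

// Minimal sketch: a reduction is a no-op only when at least one root-domain
// extent of its input is zero, i.e. the reduction touches no elements.
#include <algorithm>
#include <cstdint>
#include <vector>

bool reductionIsNoOp(const std::vector<int64_t>& root_extents) {
  // Same shape as the std::none_of check above: if no extent is zero, the
  // reduction reads real elements and the NoOp scheduler must reject it.
  bool all_nonzero = std::none_of(
      root_extents.begin(), root_extents.end(),
      [](int64_t extent) { return extent == 0; });
  return !all_nonzero;
}

// Example: extents {0, 1, 9223372036854775807} are empty overall, so that
// reduction is schedulable as a no-op; {10, 10, 10} is not.
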
109 changes: 109 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -25824,6 +25824,115 @@ TEST_F(NVFuserTest, FusionNullScheduler_CUDA) {
}
}

// Simple test case exercising the null scheduler path.
TEST_F(NVFuserTest, FusionNullScheduler2_CUDA) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeConcreteTensor({0, 1, 9223372036854775807L});
fusion->addInput(tv0);

auto tv1 = sum(tv0, {0, 1, 2});

fusion->addOutput(tv1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({0, 1, 9223372036854775807L}, options);

std::vector<IValue> aten_inputs({t0});

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

auto t1 = t0.sum({0, 1, 2});

testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);

auto groups =
executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

// Check that all groups on the resulting runtime are null.
for (auto group : groups) {
TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
}
}

// Simple test case exercising the null scheduler path.
TEST_F(NVFuserTest, FusionNullScheduler3_CUDA) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder().ndims(0).build();
auto tv1 = TensorViewBuilder().ndims(0).build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
fusion->addOutput(tv2);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({}, options);
at::Tensor t1 = at::randn({}, options);

std::vector<IValue> aten_inputs({t0, t1});

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

testValidate(
executor_cache.fusion(),
cg_outputs,
{t0, t1},
{t0 + t1},
__LINE__,
__FILE__);

auto groups =
executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

// Check that all groups on the resulting runtime are null.
for (auto group : groups) {
TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
}
}

TEST_F(NVFuserTest, FusionEmpty_CUDA) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeConcreteTensor({10, 10, 10});
auto tv1 = makeConcreteTensor({10, 10, 10});
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addOutput(tv0);
fusion->addOutput(tv1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({10, 10, 10}, options);
at::Tensor t1 = at::randn({10, 10, 10}, options);

std::vector<IValue> aten_inputs({t0, t1});

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

testValidate(
executor_cache.fusion(),
cg_outputs,
{t0, t1},
{t0, t1},
__LINE__,
__FILE__);

auto groups =
executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

// Check that all groups on the resulting runtime are null.
for (auto group : groups) {
TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
}
}

TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
