Segment self mapping fusions #1954

Merged · 8 commits · Sep 14, 2022

73 changes: 42 additions & 31 deletions torch/csrc/jit/codegen/cuda/compute_at_map.cpp
@@ -6,6 +6,8 @@
#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>

#include <tuple>

namespace torch {
namespace jit {
namespace fuser {
@@ -29,8 +31,22 @@ bool idIsALeafDomain(IterDomain* id, TensorView* tv) {

} // namespace

IterDomainGraph::IterDomainGraph(Fusion* fusion) {
IterDomainGraph::IterDomainGraph(Fusion* fusion, bool allow_self_mapping) {
build(fusion);

if (!allow_self_mapping) {
TORCH_INTERNAL_ASSERT(
!hasSelfMapping(),
"Unsupported domain mapping detected in ",
std::get<0>(*self_mapping_info_)->toString(),
". ",
std::get<3>(*self_mapping_info_),
" domains, ",
std::get<1>(*self_mapping_info_)->toString(),
" and ",
std::get<2>(*self_mapping_info_)->toString(),
", are mapped with each other.");
}
}

//! Map corresponding inputs and outputs of swizzle op together
@@ -197,7 +213,8 @@ c10::optional<std::pair<IterDomain*, IterDomain*>> detectMappablePair(
// those domains should never be mapped with each other. It may be
// possible to lift this assumption, but it's unclear if it could
// matter in practice.
void failIfSelfMappingExists(Fusion* fusion, const IterDomainGraph& id_graph) {
c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
findFirstSelfMapping(Fusion* fusion, const IterDomainGraph& id_graph) {
for (auto tv : ir_utils::allTvs(fusion)) {
// For each tensor, make sure root, rfactor and leaf domains
// should not include domains that are mapped with another domain
@@ -207,44 +224,39 @@ void failIfSelfMappingExists(Fusion* fusion, const IterDomainGraph& id_graph) {
// Root domains
auto self_mappped_root_pair =
detectMappablePair(tv->getRootDomain(), id_graph);
TORCH_INTERNAL_ASSERT(
!self_mappped_root_pair.has_value(),
"Unsupported domain mapping detected in ",
tv->toString(),
". Root domains, ",
self_mappped_root_pair->first->toString(),
" and ",
self_mappped_root_pair->second->toString(),
", are mapped with each other.");
if (self_mappped_root_pair.has_value()) {
return std::make_tuple(
tv,
self_mappped_root_pair->first,
self_mappped_root_pair->second,
"Root");
}

// Rfactor domains
if (tv->hasRFactor()) {
auto self_mappped_rf_pair =
detectMappablePair(tv->getRFactorDomain(), id_graph);
TORCH_INTERNAL_ASSERT(
!self_mappped_rf_pair.has_value(),
"Unsupported domain mapping detected in ",
tv->toString(),
". RFactor domains, ",
self_mappped_rf_pair->first->toString(),
" and ",
self_mappped_rf_pair->second->toString(),
", are mapped with each other.");
if (self_mappped_rf_pair.has_value()) {
return std::make_tuple(
tv,
self_mappped_rf_pair->first,
self_mappped_rf_pair->second,
"RFactor");
}
}

// Leaf domains
auto self_mappped_leaf_pair =
detectMappablePair(tv->domain()->domain(), id_graph);
TORCH_INTERNAL_ASSERT(
!self_mappped_leaf_pair.has_value(),
"Unsupported domain mapping detected in ",
tv->toString(),
". Leaf domains, ",
self_mappped_leaf_pair->first->toString(),
" and ",
self_mappped_leaf_pair->second->toString(),
", are mapped with each other.");
if (self_mappped_leaf_pair.has_value()) {
return std::make_tuple(
tv,
self_mappped_leaf_pair->first,
self_mappped_leaf_pair->second,
"Leaf");
}
}
return c10::nullopt;
}

} // namespace
@@ -591,8 +603,7 @@ void IterDomainGraph::build(Fusion* fusion) {
}
}
}

failIfSelfMappingExists(fusion, *this);
self_mapping_info_ = findFirstSelfMapping(fusion, *this);
}

void IterDomainGraph::initializeId(
9 changes: 8 additions & 1 deletion torch/csrc/jit/codegen/cuda/compute_at_map.h
@@ -54,7 +54,7 @@ namespace cuda {
// Do not forward through any broadcast IDs
class TORCH_CUDA_CU_API IterDomainGraph {
public:
IterDomainGraph(Fusion* fusion);
IterDomainGraph(Fusion* fusion, bool allow_self_mapping = false);

const DisjointSets<IterDomain*>& permissiveNodes() const {
return permissive_nodes_;
@@ -88,6 +88,10 @@ class TORCH_CUDA_CU_API IterDomainGraph {
return view_rfactor_ids_;
}

bool hasSelfMapping() const {
return self_mapping_info_.has_value();
}

private:
void build(Fusion* fusion);

@@ -116,6 +120,9 @@ class TORCH_CUDA_CU_API IterDomainGraph {
VectorOfUniqueEntries<IterDomain*> all_ids_;

std::unordered_set<IterDomain*> view_rfactor_ids_;

c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
self_mapping_info_ = c10::nullopt;
};

class TrivialReductionInfo;
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -1501,6 +1501,9 @@ bool checkCanSchedule(
if (!isConnectedFusionGraph(fusion)) {
return false;
}
if (IterDomainGraph(fusion, /*allow_self_mapping=*/true).hasSelfMapping()) {
return false;
}
if (!SchedulerType::canScheduleCompileTime(fusion)) {
return false;
}
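
For reference, a minimal sketch of how the new constructor flag and hasSelfMapping() query compose outside the scheduler, using the same fusion the test below constructs. It assumes the nvFuser headers and test helpers (makeContigTensor, transpose, add) that appear elsewhere in this diff; the wrapper function name is hypothetical and the code is an illustration, not part of this PR.

// Sketch only: probe a fusion for self mapping without tripping the
// constructor assert. Helper name is hypothetical; the calls mirror
// registry.cpp above and the test below.
bool fusionHasSelfMapping() {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  auto tv0 = makeContigTensor(2);
  fusion_ptr->addInput(tv0);
  auto tv1 = transpose(tv0, 0, 1);
  auto tv2 = add(tv0, tv1); // tv0 meets its own transpose -> self mapping
  fusion_ptr->addOutput(tv2);

  // allow_self_mapping=true skips the TORCH_INTERNAL_ASSERT in the new
  // constructor, so the graph can be built and then queried.
  IterDomainGraph id_graph(fusion_ptr.get(), /*allow_self_mapping=*/true);
  return id_graph.hasSelfMapping(); // true here; checkCanSchedule() then
                                    // returns false and the fusion is segmented
}
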
56 changes: 56 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -4,6 +4,7 @@

#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
@@ -795,6 +796,61 @@ TEST_F(NVFuserTest, FusionViewNoTranspose_CUDA) {
TORCH_CHECK(!hasAtLeastTwoValidGroups(&fusion));
}

TEST_F(NVFuserTest, FusionTransposeSelfMapping_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(2);
fusion.addInput(tv0);
auto tv1 = transpose(tv0, 0, 1);
auto tv2 = add(tv0, tv1);
fusion.addOutput(tv2);

EXPECT_THAT(
[&]() { IterDomainGraph(fusion_ptr.get()); },
testing::ThrowsMessage<c10::Error>(
testing::HasSubstr("Unsupported domain mapping detected")));

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn({5, 5}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});

auto ref = t0.transpose(0, 1) + t0;

testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}

#if 0
// silent wrong result
TEST_F(NVFuserTest, FusionTransposeViewSelfMapping_CUDA) {
Inline review thread on lines +827 to +829:
Owner:
This doesn't get caught?

Owner:
Random thought: we might want to make sure the disjoint view sets, and something similar for the axes involved in a permutation, match. The issue here seems to be that we're transposing the same partial dimensions. We probably want to make sure that, throughout the fusion, the dimensions active in a transpose are dimensions that are disjoint in the view set.

Owner:
Yeah, thinking through the analysis, I don't know if we'd need to build something like a transpose disjoint map, but I think it might be enough if we check that all the dimensions active in a transpose are disjoint. I'm wondering if we could somehow get in trouble with a series of transposes that are all individually disjoint in the view map, but could still exhibit the behavior above, where we're effectively transposing "partial" dimensions.

Owner:
Any thoughts on this, @zasdfgbnm?

zasdfgbnm (Collaborator, Author) commented on Sep 13, 2022:

> might be enough if we check all the dimensions active in a transpose are disjoint

I feel that this is in the wrong direction. For example, torch.zeros(6).view(2, 3).transpose(0, 1) should be totally fine to schedule, but the transpose dimensions are not disjoint in the view disjoint map.

In my opinion, using solely a disjoint set is not sufficient for modeling this problem. That is, we cannot just come up with a new fancy disjoint set of IterDomains such that a fusion is schedulable iff its disjoint set is really disjoint.

I think we should do the following analysis: For the DAG

       T0[I0, I1]
       /        \
     view   transpose
      |          |
T1[I2, I3]   T2[I4, I5]
       \       /
       T3[I6, I7]

We first start from the view operation. What the view operation tells us is:

  1. On the producer side, {I0, I1} must be ordered as (I0, I1) and I0 is contiguous.
  2. On the consumer side, {I2, I3} must be ordered as (I2, I3) and I2 is contiguous.

If we propagate this order and contiguity information along the DAG, at some point we will see a conflict. For example, if we propagate T0->T2 and T1->T3->T2, then:

  1. From the first path, we will conclude that {I4, I5} must be ordered as (I5, I4) and I5 is contiguous.
  2. From the second path, we will conclude that {I4, I5} must be ordered as (I4, I5) and I4 is contiguous.

The above 1 and 2 are in conflict, so we reject this fusion.
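
To make the proposed check concrete, here is a minimal standalone sketch of detecting such an order/contiguity conflict. It is an editorial illustration, not code from this PR or from nvFuser; OrderConstraint, conflictsWith, and the hard-coded constraints are hypothetical stand-ins for whatever propagation machinery would actually compute them.

// Sketch of the conflict check described above. All names are hypothetical.
#include <iostream>
#include <string>
#include <vector>

// An ordering/contiguity requirement on a set of iteration domains, as a view
// op would impose on its producer or consumer side.
struct OrderConstraint {
  std::vector<std::string> order; // required relative order of the domains
  std::string contiguous;         // domain required to be contiguous
};

// Two constraints over the same domains conflict if they demand a different
// order or a different contiguous domain.
bool conflictsWith(const OrderConstraint& a, const OrderConstraint& b) {
  return a.order != b.order || a.contiguous != b.contiguous;
}

int main() {
  // Constraint on T2's domains propagated along T0 -> T2 (through the
  // transpose): order (I5, I4), with I5 contiguous.
  OrderConstraint via_transpose{{"I5", "I4"}, "I5"};
  // Constraint propagated along T1 -> T3 -> T2 (through the view):
  // order (I4, I5), with I4 contiguous.
  OrderConstraint via_view{{"I4", "I5"}, "I4"};

  if (conflictsWith(via_transpose, via_view)) {
    std::cout << "Conflicting order/contiguity constraints: "
                 "reject (segment) this fusion\n";
  }
  return 0;
}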

zasdfgbnm (Collaborator, Author):
Should we merge this PR, and discuss and fix the skipped test in a follow-up? Transpose is already enabled on TorchScript in TOT devel, and this would prevent the bug from happening on transpose.

Owner:
Yeah, let's merge this, then please move your comment to an issue and we can discuss further there.

std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(2);
fusion.addInput(tv0);
auto tv1 = transpose(tv0, 0, 1);
auto tv2 = view(tv0, {2, 3}, {3, 2});
auto tv3 = add(tv1, tv2);
fusion.addOutput(tv3);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn({2, 3}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});

auto ref = t0.transpose(0, 1) + t0.view({3, 2});

testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}
#endif

// t0------------.
// t2->broadcast->sub->mul->relu->t6
// t1------------------'