From e84fff8dacf382351b725d0d1f469168e0db28c3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 24 Aug 2022 10:19:53 -0700 Subject: [PATCH 01/32] Enable transpose scheduler --- torch/csrc/jit/codegen/cuda/scheduler/registry.cpp | 6 ------ torch/csrc/jit/codegen/cuda/utils.cpp | 5 +---- torch/csrc/jit/codegen/cuda/utils.h | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index ac7f66836a87c..2498c4b55b1f2 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1256,12 +1256,6 @@ class TransposeScheduler : public SchedulerEntry { } static bool canScheduleCompileTime(Fusion* fusion) { - if (!isOptionEnabled(EnableOption::TransposeScheduler)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, "not enabled"); - return false; - } - // Temporarily disallow view in transpose scheduler // TODO Add more testing before enabling auto view_tvs = scheduler_utils::getViewTVs(fusion); diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp index 5e82014c0c388..8eec9d7db0c45 100644 --- a/torch/csrc/jit/codegen/cuda/utils.cpp +++ b/torch/csrc/jit/codegen/cuda/utils.cpp @@ -169,8 +169,7 @@ auto parseEnableOptions() { {EnableOption::Complex, false}, {EnableOption::KernelProfile, false}, {EnableOption::LinearDecomposition, false}, - {EnableOption::ConvDecomposition, false}, - {EnableOption::TransposeScheduler, false}}; + {EnableOption::ConvDecomposition, false}}; if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) { c10::string_view options_view(dump_options); @@ -185,8 +184,6 @@ auto parseEnableOptions() { options_map[EnableOption::LinearDecomposition] = true; } else if (token == "conv_decomposition") { options_map[EnableOption::ConvDecomposition] = true; - } else if (token == "transpose_scheduler") { - options_map[EnableOption::TransposeScheduler] = true; } else { TORCH_CHECK( false, diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h index 679776b383af0..6c29b40550f00 100644 --- a/torch/csrc/jit/codegen/cuda/utils.h +++ b/torch/csrc/jit/codegen/cuda/utils.h @@ -80,7 +80,6 @@ enum class EnableOption { KernelProfile, //! Enable intra-kernel performance profiling LinearDecomposition, //! Enable linear-bias decomposition ConvDecomposition, //! Enable conv-bias decomposition - TransposeScheduler //! 
Enable the experimental transpose scheduler }; TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option); From bb8cef111c4828a9690ba68752dca84acd83fd08 Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 10:52:49 -0700 Subject: [PATCH 02/32] add lower index resolution --- torch/csrc/jit/codegen/cuda/index_compute.cpp | 33 +++++++++++ .../jit/codegen/cuda/lower_index_compute.cpp | 57 +++++++++++++++++++ .../jit/codegen/cuda/lower_index_compute.h | 12 ++++ 3 files changed, 102 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index 895c3e4bd96ef..e7feff88fb10c 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -650,6 +650,8 @@ IndexCompute::IndexCompute( } void IndexCompute::run(const LoopIndexing& loop_indexing) { + TORCH_INTERNAL_ASSERT( + concrete_id_pass_, "concrete pass only for this option"); // Apply loop swizzles if there are any that outputs to // the loop domains. // Currently only support loop swizzles that directly output @@ -669,9 +671,40 @@ void IndexCompute::run(const LoopIndexing& loop_indexing) { } } + // Resolve the out of line expressions first: + std::unordered_map permissive_index_map; + + for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { + handle(expr); + + // Collect backward results from this expression if they are + // made available in by this expression. + auto id_inputs = ir_utils::filterByType(expr->inputs()); + for (auto id : id_inputs) { + auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); + if (idx_it != index_map_.end()) { + permissive_index_map[GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE)] = idx_it->second; + } + } + } + // Run through the loop indexing expressions and generate // the indexing integer math for the concrete ids. for (auto expr : loop_indexing.getBackwardExprList()) { + auto id_outputs = ir_utils::filterByType(expr->outputs()); + + for (auto id : id_outputs) { + auto concrete_id = ir_utils::caMapExactConcreteId(id); + if (!index_map_.count(concrete_id)) { + auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE); + auto permissive_it = permissive_index_map.find(permissive_id); + if (permissive_it != permissive_index_map.end()) { + index_map_[concrete_id] = permissive_it->second; + } + } + } handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp index 70b019a4cc48c..2d4444d340903 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp @@ -438,6 +438,7 @@ class LoopIndexingAnalysis { indexing.loop_root_ = loop_root_domains_; indexing.loop_domains_ = loop_domains_.vector(); indexing.index_exprs_ = replayed_exprs_; + indexing.out_of_line_exprs_ = out_of_line_exprs_; return indexing; } @@ -481,6 +482,12 @@ class LoopIndexingAnalysis { //! loop_domains_ with all of these iter domains. void constructLoopDomains(); + //! Fills out_of_line_exprs_ by traversing the selected list of + //! expressions in reverse topological order and collect iterdomains + //! on the indexing paths that only involves leaf id's on the right + //! of consumer's ca axis. + void collectOutOfLineExprs(); + private: //! Original loop nest input to derive info from. const std::vector& loops_; @@ -521,6 +528,10 @@ class LoopIndexingAnalysis { //! 
Selected list of exprs that will produce and consume each //! of the exact concrete ids from the loop nest exactly once. std::vector replayed_exprs_; + + //! Set of expressions from the selected list that can be + //! resolved from axes on the right of ca axes. + std::vector out_of_line_exprs_; }; LoopIndexingAnalysis::LoopIndexingAnalysis( @@ -559,6 +570,10 @@ LoopIndexingAnalysis::LoopIndexingAnalysis( // Reconstruct the iterdomain view of the original loopnest after resolving // the exact definition of each index. constructLoopDomains(); + + //! Collect the set of indexing expressions that can be + //! resolved out of line. + collectOutOfLineExprs(); } void LoopIndexingAnalysis::validateLoopStructure( @@ -1088,6 +1103,48 @@ std::vector LoopIndexingTraversal::getExprList() { } // namespace +void LoopIndexingAnalysis::collectOutOfLineExprs() { + // Keep track of all the id's that can be resolved without + // iterdomains on the left of ca axes. + std::unordered_set out_of_line_ids; + + // Start the set with all the leaf ids. + std::transform( + consumer_tv_->domain()->domain().begin() + + consumer_tv_->getComputeAtPosition(), + consumer_tv_->domain()->domain().end(), + std::inserter(out_of_line_ids, out_of_line_ids.end()), + ir_utils::caMapExactConcreteId); + + // Get the original selected list of index expressions + // in reverse topological order. + auto backward_expr_list = + LoopIndexingTraversal::backwardTopologicalOrder(replayed_exprs_); + + for (auto expr : backward_expr_list) { + auto id_outputs = ir_utils::filterByType(expr->outputs()); + if ( + // Check that all of the outputs are out of line + std::all_of( + id_outputs.begin(), + id_outputs.end(), + [&out_of_line_ids](IterDomain* id) { + return out_of_line_ids.count(ir_utils::caMapExactConcreteId(id)); + })) { + // Record out of line expression + out_of_line_exprs_.push_back(expr); + + // Add all of the expression inputs as out of line id's. + auto id_inputs = ir_utils::filterByType(expr->inputs()); + std::transform( + id_inputs.begin(), + id_inputs.end(), + std::inserter(out_of_line_ids, out_of_line_ids.end()), + ir_utils::caMapExactConcreteId); + } + } +} + std::vector LoopIndexing::getForwardExprList() const { return LoopIndexingTraversal::forwardTopologicalOrder(index_exprs_); } diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.h b/torch/csrc/jit/codegen/cuda/lower_index_compute.h index d8d4dd7103b3a..4b81fd0dec0c5 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index_compute.h +++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.h @@ -127,6 +127,12 @@ class LoopIndexing { //! topological order. std::vector getBackwardExprList() const; + //! Returns the set of out of line expressions in + //! reverse topological order. + const std::vector& getBackwardOutOfLineExprList() const { + return out_of_line_exprs_; + } + //! Returns all exact concrete id's that were produced //! or consumed in the selected indexing expressions std::unordered_set getAllExactConcreteIdSet() const; @@ -152,6 +158,12 @@ class LoopIndexing { //! The selected sequence of expressions that should represent //! the correct indexing math from the given loop nest. std::vector index_exprs_; + + //! The subset of sequence of expressions that can be resolved + //! with only the iterdomains on the right of consumer tv's ca + //! axis. + //! Expressions are ordered in reverse topological order. 
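The selection rule behind collectOutOfLineExprs is a single backward sweep: seed a set with the leaf iterdomains to the right of the consumer's compute-at position, accept an expression only when every one of its outputs is already in the set, and then add its inputs so that earlier expressions can be accepted too. A minimal standalone sketch of that rule, with plain ints standing in for exact-mapped concrete IterDomains (toy types for illustration, not part of this patch):

    #include <unordered_set>
    #include <vector>

    struct ToyExpr {
      std::vector<int> inputs;
      std::vector<int> outputs;
    };

    // `backward_order` is assumed to already be in reverse topological order,
    // mirroring LoopIndexingTraversal::backwardTopologicalOrder above.
    std::vector<const ToyExpr*> collectOutOfLine(
        const std::vector<ToyExpr>& backward_order,
        std::unordered_set<int> out_of_line_ids) { // seeded with the not-inlined leaf ids
      std::vector<const ToyExpr*> out_of_line_exprs;
      for (const ToyExpr& e : backward_order) {
        bool all_outputs_resolved = true;
        for (int id : e.outputs) {
          if (!out_of_line_ids.count(id)) {
            all_outputs_resolved = false;
            break;
          }
        }
        if (all_outputs_resolved) {
          // Every output is reachable from not-inlined ids, so the expression
          // is out of line and its inputs become resolvable as well.
          out_of_line_exprs.push_back(&e);
          out_of_line_ids.insert(e.inputs.begin(), e.inputs.end());
        }
      }
      return out_of_line_exprs;
    }

The member declared next holds exactly this ordered subset.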
+ std::vector out_of_line_exprs_; }; // When indexing there are sometimes an option to propagate an index down From 799230907888fa218b300c9156979f38a54d4de1 Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 13:52:24 -0700 Subject: [PATCH 03/32] add repro --- .../codegen/cuda/test/test_gpu_tensorcore.cpp | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp index c00d02c8a40dd..fa89f9e91a79a 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp @@ -2857,6 +2857,67 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) { } } +TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(4); + auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = add(tv0, tv1); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({1, 1, 333, 1}, options); + at::Tensor input1 = at::randn({1, 1, 333, 1}, options); + + auto lparams = scheduleTranspose(&fusion, {input0, input1}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input0, input1}, lparams); + auto outputs = fe.runFusion({input0, input1}, lparams); + + auto tv_ref = input0 + input1; + + testValidate( + &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = set(tv0); + auto tv3 = broadcast(tv2, {true, false}); + auto tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + tv4->merge(0); + tv4->split(0, 32); + + tv0->computeAt(tv4, 1); + + tv2->split(-1, 8); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({123}, options); + at::Tensor t1 = at::randn({3, 123}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + + auto outputs = fe.runFusion({t0, t1}); + + auto tv_ref = t0 + t1; + + testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); +} + #undef NVFUSER_TEST_CUDA_ARCH_GUARD } // namespace jit From e9d09fe57264443cbe0ca22fd1c0b90d2f50175e Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 15:39:31 -0700 Subject: [PATCH 04/32] clear GPU memory after test --- torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h index 0247c33c8a726..05a4fd600b653 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h @@ -6,6 +6,7 @@ #include #include +#include #include @@ -36,6 +37,10 @@ class NVFuserTest : public ::testing::Test { GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs"; } } + + void TearDown() override{ + c10::cuda::CUDACachingAllocator::emptyCache(); + } }; struct ValidationConstants { From f406e23bb49c755a2d97c04f839d5dc8bbc0384b Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 19:33:35 -0700 Subject: [PATCH 05/32] cleanup and comment --- torch/csrc/jit/codegen/cuda/index_compute.cpp | 88 +++++++++++++------ 
torch/csrc/jit/codegen/cuda/index_compute.h | 22 +++++ torch/csrc/jit/codegen/cuda/test/test_gpu.cpp | 63 +++++++++++++ .../codegen/cuda/test/test_gpu_tensorcore.cpp | 61 ------------- .../codegen/cuda/test/test_gpu_validator.h | 4 +- 5 files changed, 149 insertions(+), 89 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index e7feff88fb10c..edd4fcf2c1acb 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -671,41 +671,77 @@ void IndexCompute::run(const LoopIndexing& loop_indexing) { } } - // Resolve the out of line expressions first: - std::unordered_map permissive_index_map; + // Resolve the index vals that could be resolved with only + // the loops that consumer_tv doesn't share with any of its + // consumers, i.e. the not-inlined loops that define consumer_tv + // values. + collectIndexIntoPermissiveMap(loop_indexing); + + // Run through the loop indexing expressions and generate + // the indexing integer math for the concrete ids. + for (auto expr : loop_indexing.getBackwardExprList()) { + // Resolve missing values from permissive map. + updateIndexMapFromPermissiveMap(expr); - for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { handle(expr); + } +} - // Collect backward results from this expression if they are - // made available in by this expression. - auto id_inputs = ir_utils::filterByType(expr->inputs()); - for (auto id : id_inputs) { - auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); - if (idx_it != index_map_.end()) { - permissive_index_map[GpuLower::current()->caMap()->getConcreteMappedID( - id, IdMappingMode::PERMISSIVE)] = idx_it->second; +void IndexCompute::collectIndexIntoPermissiveMap( + const LoopIndexing& loop_indexing) { + // Visit the expressions that only produces un-inlined iterdomains, + // in reverse topological order. + for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { + // Compute indexing vals for the expression inputs. + // + // This stage should run before any indexing computation so it could be + // made sure that all index values computed at this stage are + // the ones that can be resolved only with the not-inlined + // iterdomains. + // + auto id_outputs = ir_utils::filterByType(expr->outputs()); + if (std::all_of( + id_outputs.begin(), id_outputs.end(), [this](IterDomain* id) { + return index_map_.count(ir_utils::caMapExactConcreteId(id)); + })) { + // Visit this expression: + // LoopIndexingAnalysis::traverseFromDomainVals made sure that each + // concrete index is bound exactly once so computing these expressions + // early should still be consistent. + handle(expr); + + auto id_inputs = ir_utils::filterByType(expr->inputs()); + for (auto id : id_inputs) { + // Collect backward pass results from this expression if they are + // made available in by this expression. + auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); + + if (idx_it != index_map_.end()) { + permissive_index_map_ + [GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE)] = idx_it->second; + } } } } +} - // Run through the loop indexing expressions and generate - // the indexing integer math for the concrete ids. 
- for (auto expr : loop_indexing.getBackwardExprList()) { - auto id_outputs = ir_utils::filterByType(expr->outputs()); - - for (auto id : id_outputs) { - auto concrete_id = ir_utils::caMapExactConcreteId(id); - if (!index_map_.count(concrete_id)) { - auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( - id, IdMappingMode::PERMISSIVE); - auto permissive_it = permissive_index_map.find(permissive_id); - if (permissive_it != permissive_index_map.end()) { - index_map_[concrete_id] = permissive_it->second; - } +void IndexCompute::updateIndexMapFromPermissiveMap(const Expr* id_expr) { + auto id_outputs = ir_utils::filterByType(id_expr->outputs()); + for (auto id : id_outputs) { + auto concrete_id = ir_utils::caMapExactConcreteId(id); + // Only try to copy index val from permissive map when + // the index is missing. + if (!index_map_.count(concrete_id)) { + auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE); + // Write the permissive index val into index_map_ if the + // missing value is found here. + auto permissive_it = permissive_index_map_.find(permissive_id); + if (permissive_it != permissive_index_map_.end()) { + index_map_[concrete_id] = permissive_it->second; } } - handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h index f064ebba293cb..3d865b4a8ceb0 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.h +++ b/torch/csrc/jit/codegen/cuda/index_compute.h @@ -86,6 +86,18 @@ class IndexCompute : public BackwardVisitor { //! based traversal. IterDomain* maybeGetExactMapConcreteID(IterDomain* id); + //! (Concrete indexing pass only) + //! Collect permissive index binding from the given expression. + //! See also permissive_map_ and LoopIndexing::getBackwardOutOfLineExprList. + void collectIndexIntoPermissiveMap(const LoopIndexing& loop_indexing); + + //! (Concrete indexing pass only) + //! Iterate through id_expr's input and pull index vals from permissive + //! map, when both of the following are true: + //! 1. the output id is missing in index_map_. + //! 2. the output id is found in permissive map. + void updateIndexMapFromPermissiveMap(const Expr* id_expr); + // Tensor domain we're mapping back to root const TensorDomain* td_; // NOLINT @@ -137,6 +149,16 @@ class IndexCompute : public BackwardVisitor { // pass. See also [Note on swizzle mode] SwizzleMode swizzle_mode_ = SwizzleMode::NoSwizzle; + // (Concrete id pass only) + // Contains the indexing math that could be resolved with only the + // iterdomains on the right of the consumer_tv's ca axis, i.e. the + // ones that corresponding to the loops that consumer_tv would not + // share with any of its consumers. + // These indexing vals should be kept separate from index_map_ and + // should only be used when the indexing traversal follows the + // order defined in LoopIndexingAnalysis::traverseFromDomainVals. 
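The interaction between the two maps is a guarded fallback: each output id is first looked up under its exact concrete id, and a value from the permissive map is copied in only when that exact entry is missing, so a result produced by the regular backward traversal is never overwritten. A minimal sketch of that lookup, with ints standing in for the two concrete-id granularities and strings for the index Vals (illustrative names only, not the nvfuser API):

    #include <string>
    #include <unordered_map>

    // Toy stand-ins: the exact and permissive concrete ids of one IterDomain.
    struct ToyId {
      int exact_concrete;
      int permissive_concrete;
    };

    // Mirrors updateIndexMapFromPermissiveMap: fill only a *missing* exact
    // entry, never overwrite one the backward traversal already produced.
    void fillFromPermissive(
        const ToyId& id,
        std::unordered_map<int, std::string>& index_map,
        const std::unordered_map<int, std::string>& permissive_index_map) {
      if (index_map.count(id.exact_concrete)) {
        return; // the exact result wins; the permissive value is only a fallback
      }
      auto it = permissive_index_map.find(id.permissive_concrete);
      if (it != permissive_index_map.end()) {
        index_map[id.exact_concrete] = it->second;
      }
    }

The map declared below is the storage that this fallback reads from.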
+ std::unordered_map permissive_index_map_; + public: const std::unordered_map& indexMap() const { return index_map_; diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp index 4f72bf93ba36e..8f2d3927eb1c1 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp @@ -25512,6 +25512,69 @@ TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) { executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__); } +// Repro for issue #1925 +TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(4); + auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = add(tv0, tv1); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({1, 1, 333, 1}, options); + at::Tensor input1 = at::randn({1, 1, 333, 1}, options); + + auto lparams = scheduleTranspose(&fusion, {input0, input1}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input0, input1}, lparams); + auto outputs = fe.runFusion({input0, input1}, lparams); + + auto tv_ref = input0 + input1; + + testValidate( + &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); +} + +// Repro for issue #1873 +TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = set(tv0); + auto tv3 = broadcast(tv2, {true, false}); + auto tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + tv4->merge(0); + tv4->split(0, 32); + + tv0->computeAt(tv4, 1); + + tv2->split(-1, 8); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({123}, options); + at::Tensor t1 = at::randn({3, 123}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + + auto outputs = fe.runFusion({t0, t1}); + + auto tv_ref = t0 + t1; + + testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); +} + TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { // https://github.com/csarofeen/pytorch/issues/1926 std::unique_ptr fusion_ptr = std::make_unique(); diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp index fa89f9e91a79a..c00d02c8a40dd 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp @@ -2857,67 +2857,6 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) { } } -TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(4); - auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({1, 1, 333, 1}, options); - at::Tensor input1 = at::randn({1, 1, 333, 1}, options); - - auto lparams = scheduleTranspose(&fusion, {input0, input1}); - - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}, lparams); - auto outputs = fe.runFusion({input0, input1}, lparams); - - auto tv_ref = input0 + input1; - - testValidate( - &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); 
-} - -TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - auto tv1 = makeContigTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv2 = set(tv0); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->split(0, 32); - - tv0->computeAt(tv4, 1); - - tv2->split(-1, 8); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({123}, options); - at::Tensor t1 = at::randn({3, 123}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - - auto outputs = fe.runFusion({t0, t1}); - - auto tv_ref = t0 + t1; - - testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); -} - #undef NVFUSER_TEST_CUDA_ARCH_GUARD } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h index 05a4fd600b653..2d0bada1c0911 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h @@ -5,8 +5,8 @@ #include #include -#include #include +#include #include @@ -38,7 +38,7 @@ class NVFuserTest : public ::testing::Test { } } - void TearDown() override{ + void TearDown() override { c10::cuda::CUDACachingAllocator::emptyCache(); } }; From abe1f6dc5d11902ca25a27dea91a4e174f72de0d Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 00:20:09 -0700 Subject: [PATCH 06/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h | 4 ++++ torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h index 7947a27f48360..13d306fa06654 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h @@ -24,6 +24,10 @@ class DomainMap { return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT); } + bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const { + return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE); + } + const ComputeAtMap& getComputeAtMap() const { return ca_map_; } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 5ef502321b773..d9cf61fbcb0a0 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -59,9 +59,10 @@ class DomainMap : public pointwise_utils::DomainMap { } int getPosMappedTo(TensorView* tv, IterDomain* id) const { + std::cout << ca_map_.toString() << std::endl; const auto& dom = tv->domain()->domain(); for (auto i : c10::irange(dom.size())) { - if (areExactMapped(id, tv->axis(i))) { + if (arePermissiveMapped(tv->axis(i), id)) { return i; } } @@ -382,6 +383,9 @@ std::shared_ptr getTransposeHeuristics( auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2); + fusion->printMath(); + fusion->print(); + auto inner_most_pos1_in_ref1 = domain_map.getPosMappedTo(reference1, inner_most_id1); auto inner_most_pos2_in_ref1 = From bcc6f6cd0fb0945644ddb359f543e63dc21c889f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 00:25:36 -0700 Subject: [PATCH 07/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h | 6 
++++++ torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h index 13d306fa06654..8b40a306922ee 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h @@ -21,10 +21,16 @@ class DomainMap { virtual ~DomainMap() = default; bool areExactMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT); } bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE); } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index d9cf61fbcb0a0..5ac3f61702a5e 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -59,7 +59,6 @@ class DomainMap : public pointwise_utils::DomainMap { } int getPosMappedTo(TensorView* tv, IterDomain* id) const { - std::cout << ca_map_.toString() << std::endl; const auto& dom = tv->domain()->domain(); for (auto i : c10::irange(dom.size())) { if (arePermissiveMapped(tv->axis(i), id)) { @@ -383,9 +382,6 @@ std::shared_ptr getTransposeHeuristics( auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2); - fusion->printMath(); - fusion->print(); - auto inner_most_pos1_in_ref1 = domain_map.getPosMappedTo(reference1, inner_most_id1); auto inner_most_pos2_in_ref1 = From 63e3e762905a236c014505abe461211e12f3cfea Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 11:45:19 -0700 Subject: [PATCH 08/32] Allow splitting inner-most ID to create virtual innermost ID --- .../codegen/cuda/scheduler/pointwise_utils.h | 10 +++ .../jit/codegen/cuda/scheduler/transpose.cpp | 72 ++++++++++++++++--- .../codegen/cuda/test/test_gpu_transpose.cpp | 31 ++++++++ 3 files changed, 102 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h index 7947a27f48360..8b40a306922ee 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h @@ -21,9 +21,19 @@ class DomainMap { virtual ~DomainMap() = default; bool areExactMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT); } + bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } + return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE); + } + const ComputeAtMap& getComputeAtMap() const { return ca_map_; } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 5ef502321b773..6a0a932481157 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -59,9 +59,42 @@ class DomainMap : public pointwise_utils::DomainMap { } int getPosMappedTo(TensorView* tv, 
IterDomain* id) const { + // Find the root id mapped to `id` + const auto& root_dom = tv->getRootDomain(); + IterDomain* mapped_id = nullptr; + for (auto i : c10::irange(root_dom.size())) { + if (arePermissiveMapped(root_dom[i], id)) { + mapped_id = root_dom[i]; + break; + } + } + TORCH_INTERNAL_ASSERT( + mapped_id != nullptr, + "Can not find ID mapped to ", + id, + " in tensor ", + tv); + // Project the root id to leaf id + while (!mapped_id->uses().empty()) { + TORCH_INTERNAL_ASSERT(mapped_id->uses().size() == 1); + auto expr = mapped_id->uses()[0]; + if (expr->isA()) { + mapped_id = expr->as()->inner(); + } else { + auto merge = expr->as(); + TORCH_INTERNAL_ASSERT( + mapped_id == merge->inner(), + "Can not find ID mapped to ", + id, + " in tensor ", + tv); + mapped_id = merge->out(); + } + } + // Find the position of the leaf id const auto& dom = tv->domain()->domain(); for (auto i : c10::irange(dom.size())) { - if (areExactMapped(id, tv->axis(i))) { + if (dom[i] == mapped_id) { return i; } } @@ -240,22 +273,35 @@ void maybeBuildVirtualInnerDims( // both virtual innermost dim. // 2. The satisfied one did not merge in anything. For example, // T0[I0{1024*1024}, I1{2}] + // If this is the case, this means that we need to split the large + // inner-most dimension to satisfy the small innermost dimension int64_t large_dim; int64_t split_factor; + bool split_inner_most; if (merged_size1 < params.tile_size1) { if (params.dims_merged_with_2.empty()) { // case 2 - return; + split_inner_most = true; + large_dim = inner_most2; + split_factor = params.tile_size2; + } else { + // case 1 + split_inner_most = false; + large_dim = params.dims_merged_with_2.back(); + split_factor = ceilDiv(params.tile_size1, merged_size1); } - large_dim = params.dims_merged_with_2.back(); - split_factor = ceilDiv(params.tile_size1, merged_size1); } else { if (params.dims_merged_with_1.empty()) { // case 2 - return; + split_inner_most = true; + large_dim = inner_most1; + split_factor = params.tile_size1; + } else { + // case 1 + split_inner_most = false; + large_dim = params.dims_merged_with_1.back(); + split_factor = ceilDiv(params.tile_size2, merged_size2); } - large_dim = params.dims_merged_with_1.back(); - split_factor = ceilDiv(params.tile_size2, merged_size2); } params.split_before_tiling.push_back({large_dim, split_factor}); // adjust all dims to after-split @@ -271,12 +317,16 @@ void maybeBuildVirtualInnerDims( } // Give the split-out dim to the unsatisfied one, so that both are satisfied. 
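For the first case above, the split factor is chosen as just enough for the unsatisfied group to reach its tile size once the split-out piece is merged in. A small standalone sketch of that arithmetic, with made-up sizes for illustration (the real code reads them from the reference tensor's shape):

    #include <cassert>
    #include <cstdint>

    int64_t ceilDiv(int64_t a, int64_t b) {
      return (a + b - 1) / b;
    }

    // Case 1 above: the unsatisfied group has only merged_size elements so far,
    // so a dimension borrowed from the other group is split by just enough to
    // close the gap to tile_size.
    int64_t virtualInnerSplitFactor(int64_t merged_size, int64_t tile_size) {
      return ceilDiv(tile_size, merged_size);
    }

    int main() {
      // A group whose innermost dims multiply up to only 4, with a 32-wide tile:
      const int64_t factor =
          virtualInnerSplitFactor(/*merged_size=*/4, /*tile_size=*/32);
      assert(factor == 8);
      // Merging the split-out extent-8 piece with the existing extent-4 piece
      // yields a virtual innermost extent of 32, satisfying the tile.
      assert(factor * 4 == 32);
      return 0;
    }

The branch that follows applies this bookkeeping to whichever of the two groups came up short.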
if (merged_size1 < params.tile_size1) { - params.dims_merged_with_2.pop_back(); - params.dims_merged_with_2.push_back(large_dim + 1); + if (!split_inner_most) { + params.dims_merged_with_2.pop_back(); + params.dims_merged_with_2.push_back(large_dim + 1); + } params.dims_merged_with_1.push_back(large_dim); } else { - params.dims_merged_with_1.pop_back(); - params.dims_merged_with_1.push_back(large_dim + 1); + if (!split_inner_most) { + params.dims_merged_with_1.pop_back(); + params.dims_merged_with_1.push_back(large_dim + 1); + } params.dims_merged_with_2.push_back(large_dim); } } diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp index b9d8e9d294782..d5823c22683c0 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp @@ -932,6 +932,37 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) { testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); } +// x->sin->transpose->cos->y +TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) { + std::array, 2> shapes{ + std::vector{1024 * 1024 * 128, 2}, + std::vector{2, 1024 * 1024 * 128}}; + for (const auto& shape : shapes) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + auto tv1 = sin(tv0); + auto tv2 = transpose(tv1, 0, 1); + auto tv3 = cos(tv2); + fusion.addOutput(tv3); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(shape, options); + + auto lparams = scheduleTranspose(&fusion, {input}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input}, lparams); + auto outputs = fe.runFusion({input}, lparams); + + auto tv_ref = input.sin().transpose(0, 1).cos(); + + testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); + } +} + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) From 5a423a01d7181f883e2c49fb5da6be9674da9f52 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 16:54:03 -0700 Subject: [PATCH 09/32] remove obselete comment --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 6a0a932481157..0db554ebb9849 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -419,12 +419,6 @@ std::shared_ptr getTransposeHeuristics( if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) { params->tile_size1 = 8; params->tile_size2 = 8; - // TODO: I was trying the following but I got silent wrong result - // params->tile_size1 = 8; - // params->tile_size2 = 4; - // This should not happen, because the correctness should be irrevalent to - // schedulers. We don't have to use tile size (8, 4), but we need to fix our - // bug in codegen. 
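The threshold guarding this block, n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize, has the same "one full wave" form that the later patches use to skip virtual inner dims and to reject small problems in canScheduleRunTime. A small worked computation of that threshold; the SM count here is only an example, the scheduler queries the real value from the device properties:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t device_multiprocessor_count = 108; // example only
      const int64_t tile_size1 = 32;
      const int64_t tile_size2 = 32;

      // One wave is one 32x32 tile resident per SM.
      const int64_t wave_elements =
          device_multiprocessor_count * tile_size1 * tile_size2;
      std::cout << "one full wave covers " << wave_elements << " elements\n";

      // Problems below this size are the ones treated as "small" above.
      const int64_t n_elems = 64 * 1024;
      std::cout << (n_elems < wave_elements ? "less than one wave\n"
                                            : "at least one wave\n");
      return 0;
    }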
} // Expand inner-most dims to virtual inner-most dims so that the inner-most From 5f266b3ec1e7bdefcede6e7b2597460aaa5fcd76 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 30 Aug 2022 21:54:33 -0700 Subject: [PATCH 10/32] skip innermost split --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index b197924d5d368..9ab09922ac7ec 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -280,10 +280,14 @@ void maybeBuildVirtualInnerDims( bool split_inner_most; if (merged_size1 < params.tile_size1) { if (params.dims_merged_with_2.empty()) { +#if 0 // case 2 split_inner_most = true; large_dim = inner_most2; split_factor = params.tile_size2; +#else + return; +#endif } else { // case 1 split_inner_most = false; @@ -292,10 +296,14 @@ void maybeBuildVirtualInnerDims( } } else { if (params.dims_merged_with_1.empty()) { +#if 0 // case 2 split_inner_most = true; large_dim = inner_most1; split_factor = params.tile_size1; +#else + return; +#endif } else { // case 1 split_inner_most = false; From bd93302ab7bc62adfd40bc4938df7f0dd5520ff0 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 30 Aug 2022 21:55:16 -0700 Subject: [PATCH 11/32] comment --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 9ab09922ac7ec..b43810ec83a73 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -286,6 +286,7 @@ void maybeBuildVirtualInnerDims( large_dim = inner_most2; split_factor = params.tile_size2; #else + // disabled due to indexing error return; #endif } else { @@ -302,6 +303,7 @@ void maybeBuildVirtualInnerDims( large_dim = inner_most1; split_factor = params.tile_size1; #else + // disabled due to indexing error return; #endif } else { From 5933f5381988e84ce0d65131006fe23843c4aa5a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 30 Aug 2022 22:01:43 -0700 Subject: [PATCH 12/32] cleanup --- torch/csrc/jit/codegen/cuda/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp index 56e4266a26e97..a79c4d2db83ad 100644 --- a/torch/csrc/jit/codegen/cuda/utils.cpp +++ b/torch/csrc/jit/codegen/cuda/utils.cpp @@ -191,7 +191,7 @@ auto parseEnableOptions() { token, "'\nAvailable options:\n", "\tcomplex, kernel_profile, linear_decomposition,", - "conv_decomposition, transpose_scheduler"); + "conv_decomposition"); } options_view = (end_pos != c10::string_view::npos) ? 
options_view.substr(end_pos + 1) From 7c366b753f2b07e011dfe1a87226b90808d636a4 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:03:39 -0700 Subject: [PATCH 13/32] save --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index b43810ec83a73..abdab77fb2c43 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -536,13 +536,16 @@ std::shared_ptr getTransposeHeuristics( std::cerr << "\n===== Transpose Stats ========\n" << "inputs: " << ir_utils::toString(fusion->inputs()) << "\n" << "outputs: " << ir_utils::toString(fusion->outputs()) << "\n" + << "shape: " << shape_in_ref1 << "\n" << "num_elems: " << n_elems << "\n" << "n_input_tensors: " << n_input_tensors << "\n" << "max_input_dtype_size: " << max_input_dtype_size << "\n" << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0]) << "\n" + << "reference1: " << reference1 << "\n" << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1]) - << std::endl; + << "\n" + << "reference2: " << reference2 << std::endl; if (!params->split_before_tiling.empty() || !params->dims_merged_with_1.empty() || !params->dims_merged_with_2.empty()) { From 2c4f646ad5a97dd6cf65c13282eca48efd08b2c7 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:08:17 -0700 Subject: [PATCH 14/32] save --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index abdab77fb2c43..6ee1546abf27e 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -543,9 +543,11 @@ std::shared_ptr getTransposeHeuristics( << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0]) << "\n" << "reference1: " << reference1 << "\n" + << "inner_most_id1: " << inner_most_id1 << "\n" << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1]) << "\n" - << "reference2: " << reference2 << std::endl; + << "reference2: " << reference2 << "\n" + << "inner_most_id2: " << inner_most_id2 << std::endl; if (!params->split_before_tiling.empty() || !params->dims_merged_with_1.empty() || !params->dims_merged_with_2.empty()) { From 6d5a79fe466b6c69e926c801656e93e8cb394c4b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:43:29 -0700 Subject: [PATCH 15/32] save --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index d672b6dc965bd..099f6d6d38c11 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -65,7 +65,7 @@ class TransposeParams : public HeuristicParams { std::stringstream ss; ss << "\n===== Transpose Parameters ========\n" << (tag == "" ? 
"" : "Tag: ") << tag << " Transpose Characteristics:\n" - << " Gridx: " << lparams.gdimx() << " BlckX: " << lparams.bdimx() + << " BlckX: " << lparams.bdimx() << "\n"; ss << " input tile size: " << tile_size1 << "\n"; ss << " output tile size: " << tile_size2 << "\n"; From f9a2d88ba2659a420fa57ccabf113a111aa6f2e9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:47:12 -0700 Subject: [PATCH 16/32] no small tile --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 6ee1546abf27e..f1e87b2553a20 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -425,12 +425,6 @@ std::shared_ptr getTransposeHeuristics( auto params = std::make_shared("Transpose heuristics"); - // If the problem size is small use small tile sizes. - if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) { - params->tile_size1 = 8; - params->tile_size2 = 8; - } - // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); @@ -957,6 +951,9 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) { InlinePropagator inline_propagator( reference1, -1, ComputeAtMode::MostInlined); entire_dag.traverse(&inline_propagator); + + fusion->printMath(); + fusion->print(); } } // namespace cuda From ade4d229f8134eaae6c18db0b71046ca7ab298da Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:47:54 -0700 Subject: [PATCH 17/32] cleanup --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index f1e87b2553a20..de64b7679100a 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -24,8 +24,6 @@ namespace cuda { namespace { -constexpr int64_t kMaxTileSize = 32; - // DomainMap uses the ComputeAtMap to find a reference TensorView // that maps to all iterDomains in the fusion. 
class DomainMap : public pointwise_utils::DomainMap { From ba9b01bb5132238ef987830a2911eb4f49657b2a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:03:31 -0700 Subject: [PATCH 18/32] tune tile size --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 3 --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index de64b7679100a..d3d787d92ab77 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -949,9 +949,6 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) { InlinePropagator inline_propagator( reference1, -1, ComputeAtMode::MostInlined); entire_dag.traverse(&inline_propagator); - - fusion->printMath(); - fusion->print(); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index 099f6d6d38c11..aa86a9754b055 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -37,7 +37,7 @@ class TransposeParams : public HeuristicParams { // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729 // Tile size for the inner most dim of tensors in the first group - size_t tile_size1 = 32; + size_t tile_size1 = 16; // Tile size for the inner most dim of tensors in the second group size_t tile_size2 = 32; From cddae0bd57c0374b589e495607ae4b8fc6fad395 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:04:44 -0700 Subject: [PATCH 19/32] undo tune --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index aa86a9754b055..099f6d6d38c11 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -37,7 +37,7 @@ class TransposeParams : public HeuristicParams { // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729 // Tile size for the inner most dim of tensors in the first group - size_t tile_size1 = 16; + size_t tile_size1 = 32; // Tile size for the inner most dim of tensors in the second group size_t tile_size2 = 32; From 3ce9784b433e73e8bfc8878c5c89ab977e1bb569 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:23:13 -0700 Subject: [PATCH 20/32] no virt inner if low occupancy --- .../jit/codegen/cuda/scheduler/transpose.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index d3d787d92ab77..915d43a92ecfa 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -421,6 +421,11 @@ std::shared_ptr getTransposeHeuristics( shape_in_ref1.push_back(size); } + // maximum unroll factor limited by occupancy + auto max_unroll_factor_occupancy = ceilDiv( + n_elems, + device_multiprocessor_count * params->tile_size1 * params->tile_size2); + auto params = std::make_shared("Transpose heuristics"); // Expand inner-most dims to virtual inner-most dims so that the inner-most @@ -434,8 +439,15 @@ std::shared_ptr 
getTransposeHeuristics( domain_map.getPosMappedTo(reference1, inner_most_id2); // See note [Supporting small transpose dimensions] - maybeBuildVirtualInnerDims( - *params, shape_in_ref1, inner_most_pos1_in_ref1, inner_most_pos2_in_ref1); + if (max_unroll_factor_occupancy > 1) { + // if creating virtual inner dims could not help us get better occupancy, + // then don't do it + maybeBuildVirtualInnerDims( + *params, + shape_in_ref1, + inner_most_pos1_in_ref1, + inner_most_pos2_in_ref1); + } // Note [vectorization and unroll of input and output] // @@ -487,9 +499,6 @@ std::shared_ptr getTransposeHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU - auto max_unroll_factor_occupancy = ceilDiv( - n_elems, - device_multiprocessor_count * params->tile_size1 * params->tile_size2); max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_occupancy); // Don't unroll at the cost of getting a full warp, useful for the case where From 5fb2b58716f5420c602c46a87530dcfd0810a2a6 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:23:34 -0700 Subject: [PATCH 21/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 915d43a92ecfa..82dd98c970334 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -421,13 +421,13 @@ std::shared_ptr getTransposeHeuristics( shape_in_ref1.push_back(size); } + auto params = std::make_shared("Transpose heuristics"); + // maximum unroll factor limited by occupancy auto max_unroll_factor_occupancy = ceilDiv( n_elems, device_multiprocessor_count * params->tile_size1 * params->tile_size2); - auto params = std::make_shared("Transpose heuristics"); - // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); From 41483a9a868b2f2bedc171254e8a16ca88e5658d Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:34:18 -0700 Subject: [PATCH 22/32] if one full wave can handle all elements, don't create virtual inner dims --- .../jit/codegen/cuda/scheduler/transpose.cpp | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 82dd98c970334..a215a39753209 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -209,12 +209,26 @@ class DomainMap : public pointwise_utils::DomainMap { // T0[I0*I1o*I5*I6{1024*1024/4*8}, I1i*I2*I3*I4{32}] void maybeBuildVirtualInnerDims( TransposeParams& params, + int64_t device_multiprocessor_count, + int64_t n_elems, const std::vector& shape_in_ref1, int64_t inner_most1, int64_t inner_most2) { int64_t merged_size1 = shape_in_ref1[inner_most1]; int64_t merged_size2 = shape_in_ref1[inner_most2]; + int64_t actual_tile_size1 = + std::min(merged_size1, params.tile_size1); + int64_t actual_tile_size2 = + std::min(merged_size2, params.tile_size2); + int64_t wave_elements = + device_multiprocessor_count * actual_tile_size1 * actual_tile_size2; + + if (wave_elements >= n_elems) { + // if one full wave can handle all elements, don't create virtual inner dims + return; + } + // merge inner_most1 and inner_most2 left 
until we are done or we can no // longer do so int64_t dim = inner_most1 - 1; @@ -423,11 +437,6 @@ std::shared_ptr getTransposeHeuristics( auto params = std::make_shared("Transpose heuristics"); - // maximum unroll factor limited by occupancy - auto max_unroll_factor_occupancy = ceilDiv( - n_elems, - device_multiprocessor_count * params->tile_size1 * params->tile_size2); - // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); @@ -439,15 +448,13 @@ std::shared_ptr getTransposeHeuristics( domain_map.getPosMappedTo(reference1, inner_most_id2); // See note [Supporting small transpose dimensions] - if (max_unroll_factor_occupancy > 1) { - // if creating virtual inner dims could not help us get better occupancy, - // then don't do it - maybeBuildVirtualInnerDims( - *params, - shape_in_ref1, - inner_most_pos1_in_ref1, - inner_most_pos2_in_ref1); - } + maybeBuildVirtualInnerDims( + *params, + device_multiprocessor_count, + n_elems, + shape_in_ref1, + inner_most_pos1_in_ref1, + inner_most_pos2_in_ref1); // Note [vectorization and unroll of input and output] // @@ -499,6 +506,9 @@ std::shared_ptr getTransposeHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU + auto max_unroll_factor_occupancy = ceilDiv( + n_elems, + device_multiprocessor_count * params->tile_size1 * params->tile_size2); max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_occupancy); // Don't unroll at the cost of getting a full warp, useful for the case where From 21a789625eb8fe7062858592549b948c75998171 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 6 Sep 2022 19:09:55 -0700 Subject: [PATCH 23/32] reject < 1 wave on runtime --- .../jit/codegen/cuda/scheduler/registry.cpp | 12 +++ .../jit/codegen/cuda/scheduler/transpose.cpp | 94 +++++++++++++++---- .../jit/codegen/cuda/scheduler/transpose.h | 5 + 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 570307d7d49e1..34c6dc9ef87cb 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1267,6 +1267,18 @@ class TransposeScheduler : public SchedulerEntry { Fusion* fusion, SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr) { + FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); + + const int64_t device_multiprocessor_count = + (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + auto n_elems = getShape(fusion, data_cache, runtime_info).second; + if (device_multiprocessor_count * 32 * 32 > n_elems) { + // don't schedule with transpose scheduler if less than a full wave + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, + "Transpose scheduler does not perform well on small problem sizes."); + return false; + } return true; } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index c05a2d86dabf5..b71da876e7092 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -370,35 +370,38 @@ std::shared_ptr getTransposeHeuristics( return getTransposeHeuristics(fusion, runtime_info, data_cache); } -std::shared_ptr getTransposeHeuristics( - Fusion* fusion, - SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* 
data_cache) { - FUSER_PERF_SCOPE("getTransposeHeuristics"); - - FusionGuard fg(fusion); - - // Incase any buffer is of type DataType::Index - DataType index_type = indexModeToDtype(runtime_info.getIndexMode()); - +HeuristicSummaryEntry getDomainMap( + HeuristicSummary* data_cache, + Fusion* fusion) { auto domain_map_entry = HeuristicSummaryEntry( data_cache, [fusion]() { return std::make_unique(fusion); }); - const auto& domain_map = dynamic_cast(domain_map_entry.get()); + return domain_map_entry; +} +HeuristicSummaryEntry +getInputsOutputsGroups(HeuristicSummary* data_cache, DomainMap& domain_map) { auto grouped_inputs_outputs_entry = HeuristicSummaryEntry( data_cache, [&domain_map]() { return std::make_unique>>( domain_map.groupInputsOutputsByInnerDim()); }); - auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); + auto& grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); TORCH_INTERNAL_ASSERT( grouped_inputs_outputs.size() >= 2, "Can not find mismatched inner most dim, should use pointwise scheduler."); + return grouped_inputs_outputs_entry; +} + +HeuristicSummaryEntry +getReferenceTensors( + HeuristicSummary* data_cache, + DomainMap& domain_map, + std::vector>& grouped_inputs_outputs) { auto reference_tensors_entry = HeuristicSummaryEntry( data_cache, [&domain_map, &grouped_inputs_outputs]() { @@ -415,13 +418,17 @@ std::shared_ptr getTransposeHeuristics( reference1 != nullptr, "Unable to find reference tensor for group 1"); TORCH_INTERNAL_ASSERT( reference2 != nullptr, "Unable to find reference tensor for group 2"); + return reference_tensors_entry; +} - const int64_t device_multiprocessor_count = - (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - - auto ref_root = reference1->getMaybeRFactorDomain(); - std::vector shape_in_ref1; - shape_in_ref1.reserve(reference1->nDims()); +std::pair, int64_t> getShapeInReference( + HeuristicSummary* data_cache, + SchedulerRuntimeInfo& runtime_info, + TensorView* reference, + DomainMap& domain_map) { + auto ref_root = reference->getMaybeRFactorDomain(); + std::vector shape_in_ref; + shape_in_ref.reserve(reference->nDims()); int64_t n_elems = 1; for (size_t ref_i = 0; ref_i < ref_root.size(); ref_i++) { auto id = ref_root[ref_i]; @@ -435,8 +442,55 @@ std::shared_ptr getTransposeHeuristics( ref_root[ref_i]->extent()->toInlineString()); int64_t size = inferred_val->as(); n_elems *= size; - shape_in_ref1.push_back(size); + shape_in_ref.push_back(size); } + return {shape_in_ref, n_elems}; +} + +std::pair, int64_t> getShape( + Fusion* fusion, + HeuristicSummary* data_cache, + SchedulerRuntimeInfo& runtime_info) { + auto domain_map_entry = getDomainMap(data_cache, fusion); + auto& domain_map = dynamic_cast(domain_map_entry.get()); + auto grouped_inputs_outputs_entry = + getInputsOutputsGroups(data_cache, domain_map); + auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); + auto reference_tensors_entry = + getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs); + auto reference_tensors = reference_tensors_entry.get(); + TensorView* reference1 = reference_tensors[0]; + return getShapeInReference(data_cache, runtime_info, reference1, domain_map); +} + +std::shared_ptr getTransposeHeuristics( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache) { + FUSER_PERF_SCOPE("getTransposeHeuristics"); + + FusionGuard fg(fusion); + + // Incase any buffer is of type DataType::Index + DataType index_type = indexModeToDtype(runtime_info.getIndexMode()); + + auto 
domain_map_entry = getDomainMap(data_cache, fusion); + auto& domain_map = dynamic_cast(domain_map_entry.get()); + auto grouped_inputs_outputs_entry = + getInputsOutputsGroups(data_cache, domain_map); + auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); + auto reference_tensors_entry = + getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs); + auto reference_tensors = reference_tensors_entry.get(); + TensorView* reference1 = reference_tensors[0]; + TensorView* reference2 = reference_tensors[1]; + auto pair = + getShapeInReference(data_cache, runtime_info, reference1, domain_map); + auto& shape_in_ref1 = pair.first; + auto& n_elems = pair.second; + + const int64_t device_multiprocessor_count = + (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; auto params = std::make_shared("Transpose heuristics"); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h index 0cf6920ea058b..ad104d44e9835 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h @@ -100,6 +100,11 @@ TORCH_CUDA_CU_API LaunchParams scheduleTranspose( //! groups, each with a fully broadcasted reference tensor. TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion); +TORCH_CUDA_CU_API std::pair, int64_t> getShape( + Fusion* fusion, + HeuristicSummary* data_cache, + SchedulerRuntimeInfo& runtime_info); + } // namespace cuda } // namespace fuser } // namespace jit From f8551d76d0706d095159a0e9d642e3d0651164b9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 8 Sep 2022 15:19:20 -0700 Subject: [PATCH 24/32] cache inner most positions --- .../cuda/scheduler/compile_time_info.h | 13 +++- .../jit/codegen/cuda/scheduler/registry.cpp | 5 +- .../jit/codegen/cuda/scheduler/transpose.cpp | 68 +++++++++++++------ 3 files changed, 62 insertions(+), 24 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h index c43ef64eac0a3..86d906c4747a7 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h @@ -33,7 +33,8 @@ enum class CompileTimeEntryType { REDUCTION_TVS, PERSISTENT_BUFFER_INFO, SCOPE_PERSISTENT_FACTOR_INFO, - BROADCAST_BYTE_MULTIPLES + BROADCAST_BYTE_MULTIPLES, + INNER_MOST_DIMS_INFO, }; //! Entry type definition class for `DOMAIN_MAP`, @@ -99,6 +100,16 @@ class PersistentBufferInfo { CompileTimeEntryType::PERSISTENT_BUFFER_INFO; }; +//! Entry type definition class for `INNER_MOST_DIMS_INFO`, +//! Used in the transpose scheduler to store inner most IterDomains and their +//! position in reference1 of group 1 and group 2 +class InnerMostDimInfo { + public: + using DataType = std::vector; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::INNER_MOST_DIMS_INFO; +}; + //! Auxiliary data types for `SCOPE_PERSISTENT_FACTOR_INFO` entry type. 
using ScopedPersistenceBufferMap = std::unordered_map>; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 34c6dc9ef87cb..90b859f16ae42 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1271,7 +1271,9 @@ class TransposeScheduler : public SchedulerEntry { const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto n_elems = getShape(fusion, data_cache, runtime_info).second; + auto pair = getShape(fusion, data_cache, runtime_info); + auto shape = pair.first; + auto n_elems = pair.second; if (device_multiprocessor_count * 32 * 32 > n_elems) { // don't schedule with transpose scheduler if less than a full wave scheduler_debug_utils::canScheduleRejectReason( @@ -1572,6 +1574,7 @@ template class HeuristicSummaryEntry< template class HeuristicSummaryEntry< HeuristicCompileTime::ScopePersistentFactorInfo>; template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry; } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index b71da876e7092..1d07ea1858197 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -358,18 +358,6 @@ void maybeBuildVirtualInnerDims( } // namespace -bool hasAtLeastTwoValidGroups(Fusion* fusion) { - return DomainMap::hasAtLeastTwoValidGroups(fusion); -} - -std::shared_ptr getTransposeHeuristics( - Fusion* fusion, - const at::ArrayRef& runtime_inputs, - HeuristicSummary* data_cache) { - SchedulerRuntimeInfo runtime_info(fusion, runtime_inputs, true); - return getTransposeHeuristics(fusion, runtime_info, data_cache); -} - HeuristicSummaryEntry getDomainMap( HeuristicSummary* data_cache, Fusion* fusion) { @@ -447,6 +435,29 @@ std::pair, int64_t> getShapeInReference( return {shape_in_ref, n_elems}; } +HeuristicSummaryEntry +getInnerMostDimInfoInReference( + HeuristicSummary* data_cache, + const std::vector& group_references, + TensorView* global_reference, + DomainMap& domain_map) { + auto innermost_info_entry = + HeuristicSummaryEntry( + data_cache, [&]() { + std::vector data; + data.reserve(group_references.size()); + for (auto ref_tv : group_references) { + auto inner_most_id = scheduler_utils::innerMostRootDim(ref_tv); + auto inner_most_pos_in_global_ref = + domain_map.getInnerLeafDim(global_reference, inner_most_id); + data.emplace_back(inner_most_pos_in_global_ref); + } + return std::make_unique>(std::move(data)); + ; + }); + return innermost_info_entry; +} + std::pair, int64_t> getShape( Fusion* fusion, HeuristicSummary* data_cache, @@ -463,6 +474,18 @@ std::pair, int64_t> getShape( return getShapeInReference(data_cache, runtime_info, reference1, domain_map); } +bool hasAtLeastTwoValidGroups(Fusion* fusion) { + return DomainMap::hasAtLeastTwoValidGroups(fusion); +} + +std::shared_ptr getTransposeHeuristics( + Fusion* fusion, + const at::ArrayRef& runtime_inputs, + HeuristicSummary* data_cache) { + SchedulerRuntimeInfo runtime_info(fusion, runtime_inputs, true); + return getTransposeHeuristics(fusion, runtime_info, data_cache); +} + std::shared_ptr getTransposeHeuristics( Fusion* fusion, SchedulerRuntimeInfo& runtime_info, @@ -492,18 +515,17 @@ std::shared_ptr getTransposeHeuristics( const int64_t device_multiprocessor_count = 
(int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); + + auto inner_most_pos1_in_ref1 = innermost_info[0]; + auto inner_most_pos2_in_ref1 = innermost_info[1]; + auto params = std::make_shared("Transpose heuristics"); // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements - auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); - auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2); - - auto inner_most_pos1_in_ref1 = - domain_map.getInnerLeafDim(reference1, inner_most_id1); - auto inner_most_pos2_in_ref1 = - domain_map.getInnerLeafDim(reference1, inner_most_id2); - // See note [Supporting small transpose dimensions] maybeBuildVirtualInnerDims( *params, @@ -611,11 +633,13 @@ std::shared_ptr getTransposeHeuristics( << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0]) << "\n" << "reference1: " << reference1 << "\n" - << "inner_most_id1: " << inner_most_id1 << "\n" + << "inner_most_id1 position: " << inner_most_pos1_in_ref1 + << " (in reference 1)\n" << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1]) << "\n" << "reference2: " << reference2 << "\n" - << "inner_most_id2: " << inner_most_id2 << std::endl; + << "inner_most_id2 position: " << inner_most_pos2_in_ref1 + << " (in reference 1)" << std::endl; if (!params->split_before_tiling.empty() || !params->dims_merged_with_1.empty() || !params->dims_merged_with_2.empty()) { From 53690a0f2ffa7494a15ee37fa1ad42f008d6b5e9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 8 Sep 2022 16:28:04 -0700 Subject: [PATCH 25/32] issue id --- .../jit/codegen/cuda/scheduler/registry.cpp | 12 +-- .../jit/codegen/cuda/scheduler/transpose.cpp | 78 +++++++++++++++++-- .../jit/codegen/cuda/scheduler/transpose.h | 6 +- .../cuda/scheduler/transpose_heuristic.h | 8 +- 4 files changed, 86 insertions(+), 18 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 90b859f16ae42..a43fed67103e8 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1269,16 +1269,10 @@ class TransposeScheduler : public SchedulerEntry { HeuristicSummary* data_cache = nullptr) { FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); - const int64_t device_multiprocessor_count = - (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto pair = getShape(fusion, data_cache, runtime_info); - auto shape = pair.first; - auto n_elems = pair.second; - if (device_multiprocessor_count * 32 * 32 > n_elems) { - // don't schedule with transpose scheduler if less than a full wave + auto reason = getRuntimeRejectReason(fusion, data_cache, runtime_info); + if (!reason.empty()) { scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, - "Transpose scheduler does not perform well on small problem sizes."); + ScheduleHeuristic::Transpose, reason); return false; } return true; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 1d07ea1858197..03c7f55742c13 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -293,7 +293,8 @@ void maybeBuildVirtualInnerDims( bool split_inner_most; 
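  // For intuition (hypothetical shapes, for illustration only; not taken from
  // this patch): with the default 32x32 tile and a reference shape of
  // [1024, 8, 4] whose inner-most dim for one group is the trailing 4,
  // merged_size starts below the tile size, so the neighboring dim of 8 is
  // merged in to form a virtual inner-most dim of 8 * 4 = 32 that fills a
  // full tile. When merging alone cannot reach the tile size, the
  // SUPPORT_SPLITTING_INNERMOST_DIM path below would instead split a large
  // dim to supply the missing elements; that path is compiled out for now
  // because of the indexing issue referenced in the hunk
  // (https://github.com/csarofeen/pytorch/issues/1964).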
if (merged_size1 < params.tile_size1) { if (params.dims_merged_with_2.empty()) { -#if 0 +#if SUPPORT_SPLITTING_INNERMOST_DIM + // https://github.com/csarofeen/pytorch/issues/1964 // case 2 split_inner_most = true; large_dim = inner_most2; @@ -311,7 +312,8 @@ void maybeBuildVirtualInnerDims( } } else { if (params.dims_merged_with_1.empty()) { -#if 0 +#if SUPPORT_SPLITTING_INNERMOST_DIM + // https://github.com/csarofeen/pytorch/issues/1964 // case 2 split_inner_most = true; large_dim = inner_most1; @@ -356,8 +358,6 @@ void maybeBuildVirtualInnerDims( } } -} // namespace - HeuristicSummaryEntry getDomainMap( HeuristicSummary* data_cache, Fusion* fusion) { @@ -458,7 +458,9 @@ getInnerMostDimInfoInReference( return innermost_info_entry; } -std::pair, int64_t> getShape( +} // namespace + +std::string getRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info) { @@ -471,7 +473,71 @@ std::pair, int64_t> getShape( getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs); auto reference_tensors = reference_tensors_entry.get(); TensorView* reference1 = reference_tensors[0]; - return getShapeInReference(data_cache, runtime_info, reference1, domain_map); + + auto pair = + getShapeInReference(data_cache, runtime_info, reference1, domain_map); + auto& shape_in_ref1 = pair.first; + auto& n_elems = pair.second; + + constexpr size_t default_tile_elements = + TransposeParams::getDefaultTileSize() * + TransposeParams::getDefaultTileSize(); + + // don't schedule with transpose scheduler if less than a full wave + const int64_t device_multiprocessor_count = + (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + auto elements_per_wave = device_multiprocessor_count * default_tile_elements; + if (elements_per_wave > n_elems) { + return "Transpose scheduler does not perform well on small problem sizes."; + } + + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); + + auto inner_most_pos1_in_ref1 = innermost_info[0]; + auto inner_most_pos2_in_ref1 = innermost_info[1]; + + auto inner_size1 = shape_in_ref1[inner_most_pos1_in_ref1]; + auto inner_size2 = shape_in_ref1[inner_most_pos2_in_ref1]; + + // For cases like + // transpose(T0[1000000000, 2, 2], 1, 2) + // the pointwise scheduler should provide better performance, because it + // provides coalesced memory access + if (inner_size1 * inner_size2 < default_tile_elements) { + auto inner_elements = inner_size1 * inner_size2; + for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1; + i++) { + inner_elements *= shape_in_ref1[i]; + } + // note that the algorithm here is only an approximation because it only + // checks reference1. In principle, we need to check all inputs and outputs + // to get an accurate result, but that is too much work. I think checking + // only reference 1 is fine for now. 
Below is an example where the + // approximation here will not work: + // T0[10000000, 2, 3] (reference 1) + // T1[2, 10000000, 3] input/output + // T2[2, 10000000, 3] input/output + // T3[2, 10000000, 3] input/output + // T4[3, 10000000, 2] input/output + // T5[3, 10000000, 2] input/output + if (inner_elements < default_tile_elements) { + return "Inner transpose of small dimensions should be scheduled by the " + "pointwise scheduler because it provides better memory coalescing"; + } + } + +#if !SUPPORT_SPLITTING_INNERMOST_DIM + if (n_elems / inner_size1 < TransposeParams::getDefaultTileSize() || + n_elems / inner_size2 < TransposeParams::getDefaultTileSize()) { + return "Splitting of inner most dim for the creation of virtual inner most dim " + "is disabled due to indexing bug, skipping this case at runtime for now" + "See: https://github.com/csarofeen/pytorch/issues/1964"; + } +#endif + + return ""; } bool hasAtLeastTwoValidGroups(Fusion* fusion) { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h index ad104d44e9835..abf141846555d 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h @@ -5,6 +5,8 @@ #include #include +#define SUPPORT_SPLITTING_INNERMOST_DIM 0 + namespace torch { namespace jit { namespace fuser { @@ -100,7 +102,9 @@ TORCH_CUDA_CU_API LaunchParams scheduleTranspose( //! groups, each with a fully broadcasted reference tensor. TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion); -TORCH_CUDA_CU_API std::pair, int64_t> getShape( +// If can schedule at runtime, returns empty string, otherwise returns the +// reason why we should not schedule at runtime. +TORCH_CUDA_CU_API std::string getRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index 099f6d6d38c11..07be8dfb03a3d 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -21,6 +21,10 @@ class TransposeParams : public HeuristicParams { return 128; } + static constexpr size_t getDefaultTileSize() { + return 32; + } + // See note [Supporting small transpose dimensions], all dims are positions in // reference1 std::vector> split_before_tiling = {}; @@ -37,10 +41,10 @@ class TransposeParams : public HeuristicParams { // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729 // Tile size for the inner most dim of tensors in the first group - size_t tile_size1 = 32; + size_t tile_size1 = getDefaultTileSize(); // Tile size for the inner most dim of tensors in the second group - size_t tile_size2 = 32; + size_t tile_size2 = getDefaultTileSize(); using HeuristicParams::HeuristicParams; From 44f781019130a3cc1bc9503c465bec63400d076f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 8 Sep 2022 16:52:31 -0700 Subject: [PATCH 26/32] lint --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index 07be8dfb03a3d..5e56278a7f16b 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -69,8 +69,7 @@ class TransposeParams : public 
HeuristicParams { std::stringstream ss; ss << "\n===== Transpose Parameters ========\n" << (tag == "" ? "" : "Tag: ") << tag << " Transpose Characteristics:\n" - << " BlckX: " << lparams.bdimx() - << "\n"; + << " BlckX: " << lparams.bdimx() << "\n"; ss << " input tile size: " << tile_size1 << "\n"; ss << " output tile size: " << tile_size2 << "\n"; int elements_per_tile = tile_size1 * tile_size2; From ea0d1fffb099abd4f33c81f0d720e4b63c1316c3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 14:28:11 -0700 Subject: [PATCH 27/32] maybe_tune --- .../jit/codegen/cuda/scheduler/pointwise.cpp | 20 ++++++++++--------- .../jit/codegen/cuda/scheduler/pointwise.h | 2 ++ .../jit/codegen/cuda/scheduler/transpose.cpp | 20 +++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp index d404ab622a5c7..e298fda6893d4 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp @@ -26,7 +26,6 @@ namespace cuda { namespace { // constexpr int64_t x_grid_limit = ((int64_t)1 << (int64_t)31) - (int64_t)1; // Unused at the moment, commenting for clang tidy -constexpr int64_t kThreadX = 128; class DomainMap : public pointwise_utils::DomainMap { public: @@ -174,11 +173,12 @@ std::shared_ptr getPointwiseHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU - if (n_elems < device_multiprocessor_count * kThreadX && + if (n_elems < device_multiprocessor_count * kPointwiseSchedulerThreadX && max_unroll_factor > 1) { max_unroll_factor = std::min( max_unroll_factor, - ceilDiv(n_elems, device_multiprocessor_count * kThreadX)); + ceilDiv( + n_elems, device_multiprocessor_count * kPointwiseSchedulerThreadX)); } auto params = std::make_shared("Pointwise heuristics"); @@ -213,7 +213,7 @@ std::shared_ptr getPointwiseHeuristics( // right) int64_t right_elem_count = 0; - int64_t bdimx = kThreadX; + int64_t bdimx = kPointwiseSchedulerThreadX; // bdimy may be used if the right side of the break point is not large and we // need to expand block level parallelism into the left side of the break @@ -262,7 +262,8 @@ std::shared_ptr getPointwiseHeuristics( } // If there isn't very much parallelism available, just use 1D scheduler - if (n_elems * 2 > device_multiprocessor_count * kThreadX) { + if (n_elems * 2 > + device_multiprocessor_count * kPointwiseSchedulerThreadX) { int64_t min_total_transfer = std::numeric_limits::max(); for (const auto break_point_i : c10::irange(ref_root.size())) { @@ -324,13 +325,14 @@ std::shared_ptr getPointwiseHeuristics( } // Min transfer found, start setting values bdimx = std::min( - ceilDiv(cur_right_elem_count, max_unroll_factor), kThreadX); + ceilDiv(cur_right_elem_count, max_unroll_factor), + kPointwiseSchedulerThreadX); bdimy = 1; gdim_right = 1; // Put remainder in bdimy if there's at least a wave of grid level // parallelism. 
if (cur_left_elem_count > device_multiprocessor_count) { - bdimy = kThreadX / bdimx; + bdimy = kPointwiseSchedulerThreadX / bdimx; } auto remainder_left = ceilDiv(cur_left_elem_count, bdimy); auto remainder_right = @@ -644,7 +646,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { // Unswitch reference_tv->split(0, 1); // Threads - reference_tv->split(0, kThreadX); + reference_tv->split(0, kPointwiseSchedulerThreadX); reference_tv->axis(0)->parallelize(ParallelType::BIDx); reference_tv->axis(1)->parallelize(ParallelType::TIDx); @@ -658,7 +660,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { //[BIDx, Unswitch, Vectorization, TIDx] } else { // Threads - reference_tv->split(0, kThreadX); + reference_tv->split(0, kPointwiseSchedulerThreadX); // Unroll reference_tv->split(0, params.unroll_factor); // Unswitch diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h index 6cba29cd6b4b9..aee470c3ba12f 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h @@ -10,6 +10,8 @@ namespace jit { namespace fuser { namespace cuda { +constexpr int64_t kPointwiseSchedulerThreadX = 128; + class SchedulerRuntimeInfo; class HeuristicSummary; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 03c7f55742c13..963f37465166c 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -479,21 +479,21 @@ std::string getRuntimeRejectReason( auto& shape_in_ref1 = pair.first; auto& n_elems = pair.second; - constexpr size_t default_tile_elements = - TransposeParams::getDefaultTileSize() * - TransposeParams::getDefaultTileSize(); + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); // don't schedule with transpose scheduler if less than a full wave const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto elements_per_wave = device_multiprocessor_count * default_tile_elements; - if (elements_per_wave > n_elems) { + auto pointwise_elements_per_wave = + device_multiprocessor_count * kPointwiseSchedulerThreadX; + if (pointwise_elements_per_wave > n_elems) { return "Transpose scheduler does not perform well on small problem sizes."; } - auto innermost_info_entry = getInnerMostDimInfoInReference( - data_cache, reference_tensors, reference1, domain_map); - auto innermost_info = innermost_info_entry.get(); + auto max_tile_elements = TransposeParams::getDefaultTileSize() * + TransposeParams::getDefaultTileSize(); auto inner_most_pos1_in_ref1 = innermost_info[0]; auto inner_most_pos2_in_ref1 = innermost_info[1]; @@ -505,7 +505,7 @@ std::string getRuntimeRejectReason( // transpose(T0[1000000000, 2, 2], 1, 2) // the pointwise scheduler should provide better performance, because it // provides coalesced memory access - if (inner_size1 * inner_size2 < default_tile_elements) { + if (inner_size1 * inner_size2 < max_tile_elements) { auto inner_elements = inner_size1 * inner_size2; for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1; i++) { @@ -522,7 +522,7 @@ std::string getRuntimeRejectReason( // T3[2, 10000000, 3] input/output // T4[3, 10000000, 2] input/output // T5[3, 10000000, 2] input/output - if (inner_elements < 
default_tile_elements) { + if (inner_elements < max_tile_elements) { return "Inner transpose of small dimensions should be scheduled by the " "pointwise scheduler because it provides better memory coalescing"; } From 5920c62adaf125edaa8922dab56a3852579dbec7 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 14:28:27 -0700 Subject: [PATCH 28/32] Revert "maybe_tune" This reverts commit ea0d1fffb099abd4f33c81f0d720e4b63c1316c3. --- .../jit/codegen/cuda/scheduler/pointwise.cpp | 20 +++++++++---------- .../jit/codegen/cuda/scheduler/pointwise.h | 2 -- .../jit/codegen/cuda/scheduler/transpose.cpp | 20 +++++++++---------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp index e298fda6893d4..d404ab622a5c7 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp @@ -26,6 +26,7 @@ namespace cuda { namespace { // constexpr int64_t x_grid_limit = ((int64_t)1 << (int64_t)31) - (int64_t)1; // Unused at the moment, commenting for clang tidy +constexpr int64_t kThreadX = 128; class DomainMap : public pointwise_utils::DomainMap { public: @@ -173,12 +174,11 @@ std::shared_ptr getPointwiseHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU - if (n_elems < device_multiprocessor_count * kPointwiseSchedulerThreadX && + if (n_elems < device_multiprocessor_count * kThreadX && max_unroll_factor > 1) { max_unroll_factor = std::min( max_unroll_factor, - ceilDiv( - n_elems, device_multiprocessor_count * kPointwiseSchedulerThreadX)); + ceilDiv(n_elems, device_multiprocessor_count * kThreadX)); } auto params = std::make_shared("Pointwise heuristics"); @@ -213,7 +213,7 @@ std::shared_ptr getPointwiseHeuristics( // right) int64_t right_elem_count = 0; - int64_t bdimx = kPointwiseSchedulerThreadX; + int64_t bdimx = kThreadX; // bdimy may be used if the right side of the break point is not large and we // need to expand block level parallelism into the left side of the break @@ -262,8 +262,7 @@ std::shared_ptr getPointwiseHeuristics( } // If there isn't very much parallelism available, just use 1D scheduler - if (n_elems * 2 > - device_multiprocessor_count * kPointwiseSchedulerThreadX) { + if (n_elems * 2 > device_multiprocessor_count * kThreadX) { int64_t min_total_transfer = std::numeric_limits::max(); for (const auto break_point_i : c10::irange(ref_root.size())) { @@ -325,14 +324,13 @@ std::shared_ptr getPointwiseHeuristics( } // Min transfer found, start setting values bdimx = std::min( - ceilDiv(cur_right_elem_count, max_unroll_factor), - kPointwiseSchedulerThreadX); + ceilDiv(cur_right_elem_count, max_unroll_factor), kThreadX); bdimy = 1; gdim_right = 1; // Put remainder in bdimy if there's at least a wave of grid level // parallelism. 
if (cur_left_elem_count > device_multiprocessor_count) { - bdimy = kPointwiseSchedulerThreadX / bdimx; + bdimy = kThreadX / bdimx; } auto remainder_left = ceilDiv(cur_left_elem_count, bdimy); auto remainder_right = @@ -646,7 +644,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { // Unswitch reference_tv->split(0, 1); // Threads - reference_tv->split(0, kPointwiseSchedulerThreadX); + reference_tv->split(0, kThreadX); reference_tv->axis(0)->parallelize(ParallelType::BIDx); reference_tv->axis(1)->parallelize(ParallelType::TIDx); @@ -660,7 +658,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { //[BIDx, Unswitch, Vectorization, TIDx] } else { // Threads - reference_tv->split(0, kPointwiseSchedulerThreadX); + reference_tv->split(0, kThreadX); // Unroll reference_tv->split(0, params.unroll_factor); // Unswitch diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h index aee470c3ba12f..6cba29cd6b4b9 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h @@ -10,8 +10,6 @@ namespace jit { namespace fuser { namespace cuda { -constexpr int64_t kPointwiseSchedulerThreadX = 128; - class SchedulerRuntimeInfo; class HeuristicSummary; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 963f37465166c..03c7f55742c13 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -479,21 +479,21 @@ std::string getRuntimeRejectReason( auto& shape_in_ref1 = pair.first; auto& n_elems = pair.second; - auto innermost_info_entry = getInnerMostDimInfoInReference( - data_cache, reference_tensors, reference1, domain_map); - auto innermost_info = innermost_info_entry.get(); + constexpr size_t default_tile_elements = + TransposeParams::getDefaultTileSize() * + TransposeParams::getDefaultTileSize(); // don't schedule with transpose scheduler if less than a full wave const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto pointwise_elements_per_wave = - device_multiprocessor_count * kPointwiseSchedulerThreadX; - if (pointwise_elements_per_wave > n_elems) { + auto elements_per_wave = device_multiprocessor_count * default_tile_elements; + if (elements_per_wave > n_elems) { return "Transpose scheduler does not perform well on small problem sizes."; } - auto max_tile_elements = TransposeParams::getDefaultTileSize() * - TransposeParams::getDefaultTileSize(); + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); auto inner_most_pos1_in_ref1 = innermost_info[0]; auto inner_most_pos2_in_ref1 = innermost_info[1]; @@ -505,7 +505,7 @@ std::string getRuntimeRejectReason( // transpose(T0[1000000000, 2, 2], 1, 2) // the pointwise scheduler should provide better performance, because it // provides coalesced memory access - if (inner_size1 * inner_size2 < max_tile_elements) { + if (inner_size1 * inner_size2 < default_tile_elements) { auto inner_elements = inner_size1 * inner_size2; for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1; i++) { @@ -522,7 +522,7 @@ std::string getRuntimeRejectReason( // T3[2, 10000000, 3] input/output // T4[3, 10000000, 2] input/output // T5[3, 10000000, 2] input/output - if (inner_elements < 
max_tile_elements) { + if (inner_elements < default_tile_elements) { return "Inner transpose of small dimensions should be scheduled by the " "pointwise scheduler because it provides better memory coalescing"; } From 1427438d9b3eb328e55cea81c6ec2117e856b85b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 15:19:42 -0700 Subject: [PATCH 29/32] reject at pointwise scheduler --- .../cuda/scheduler/compile_time_info.h | 10 ++ .../jit/codegen/cuda/scheduler/registry.cpp | 170 ++++++++++-------- .../jit/codegen/cuda/scheduler/transpose.cpp | 2 +- .../jit/codegen/cuda/scheduler/transpose.h | 2 +- 4 files changed, 105 insertions(+), 79 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h index 86d906c4747a7..2e509cfb8a106 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h @@ -35,6 +35,7 @@ enum class CompileTimeEntryType { SCOPE_PERSISTENT_FACTOR_INFO, BROADCAST_BYTE_MULTIPLES, INNER_MOST_DIMS_INFO, + CAN_SCHEDULE_TRANSPOSE, }; //! Entry type definition class for `DOMAIN_MAP`, @@ -137,6 +138,15 @@ class BroadcastMultiples { CompileTimeEntryType::BROADCAST_BYTE_MULTIPLES; }; +//! Entry type definition class for `CAN_SCHEDULE_TRANSPOSE`, +//! stores if the transpose scheduler can scheduler this fusion +class CanScheduleTranspose { + public: + using DataType = bool; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::CAN_SCHEDULE_TRANSPOSE; +}; + //! Base abstract class for unified storage in `HeuristicSummary`, //! each entry in `HeuristicSummary` will be a subclass. class CompileTimeInfoBase : public PolymorphicBase { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index a43fed67103e8..17a1fc540507e 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -937,6 +937,84 @@ class ReductionScheduler : public SchedulerEntry { } }; +class TransposeScheduler : public SchedulerEntry { + public: + explicit TransposeScheduler( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache = nullptr) + : SchedulerEntry(ScheduleHeuristic::Transpose) { + computeHeuristics(fusion, runtime_info, data_cache); + } + + static bool canScheduleCompileTime(Fusion* fusion) { + // Temporarily disallow view in transpose scheduler + // TODO Add more testing before enabling + auto view_tvs = scheduler_utils::getViewTVs(fusion); + if (view_tvs.size() > 0) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, "No support for view op"); + return false; + } + + if (!hasAtLeastTwoValidGroups(fusion)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, + "cannot find two mismatching inner most dimensions"); + return false; + } + + // TODO: add support for trivial reduction + auto reduction_ops = + ir_utils::getReductionOps(fusion, false /* ignore_trivial */); + + if (!reduction_ops.empty()) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, "no support for reduction ops"); + return false; + } + + if (hasNonUniqueBcast(fusion)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, + "Broadcasting dimension might be broadcasting to multiple sizes."); + return false; + } + + return true; + } + + static bool canScheduleRunTime( + Fusion* fusion, 
+ SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache = nullptr) { + FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); + + auto reason = + getTransposeRuntimeRejectReason(fusion, data_cache, runtime_info); + if (!reason.empty()) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, reason); + return false; + } + return true; + } + + void schedule(Fusion* fusion) override { + FUSER_PERF_SCOPE("Schedule Transpose Fusion"); + scheduleTranspose(fusion, transposeParams()); + } + + private: + void computeHeuristics( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache = nullptr) { + params_ = getTransposeHeuristics(fusion, runtime_info, data_cache); + TORCH_INTERNAL_ASSERT(params_ != nullptr); + } +}; + class PointWiseScheduler : public SchedulerEntry { public: explicit PointWiseScheduler( @@ -980,6 +1058,19 @@ class PointWiseScheduler : public SchedulerEntry { Fusion* fusion, SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr) { + auto can_schedule_transpose_entry = + HeuristicSummaryEntry( + data_cache, [fusion]() { + return std::make_unique( + TransposeScheduler::canScheduleCompileTime(fusion)); + }); + if (can_schedule_transpose_entry.get()) { + // TODO: data cache + auto reason = + getTransposeRuntimeRejectReason(fusion, nullptr, runtime_info); + return !reason.empty(); + } + return true; } @@ -1216,83 +1307,6 @@ class PersistentKernelScheduler : public SchedulerEntry { } }; -class TransposeScheduler : public SchedulerEntry { - public: - explicit TransposeScheduler( - Fusion* fusion, - SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) - : SchedulerEntry(ScheduleHeuristic::Transpose) { - computeHeuristics(fusion, runtime_info, data_cache); - } - - static bool canScheduleCompileTime(Fusion* fusion) { - // Temporarily disallow view in transpose scheduler - // TODO Add more testing before enabling - auto view_tvs = scheduler_utils::getViewTVs(fusion); - if (view_tvs.size() > 0) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, "No support for view op"); - return false; - } - - if (!hasAtLeastTwoValidGroups(fusion)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, - "cannot find two mismatching inner most dimensions"); - return false; - } - - // TODO: add support for trivial reduction - auto reduction_ops = - ir_utils::getReductionOps(fusion, false /* ignore_trivial */); - - if (!reduction_ops.empty()) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, "no support for reduction ops"); - return false; - } - - if (hasNonUniqueBcast(fusion)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, - "Broadcasting dimension might be broadcasting to multiple sizes."); - return false; - } - - return true; - } - - static bool canScheduleRunTime( - Fusion* fusion, - SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) { - FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); - - auto reason = getRuntimeRejectReason(fusion, data_cache, runtime_info); - if (!reason.empty()) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, reason); - return false; - } - return true; - } - - void schedule(Fusion* fusion) override { - FUSER_PERF_SCOPE("Schedule Transpose Fusion"); - scheduleTranspose(fusion, transposeParams()); - } - - private: - void computeHeuristics( - Fusion* fusion, - 
SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) { - params_ = getTransposeHeuristics(fusion, runtime_info, data_cache); - TORCH_INTERNAL_ASSERT(params_ != nullptr); - } -}; - // Schedule Table const std::vector& all_heuristics() { static const std::vector hlist = { @@ -1569,6 +1583,8 @@ template class HeuristicSummaryEntry< HeuristicCompileTime::ScopePersistentFactorInfo>; template class HeuristicSummaryEntry; template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry< + HeuristicCompileTime::CanScheduleTranspose>; } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 03c7f55742c13..8480b78e792d7 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -460,7 +460,7 @@ getInnerMostDimInfoInReference( } // namespace -std::string getRuntimeRejectReason( +std::string getTransposeRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info) { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h index abf141846555d..c1a4ab6efb6ae 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h @@ -104,7 +104,7 @@ TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion); // If can schedule at runtime, returns empty string, otherwise returns the // reason why we should not schedule at runtime. -TORCH_CUDA_CU_API std::string getRuntimeRejectReason( +TORCH_CUDA_CU_API std::string getTransposeRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info); From 27a4db83aec1cb9a97631fff3f1f655e7a617841 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 16:03:56 -0700 Subject: [PATCH 30/32] fix --- .../jit/codegen/cuda/scheduler/registry.cpp | 27 ++++++++++++++----- .../jit/codegen/cuda/scheduler/transpose.cpp | 9 +++---- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 17a1fc540507e..40efd8353baf8 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1065,9 +1065,8 @@ class PointWiseScheduler : public SchedulerEntry { TransposeScheduler::canScheduleCompileTime(fusion)); }); if (can_schedule_transpose_entry.get()) { - // TODO: data cache auto reason = - getTransposeRuntimeRejectReason(fusion, nullptr, runtime_info); + getTransposeRuntimeRejectReason(fusion, data_cache, runtime_info); return !reason.empty(); } @@ -1499,6 +1498,25 @@ void HeuristicSummary::validate() const { entry_type_map_.count(EntryType::VECTORIZABLE_INPUTS_AND_OUTPUTS)); TORCH_INTERNAL_ASSERT( entry_type_map_.count(EntryType::BROADCAST_BYTE_MULTIPLES)); + TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::CAN_SCHEDULE_TRANSPOSE)); + auto can_schedule_transpose = + entry_type_map_.at(EntryType::CAN_SCHEDULE_TRANSPOSE) + ->as< + CompileTimeInfo>() + ->get(); + if (!can_schedule_transpose) { + break; + } + } + case ScheduleHeuristic::Transpose: { + TORCH_INTERNAL_ASSERT(entry_type_map_.count(EntryType::DOMAIN_MAP)); + TORCH_INTERNAL_ASSERT(entry_type_map_.count( + EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS)); + TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::REFERENCE_TENSORS)); + 
TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::INNER_MOST_DIMS_INFO)); break; } case ScheduleHeuristic::Reduction: { @@ -1528,11 +1546,6 @@ void HeuristicSummary::validate() const { entry_type_map_.count(EntryType::SCOPE_PERSISTENT_FACTOR_INFO)); break; } - case ScheduleHeuristic::Transpose: { - TORCH_INTERNAL_ASSERT(entry_type_map_.count( - EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS)); - break; - } default: TORCH_INTERNAL_ASSERT(false, "unknown heuristic"); } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 8480b78e792d7..bc330217b8596 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -453,7 +453,6 @@ getInnerMostDimInfoInReference( data.emplace_back(inner_most_pos_in_global_ref); } return std::make_unique>(std::move(data)); - ; }); return innermost_info_entry; } @@ -479,6 +478,10 @@ std::string getTransposeRuntimeRejectReason( auto& shape_in_ref1 = pair.first; auto& n_elems = pair.second; + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); + constexpr size_t default_tile_elements = TransposeParams::getDefaultTileSize() * TransposeParams::getDefaultTileSize(); @@ -491,10 +494,6 @@ std::string getTransposeRuntimeRejectReason( return "Transpose scheduler does not perform well on small problem sizes."; } - auto innermost_info_entry = getInnerMostDimInfoInReference( - data_cache, reference_tensors, reference1, domain_map); - auto innermost_info = innermost_info_entry.get(); - auto inner_most_pos1_in_ref1 = innermost_info[0]; auto inner_most_pos2_in_ref1 = innermost_info[1]; From 0c173fa5aeefe90abc519bf7d518727b1c904fee Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 16:36:41 -0700 Subject: [PATCH 31/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/registry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 40efd8353baf8..c4a04fd060077 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1505,7 +1505,7 @@ void HeuristicSummary::validate() const { ->as< CompileTimeInfo>() ->get(); - if (!can_schedule_transpose) { + if (!*can_schedule_transpose) { break; } } From 911956eabf28d32e07a2e96af76dfac294146736 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sun, 11 Sep 2022 02:15:25 -0700 Subject: [PATCH 32/32] fix fusion input reductions --- .../cuda/scheduler/compile_time_info.h | 21 +++++++++++++++++++ .../jit/codegen/cuda/scheduler/registry.cpp | 8 +++++-- .../jit/codegen/cuda/scheduler/transpose.cpp | 14 +++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h index 2e509cfb8a106..262f1f84d259a 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h @@ -26,7 +26,9 @@ namespace HeuristicCompileTime { //! Enum for all possible types of cached entries of compile-time info. 
enum class CompileTimeEntryType { DOMAIN_MAP, + TRANSPOSE_DOMAIN_MAP, REFERENCE_TENSORS, + REFERENCE_TENSORS_FOR_GROUPS, VECTORIZABLE_INPUTS_AND_OUTPUTS, INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS, UNROLLABLE_INPUTS_AND_OUTPUTS, @@ -47,6 +49,15 @@ class DomainMap { CompileTimeEntryType::DOMAIN_MAP; }; +//! Entry type definition class for `DOMAIN_MAP`, +//! stores the domain map of a fusion, used by transpose scheduler. +class TransposeDomainMap { + public: + using DataType = pointwise_utils::DomainMap; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::TRANSPOSE_DOMAIN_MAP; +}; + //! Entry type definition class for `REFERENCE_TENSORS`, //! stores the the reference TensorViews used to schedule a fusion. class ReferenceTensors { @@ -56,6 +67,16 @@ class ReferenceTensors { CompileTimeEntryType::REFERENCE_TENSORS; }; +//! Entry type definition class for `REFERENCE_TENSORS`, +//! stores the the reference TensorViews used to schedule a fusion, used by +//! transpose scheduler. +class ReferenceTensorsForGroups { + public: + using DataType = std::vector; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::REFERENCE_TENSORS_FOR_GROUPS; +}; + //! Entry type definition class for `VECTORIZABLE_INPUTS_AND_OUTPUTS`, //! stores the vectorizable TensorViews on a fusion's inputs and outputs. class VectorizableInputsAndOutputs { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index c4a04fd060077..0ba2c1b8afa30 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1510,11 +1510,12 @@ void HeuristicSummary::validate() const { } } case ScheduleHeuristic::Transpose: { - TORCH_INTERNAL_ASSERT(entry_type_map_.count(EntryType::DOMAIN_MAP)); + TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::TRANSPOSE_DOMAIN_MAP)); TORCH_INTERNAL_ASSERT(entry_type_map_.count( EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS)); TORCH_INTERNAL_ASSERT( - entry_type_map_.count(EntryType::REFERENCE_TENSORS)); + entry_type_map_.count(EntryType::REFERENCE_TENSORS_FOR_GROUPS)); TORCH_INTERNAL_ASSERT( entry_type_map_.count(EntryType::INNER_MOST_DIMS_INFO)); break; @@ -1582,7 +1583,10 @@ HeuristicSummaryEntry::HeuristicSummaryEntry( // Template instantiation for pre-defined cache entries template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry; template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry< + HeuristicCompileTime::ReferenceTensorsForGroups>; template class HeuristicSummaryEntry< HeuristicCompileTime::VectorizableInputsAndOutputs>; template class HeuristicSummaryEntry< diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index bc330217b8596..5d1c533224dcd 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -160,6 +160,12 @@ class DomainMap : public pointwise_utils::DomainMap { // Then we still want to T1 and T2 to be grouped together. 
auto group = scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false); + if (group.empty()) { + // In case that the inner most dim of tv is not found (for example, tv + // is a fusion input with only reductions), we just return a null + // result which will tell the scheduler to reject the fusion + return {}; + } for (auto member_tv : group) { if (grouped.count(member_tv) == 0) { grouped.emplace(member_tv); @@ -358,11 +364,11 @@ void maybeBuildVirtualInnerDims( } } -HeuristicSummaryEntry getDomainMap( +HeuristicSummaryEntry getDomainMap( HeuristicSummary* data_cache, Fusion* fusion) { auto domain_map_entry = - HeuristicSummaryEntry( + HeuristicSummaryEntry( data_cache, [fusion]() { return std::make_unique(fusion); }); return domain_map_entry; @@ -385,13 +391,13 @@ getInputsOutputsGroups(HeuristicSummary* data_cache, DomainMap& domain_map) { return grouped_inputs_outputs_entry; } -HeuristicSummaryEntry +HeuristicSummaryEntry getReferenceTensors( HeuristicSummary* data_cache, DomainMap& domain_map, std::vector>& grouped_inputs_outputs) { auto reference_tensors_entry = - HeuristicSummaryEntry( + HeuristicSummaryEntry( data_cache, [&domain_map, &grouped_inputs_outputs]() { std::vector data{ domain_map.findReferenceFor(grouped_inputs_outputs[0]),