Misc fixes/tuning for transpose scheduler (#1912)
zasdfgbnm authored Aug 23, 2022
1 parent 20cf109 commit 3c3c89e
Showing 9 changed files with 82 additions and 39 deletions.
17 changes: 15 additions & 2 deletions benchmarks/cpp/nvfuser/utils.cpp
@@ -6,7 +6,7 @@

using namespace torch::jit::fuser::cuda;

-std::string toString(ReductionParams rparams) {
+std::string toString(const ReductionParams& rparams) {
std::stringstream ss;
ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
<< (rparams.persistent_kernel ? "Persistent Kernel // " : "")
@@ -65,7 +65,7 @@ std::string toString(ReductionParams rparams) {
return ss.str();
}

-std::string toString(PointwiseParams params) {
+std::string toString(const PointwiseParams& params) {
std::stringstream ss;
if (params.break_point) {
ss << "2D Schedule at " << params.break_point << "/";
@@ -89,6 +89,15 @@ std::string toString(PointwiseParams params) {
return ss.str();
}

+std::string toString(const TransposeParams& params) {
+  std::stringstream ss;
+  ss << "Tile size: (" << params.tile_size1 << "," << params.tile_size2
+     << ")/";
+  ss << "Vectorize size: (" << params.vectorize_factor1 << ","
+     << params.vectorize_factor2 << ")";
+  return ss.str();
+}
+
std::string toString(const std::shared_ptr<HeuristicParams>& params) {
auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
if (rparams) {
@@ -98,6 +107,10 @@ std::string toString(const std::shared_ptr<HeuristicParams>& params) {
if (pparams) {
return toString(*pparams);
}
+  auto tparams = std::dynamic_pointer_cast<TransposeParams>(params);
+  if (tparams) {
+    return toString(*tparams);
+  }
  TORCH_INTERNAL_ASSERT(
      false,
      "Unknown heuristic parameter type. Did you just add a new heuristic parameter type but forget to update it here?");
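The new TransposeParams overload joins the existing dynamic_pointer_cast dispatch over HeuristicParams. Below is a minimal standalone sketch of that pattern (editor's illustration, not part of the commit; the struct fields and default values are stand-ins, not nvfuser's actual definitions):

#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>

struct HeuristicParams { virtual ~HeuristicParams() = default; };
struct TransposeParams : HeuristicParams {
  size_t tile_size1 = 32, tile_size2 = 32; // assumed defaults
  size_t vectorize_factor1 = 4, vectorize_factor2 = 2;
};

std::string toString(const TransposeParams& p) {
  std::stringstream ss;
  ss << "Tile size: (" << p.tile_size1 << "," << p.tile_size2 << ")/"
     << "Vectorize size: (" << p.vectorize_factor1 << ","
     << p.vectorize_factor2 << ")";
  return ss.str();
}

std::string toString(const std::shared_ptr<HeuristicParams>& params) {
  // Try each concrete type in turn; fall through to an error if none match.
  if (auto tparams = std::dynamic_pointer_cast<TransposeParams>(params)) {
    return toString(*tparams);
  }
  // ... ReductionParams / PointwiseParams branches elided ...
  throw std::runtime_error("Unknown heuristic parameter type.");
}

int main() {
  std::shared_ptr<HeuristicParams> p = std::make_shared<TransposeParams>();
  std::cout << toString(p) << std::endl;
  // prints: Tile size: (32,32)/Vectorize size: (4,2)
}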
5 changes: 3 additions & 2 deletions benchmarks/cpp/nvfuser/utils.h
@@ -36,8 +36,9 @@ TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype = DataType::Float);

-std::string toString(ReductionParams rparams);
-std::string toString(PointwiseParams params);
+std::string toString(const ReductionParams& rparams);
+std::string toString(const PointwiseParams& params);
+std::string toString(const TransposeParams& params);
std::string toString(const std::shared_ptr<HeuristicParams>& params);
std::string toString(LaunchParams lparams);

8 changes: 6 additions & 2 deletions test/test_jit_cuda_fuser.py
@@ -41,8 +41,12 @@
if RUN_NVFUSER and torch.version.cuda is not None:
CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2])

-os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition'
-os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma'
+if 'PYTORCH_NVFUSER_ENABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_ENABLE'] = ""
+os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition,' + os.environ['PYTORCH_NVFUSER_ENABLE']
+if 'PYTORCH_NVFUSER_DISABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_DISABLE'] = ""
+os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,' + os.environ['PYTORCH_NVFUSER_DISABLE']
os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0'
# TODO: enable complex when we fix the extremal cases in OpInfo
# see issue https://github.com/csarofeen/pytorch/issues/1730
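The test harness now prepends its required options instead of overwriting whatever the user exported. A standalone sketch of the same merge rule (editor's illustration in C++ with POSIX setenv/getenv; "my_option" is a hypothetical user-set value):

#include <cstdlib>
#include <iostream>
#include <string>

// Prepend defaults to a comma-separated env var, preserving the user's value.
void prependOptions(const char* var, const std::string& defaults) {
  const char* existing = std::getenv(var);
  std::string merged = defaults + "," + (existing ? existing : "");
  setenv(var, merged.c_str(), /*overwrite=*/1);
}

int main() {
  setenv("PYTORCH_NVFUSER_DISABLE", "my_option", 1); // simulate a user export
  prependOptions("PYTORCH_NVFUSER_DISABLE", "fallback,fma");
  std::cout << std::getenv("PYTORCH_NVFUSER_DISABLE") << std::endl;
  // prints: fallback,fma,my_option
}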
6 changes: 4 additions & 2 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -980,14 +980,16 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
const auto& input_tensor = input.toTensor();
std::cout << " " << input_tensor.scalar_type() << " "
<< input.toTensor().sizes()
<< " (strides = " << input.toTensor().strides() << ")"
<< " (strides = " << input.toTensor().strides()
<< ", address = " << input.toTensor().data_ptr() << ")"
<< std::endl;
}
}
std::cout << "Outputs:" << std::endl;
for (const auto& output : allocated_outputs) {
std::cout << " " << output.scalar_type() << " " << output.sizes()
<< " (strides = " << output.strides() << ")" << std::endl;
<< " (strides = " << output.strides()
<< ", address = " << output.data_ptr() << ")" << std::endl;
}
std::cout << "Reduction and semaphore buffers:" << std::endl;
TORCH_INTERNAL_ASSERT(
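The executor's debug dump now includes each tensor's base address next to its strides, which helps spot aliasing between inputs and outputs. A small sketch of the same print format (editor's illustration; requires linking against libtorch — data_ptr, sizes, strides, and scalar_type are real ATen accessors):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor t = at::randn({2, 3});
  // Same format as the runFusion debug output above, e.g.
  //   float [2, 3] (strides = [3, 1], address = 0x7f...)
  std::cout << "  " << t.scalar_type() << " " << t.sizes()
            << " (strides = " << t.strides()
            << ", address = " << t.data_ptr() << ")" << std::endl;
}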
12 changes: 7 additions & 5 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -1256,15 +1256,18 @@ class TransposeScheduler : public SchedulerEntry {
}

static bool canScheduleCompileTime(Fusion* fusion) {
-    // Not enabling this yet. Needs more validation.
-    return false;
-#if 0
+    if (!isOptionEnabled(EnableOption::TransposeScheduler)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose, "not enabled");
+      return false;
+    }

    // Temporarily disallow view in transpose scheduler
+    // TODO Add more testing before enabling
auto view_tvs = scheduler_utils::getViewTVs(fusion);
if (view_tvs.size() > 0) {
scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Reduction, "No support for view op");
+          ScheduleHeuristic::Transpose, "No support for view op");
return false;
}

@@ -1293,7 +1296,6 @@
}

return true;
-#endif
}

static bool canScheduleRunTime(
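With the #if 0 block gone, the scheduler is compiled in but gated behind an opt-in flag. A hedged usage sketch (editor's illustration; assumes the options are read from the environment once, so the variable must be set before the first fusion is compiled):

#include <cstdlib>

int main() {
  // Opt in to the experimental scheduler; overwrite=0 keeps a user's value.
  setenv("PYTORCH_NVFUSER_ENABLE", "transpose_scheduler", /*overwrite=*/0);
  // ... build and compile fusions; canScheduleCompileTime will now pass the
  // isOptionEnabled(EnableOption::TransposeScheduler) gate.
  return 0;
}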
44 changes: 29 additions & 15 deletions torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -126,13 +126,15 @@ class DomainMap : public pointwise_utils::DomainMap {
auto group =
scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false);
for (auto member_tv : group) {
-        TORCH_INTERNAL_ASSERT(
-            grouped.count(member_tv) == 0 || member_tv == tv,
-            "The group of ",
-            member_tv->toString(),
-            " is ambiguous. This is likely a bug.");
-        grouped.emplace(member_tv);
-        groups.back().emplace_back(member_tv);
+        if (grouped.count(member_tv) == 0) {
+          grouped.emplace(member_tv);
+          groups.back().emplace_back(member_tv);
+        } else if (member_tv != tv) {
+          // Ambiguous grouping. This should only happen at `canSchedule`, so
+          // we just return a null result, which tells the scheduler to
+          // reject the fusion.
+          return {};
+        }
}
}
}
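Grouping failures are now reported by returning an empty result instead of asserting, so canSchedule can reject the fusion gracefully. A self-contained sketch of that control flow (editor's illustration; integer ids stand in for TensorViews):

#include <iostream>
#include <unordered_set>
#include <utility>
#include <vector>

using TensorId = int;

std::vector<std::vector<TensorId>> groupMembers(
    const std::vector<std::pair<TensorId, std::vector<TensorId>>>& candidates) {
  std::vector<std::vector<TensorId>> groups;
  std::unordered_set<TensorId> grouped;
  for (const auto& [tv, members] : candidates) {
    groups.emplace_back();
    for (TensorId member : members) {
      if (grouped.count(member) == 0) {
        grouped.insert(member);
        groups.back().push_back(member);
      } else if (member != tv) {
        return {}; // ambiguous grouping: signal the scheduler to reject
      }
    }
  }
  return groups;
}

int main() {
  // Tensor 2 would land in two different groups, so the result is empty.
  auto groups = groupMembers({{0, {0, 2}}, {1, {1, 2}}});
  std::cout << (groups.empty() ? "reject fusion" : "schedulable") << std::endl;
}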
@@ -229,15 +231,26 @@ void maybeBuildVirtualInnerDims(
(merged_size2 < params.tile_size2)) {
return; // no need to split
}
-  // If one of them are not satisfied, this usually means that the satisfied one
-  // just merged in a large dim. We split this large dim, so that now we have
-  // two available dims to satisfy both virtual innermost dim.
+  // If one of them is not satisfied, there are two cases:
+  // 1. The satisfied one just merged in a large dim. In this case, we split
+  //    that large dim, so that we have two available dims to satisfy both
+  //    virtual innermost dims.
+  // 2. The satisfied one did not merge in anything. For example,
+  //    T0[I0{1024*1024}, I1{2}]
int64_t large_dim;
int64_t split_factor;
if (merged_size1 < params.tile_size1) {
+    if (params.dims_merged_with_2.empty()) {
+      // case 2
+      return;
+    }
large_dim = params.dims_merged_with_2.back();
split_factor = ceilDiv(params.tile_size1, merged_size1);
} else {
+    if (params.dims_merged_with_1.empty()) {
+      // case 2
+      return;
+    }
large_dim = params.dims_merged_with_1.back();
split_factor = ceilDiv(params.tile_size2, merged_size2);
}
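A small numeric illustration of the two cases (editor's illustration, tile sizes assumed to be 32): for T0[I0{1024*1024}, I1{2}], merged_size2 = 2 < 32 but nothing was merged into dim 1, so there is no large dim to split and the heuristic returns early (case 2). When a large dim was merged in (case 1), the split factor is computed as below:

#include <cstdint>
#include <iostream>

int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

int main() {
  const int64_t tile_size2 = 32;
  const int64_t merged_size2 = 2; // virtual innermost dim 2 is too small
  // Case 1: dim 1 absorbed a large dim; peel off just enough of it so that
  // dim 2 can reach its tile size.
  int64_t split_factor = ceilDiv(tile_size2, merged_size2);
  std::cout << "split_factor = " << split_factor << std::endl; // 16
}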
@@ -435,7 +448,6 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
auto max_unroll_factor_block =
ceilDiv(params->tile_size1 * params->tile_size2, 32);
max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_block);
-  max_unroll_factor = scheduler_utils::lastPow2(max_unroll_factor);

// Compute maximum vectorize factor that can be used
size_t vectorize_factor1 = max_unroll_factor;
@@ -456,15 +468,17 @@
vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
}

-  params->vectorize_factor1 =
-      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1);
-  params->vectorize_factor2 =
-      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2);
+  params->vectorize_factor1 = scheduler_utils::lastPow2(
+      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1));
+  params->vectorize_factor2 = scheduler_utils::lastPow2(
+      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2));

params->lparams.bind(params->getThreadsPerBlock(), ParallelType::TIDx);

if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) {
std::cerr << "\n===== Transpose Stats ========\n"
<< "inputs: " << ir_utils::toString(fusion->inputs()) << "\n"
<< "outputs: " << ir_utils::toString(fusion->outputs()) << "\n"
<< "num_elems: " << n_elems << "\n"
<< "n_input_tensors: " << n_input_tensors << "\n"
<< "max_input_dtype_size: " << max_input_dtype_size << "\n"
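The lastPow2 rounding moved from the shared max_unroll_factor to each vectorize factor after clamping, so each factor ends up a power of two without pre-flooring the common budget. A sketch with assumed values (editor's illustration; this lastPow2 is a simplified stand-in for scheduler_utils::lastPow2):

#include <algorithm>
#include <cstdint>
#include <iostream>

int64_t lastPow2(int64_t n) {
  int64_t p = 1; // largest power of two <= n (assumes n >= 1)
  while (p * 2 <= n) {
    p *= 2;
  }
  return p;
}

int main() {
  int64_t max_unroll_factor = 6; // e.g. ceilDiv(tile1 * tile2, 32), clamped
  int64_t vectorize_factor1 = 8; // limited by dtype size and contiguity
  std::cout << lastPow2(std::min(max_unroll_factor, vectorize_factor1))
            << std::endl; // min(6,8) = 6 -> 4
}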
16 changes: 9 additions & 7 deletions torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
@@ -1390,20 +1390,22 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(

std::vector<TensorView*> vectorizable_tensors;

-  for (auto input_tv :
-       ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
-    if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
-      vectorizable_tensors.push_back(input_tv);
-    }
-  }

+  // We put outputs in front of inputs because this makes the transpose
+  // scheduler prefer an output over an input as the reference tensor.
for (auto output_tv :
ir_utils::filterByType<TensorView>(reference_tv->fusion()->outputs())) {
if (hasInnerDim(output_tv, vectorizable_dims, vectorize_pass)) {
vectorizable_tensors.push_back(output_tv);
}
}

+  for (auto input_tv :
+       ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
+    if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
+      vectorizable_tensors.push_back(input_tv);
+    }
+  }

return vectorizable_tensors;
}

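The reordering means the returned list now starts with vectorizable outputs, so a caller that picks the first entry as its reference tensor prefers an output. A minimal sketch of that guarantee (editor's illustration with stand-in types, not nvfuser's):

#include <iostream>
#include <string>
#include <vector>

struct Tv {
  std::string name;
  bool is_output;
};

std::vector<Tv> inputsOutputsWithInnerDim(const std::vector<Tv>& vectorizable) {
  std::vector<Tv> result;
  for (const auto& tv : vectorizable) {
    if (tv.is_output) result.push_back(tv); // outputs first ...
  }
  for (const auto& tv : vectorizable) {
    if (!tv.is_output) result.push_back(tv); // ... then inputs
  }
  return result;
}

int main() {
  auto r = inputsOutputsWithInnerDim({{"in0", false}, {"out0", true}});
  std::cout << "reference = " << r.front().name << std::endl; // out0
}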
10 changes: 7 additions & 3 deletions torch/csrc/jit/codegen/cuda/utils.cpp
@@ -169,7 +169,8 @@ auto parseEnableOptions() {
{EnableOption::Complex, false},
{EnableOption::KernelProfile, false},
{EnableOption::LinearDecomposition, false},
-      {EnableOption::ConvDecomposition, false}};
+      {EnableOption::ConvDecomposition, false},
+      {EnableOption::TransposeScheduler, false}};

if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
c10::string_view options_view(dump_options);
@@ -184,13 +185,16 @@
options_map[EnableOption::LinearDecomposition] = true;
} else if (token == "conv_decomposition") {
options_map[EnableOption::ConvDecomposition] = true;
} else if (token == "transpose_scheduler") {
options_map[EnableOption::TransposeScheduler] = true;
} else {
TORCH_CHECK(
false,
"Invalid disable option: '",
"Invalid enable option: '",
token,
"'\nAvailable options:\n",
"\tcomplex, kernel_profile");
"\tcomplex, kernel_profile, linear_decomposition,",
"conv_decomposition, transpose_scheduler");
}
options_view = (end_pos != c10::string_view::npos)
? options_view.substr(end_pos + 1)
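For reference, the enable list is a comma-separated env var. A standalone sketch of the tokenizing loop (editor's illustration using std::string and getline in place of c10::string_view, a simplification of the code above):

#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>

std::unordered_set<std::string> parseEnabledTokens() {
  std::unordered_set<std::string> enabled;
  if (const char* opts = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
    std::stringstream ss(opts);
    std::string token;
    while (std::getline(ss, token, ',')) {
      if (!token.empty()) {
        enabled.insert(token); // e.g. "transpose_scheduler"
      }
    }
  }
  return enabled;
}

int main() {
  setenv("PYTORCH_NVFUSER_ENABLE", "transpose_scheduler,complex", 1);
  for (const auto& t : parseEnabledTokens()) {
    std::cout << t << std::endl;
  }
}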
3 changes: 2 additions & 1 deletion torch/csrc/jit/codegen/cuda/utils.h
@@ -79,7 +79,8 @@ enum class EnableOption {
Complex, //! Enable complex support on python
KernelProfile, //! Enable intra-kernel performance profiling
LinearDecomposition, //! Enable linear-bias decomposition
-  ConvDecomposition //! Enable conv-bias decomposition
+  ConvDecomposition, //! Enable conv-bias decomposition
+  TransposeScheduler //! Enable the experimental transpose scheduler
};

TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
