Some cleanups on tests and heuristics params #1866

Merged 4 commits on Jul 25, 2022
Changes from all commits

benchmarks/cpp/nvfuser/bert.cpp (18 additions, 18 deletions)

@@ -133,8 +133,8 @@ static void MagicScheduler_DivMaxSoftDropFwd(
   std::vector<at::Tensor> cg_outputs;
 
   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);
 
   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -143,7 +143,7 @@ static void MagicScheduler_DivMaxSoftDropFwd(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion({t0, t1}, norm_params.value().lparams);
+    cg_outputs = fe.runFusion({t0, t1}, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -193,8 +193,8 @@ static void MagicScheduler_DivMaxSoftDropBwd(
   std::vector<at::Tensor> cg_outputs;
 
   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);
 
   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -203,7 +203,7 @@ static void MagicScheduler_DivMaxSoftDropBwd(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion({t0, t1, t2, t3}, norm_params.value().lparams);
+    cg_outputs = fe.runFusion({t0, t1, t2, t3}, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -308,8 +308,8 @@ static void MagicScheduler_BiasDropoutAddLayernormFwd(
   std::vector<at::Tensor> cg_outputs;
 
   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);
 
   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -319,7 +319,7 @@ static void MagicScheduler_BiasDropoutAddLayernormFwd(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -423,8 +423,8 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd1(
   std::vector<at::Tensor> cg_outputs;
 
   auto norm_params = getReductionHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  scheduleReduction(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  scheduleReduction(&fusion, *norm_params);
 
   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -434,7 +434,7 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd1(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     clearL2Cache();
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -534,8 +534,8 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd2(
   std::vector<at::Tensor> cg_outputs;
 
   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);
 
   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -545,7 +545,7 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd2(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -625,8 +625,8 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd3(
   std::vector<at::Tensor> cg_outputs;
 
   auto norm_params = getReductionHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  scheduleReduction(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  scheduleReduction(&fusion, *norm_params);
 
   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -636,7 +636,7 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd3(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
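
Across all six bert.cpp benchmarks the change is the same: `getPersistentHeuristics` / `getReductionHeuristics` now return a pointer-like handle (a `std::shared_ptr` to the params, judging by the scheduler changes later in this diff) instead of a `c10::optional`, so call sites check against `nullptr` and dereference with `*` / `->` rather than calling `.has_value()` / `.value()`. A minimal sketch of the new calling convention; `ExampleParams` and `getExampleHeuristics` are hypothetical stand-ins for the real scheduler types and entry points:

```cpp
#include <memory>
#include <stdexcept>

// Hypothetical stand-in for ReductionParams / PointwiseParams.
struct ExampleParams {
  int unroll_factor = 1;
};

// Mirrors the new API shape: nullptr means "this scheduler can't be used".
std::shared_ptr<ExampleParams> getExampleHeuristics(bool can_schedule) {
  return can_schedule ? std::make_shared<ExampleParams>() : nullptr;
}

int main() {
  auto params = getExampleHeuristics(true);
  if (params == nullptr) { // was: !params.has_value()
    throw std::runtime_error("scheduler can't be used");
  }
  int unroll = params->unroll_factor; // was: params.value().unroll_factor
  return unroll == 1 ? 0 : 1;
}
```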

benchmarks/cpp/nvfuser/broadcast.cpp (1 addition, 2 deletions)

@@ -69,8 +69,7 @@ static void NvFuserScheduler_Broadcast(
 
   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
-  auto params = toString(compile_log.pointwise_params.value());
+  auto params = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
 
   benchmark_state.SetLabel(params + lparams);

benchmarks/cpp/nvfuser/reduction.cpp (1 addition, 2 deletions)

@@ -65,8 +65,7 @@ static void NvFuserScheduler_Reduction(
 
   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value());
-  auto rparams = toString(compile_log.reduction_params.value());
+  auto rparams = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
 
   benchmark_state.SetLabel(rparams + lparams);

benchmarks/cpp/nvfuser/scale_bias_relu.cpp (2 additions, 4 deletions)

@@ -135,8 +135,7 @@ static void NvFuserScheduler_SBR(
 
   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
-  auto params = toString(compile_log.pointwise_params.value());
+  auto params = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
 
   benchmark_state.SetLabel(params + lparams);
@@ -238,8 +237,7 @@ static void NvFuserScheduler_SBR_Norm(
 
   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
-  auto params = toString(compile_log.pointwise_params.value());
+  auto params = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
 
   benchmark_state.SetLabel(params + lparams);
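
broadcast.cpp, reduction.cpp, and scale_bias_relu.cpp all get the same two-line cleanup: the per-type assert-and-unwrap is replaced by one call to the new `toString(compile_log.params)` overload (defined in utils.cpp below), and the result is attached to the benchmark through google-benchmark's `SetLabel`. A minimal, self-contained sketch of that labeling pattern, assuming the google-benchmark library; the two strings are placeholders for the heuristic and launch-parameter summaries:

```cpp
#include <benchmark/benchmark.h>
#include <string>

static void BM_Labeled(benchmark::State& state) {
  // Stand-ins for toString(compile_log.params) and the launch-params string.
  std::string params = "unroll=4 ";
  std::string lparams = "grid(128,1,1) block(256,1,1)";
  for (auto _ : state) {
    benchmark::DoNotOptimize(params.size() + lparams.size());
  }
  // The label is printed next to this benchmark's timings in the report.
  state.SetLabel(params + lparams);
}
BENCHMARK(BM_Labeled);
BENCHMARK_MAIN();
```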

benchmarks/cpp/nvfuser/utils.cpp (18 additions, 12 deletions)

@@ -89,6 +89,20 @@ std::string toString(PointwiseParams params) {
   return ss.str();
 }
 
+std::string toString(const std::shared_ptr<HeuristicParams>& params) {
+  auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
+  if (rparams) {
+    return toString(*rparams);
+  }
+  auto pparams = std::dynamic_pointer_cast<PointwiseParams>(params);
+  if (pparams) {
+    return toString(*pparams);
+  }
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "Unknown heuristic parameter type. Did you add a new heuristic parameter type but forget to update this function?");
+}
+
 std::string toString(LaunchParams lparams) {
   std::stringstream ss;
   lparams.toString();
@@ -123,9 +137,7 @@ TensorView* makeContigTensor(size_t ndims, DataType dtype) {
       .build();
 }
 
-TensorView* makeConcreteTensor(
-    std::vector<int64_t> shape,
-    DataType dtype) {
+TensorView* makeConcreteTensor(std::vector<int64_t> shape, DataType dtype) {
   return TensorViewBuilder().shape(shape).dtype(dtype).build();
 }
 
@@ -157,15 +169,9 @@ void runBenchmarkIterations(
   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
 
-  if (compile_log.reduction_params.has_value()) {
-    auto rparams = toString(compile_log.reduction_params.value());
-    auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
-    benchmark_state.SetLabel(rparams + lparams);
-  } else if (compile_log.pointwise_params.has_value()) {
-    auto pparams = toString(compile_log.pointwise_params.value());
-    auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
-    benchmark_state.SetLabel(pparams + lparams);
-  }
+  auto params = toString(compile_log.params);
+  auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
+  benchmark_state.SetLabel(params + lparams);
 
   executor_instance->setMeasureKernelTimeFlag(true);
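
The new `toString` overload recovers the concrete heuristics type at run time with `std::dynamic_pointer_cast` and forwards to the existing free-function formatters, asserting on any subclass it doesn't know about. A self-contained sketch of that dispatch idiom, with hypothetical `Base` / `DerivedA` / `DerivedB` types:

```cpp
#include <iostream>
#include <memory>
#include <string>

struct Base {
  virtual ~Base() = default; // polymorphic, so dynamic casts work
};
struct DerivedA : Base {};
struct DerivedB : Base {};

std::string describe(const std::shared_ptr<Base>& p) {
  // dynamic_pointer_cast yields non-null only when the dynamic type matches.
  if (auto a = std::dynamic_pointer_cast<DerivedA>(p)) {
    return "A";
  }
  if (auto b = std::dynamic_pointer_cast<DerivedB>(p)) {
    return "B";
  }
  return "unknown"; // the real overload hits TORCH_INTERNAL_ASSERT here
}

int main() {
  std::cout << describe(std::make_shared<DerivedB>()) << "\n"; // prints "B"
}
```

Since `HeuristicParams` (introduced below) already has a virtual `toString()`, this free function presumably exists because the benchmarks keep their own, more detailed formatters for `ReductionParams` and `PointwiseParams`.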

benchmarks/cpp/nvfuser/utils.h (1 addition)

@@ -38,6 +38,7 @@ TensorView* makeContigConcreteTensor(
 
 std::string toString(ReductionParams rparams);
 std::string toString(PointwiseParams params);
+std::string toString(const std::shared_ptr<HeuristicParams>& params);
 std::string toString(LaunchParams lparams);
 
 // Run benchmark iterations with provided inputs. If not segmented, report

torch/csrc/jit/codegen/cuda/kernel_cache.cpp (6 additions, 31 deletions)

@@ -341,32 +341,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
     options.index_mode = scheduler_entry->indexMode();
     FusionGuard fg(fusion_to_run.get());
     scheduler_entry->schedule(fusion_to_run.get());
-    // Load launch params for reduction and normalization kernels
-    if (scheduler_entry->hasReductionParam()) {
-      launch_params = scheduler_entry->reductionParams().lparams;
-    } else {
-      launch_params = scheduler_entry->pointwiseParams().lparams;
-    }
+    launch_params = scheduler_entry->params()->lparams;
     executors_[group_id].compileFusion(
        fusion_to_run.get(), inputs, launch_params, options);
   } else {
-    // Load launch params for reduction and normalization kernels
-    if (scheduler_entry->hasReductionParam()) {
-      launch_params = scheduler_entry->reductionParams().lparams;
-    } else {
-      launch_params = scheduler_entry->pointwiseParams().lparams;
-    }
+    launch_params = scheduler_entry->params()->lparams;
   }
 
   if (profiling_) {
     most_recent_executor_log_.fusion_executor = &executors_[group_id];
-    if (scheduler_entry->hasReductionParam()) {
-      most_recent_executor_log_.reduction_params =
-          scheduler_entry->reductionParams();
-    } else {
-      most_recent_executor_log_.pointwise_params =
-          scheduler_entry->pointwiseParams();
-    }
+    most_recent_executor_log_.params = scheduler_entry->params()->clone();
   }
 
   auto& executor = executors_[group_id];
@@ -395,11 +379,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
       }
     }
     std::cout << "Compiler log: " << executor.compilerLog() << "\n";
-    if (scheduler_entry->hasReductionParam()) {
-      std::cout << scheduler_entry->reductionParams().toString() << "\n";
-    } else {
-      std::cout << scheduler_entry->pointwiseParams().toString() << "\n";
-    }
+    std::cout << scheduler_entry->params()->toString() << "\n";
     std::cout << "With arguments: " << executor.lastLaunchParams().toString();
     std::cout << executor.kernelName() << " " << executor.bytesProcessed()
               << " bytes/ " << std::setprecision(3) << executor.kernelTimeMs()
@@ -604,13 +584,8 @@ void FusionKernelRuntime::updateHeuristicsLaunchParams(
       update_heuristics->heuristicsList().size() == scheduler_list_length);
   for (const auto i : c10::irange(scheduler_list_length)) {
     auto& schedulerPtr = heuristics_->heuristicsList()[i];
-    if (schedulerPtr->hasReductionParam()) {
-      schedulerPtr->updateLaunchConstraint(
-          update_heuristics->heuristicsList()[i]->reductionParams().lparams);
-    } else {
-      schedulerPtr->updateLaunchConstraint(
-          update_heuristics->heuristicsList()[i]->pointwiseParams().lparams);
-    }
+    schedulerPtr->updateLaunchConstraint(
+        update_heuristics->heuristicsList()[i]->params()->lparams);
   }
 }
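
One detail worth noting: the profiling path stores `scheduler_entry->params()->clone()` rather than the live pointer, so `ExecutorLog` keeps its own snapshot that later heuristic updates can't mutate. A minimal sketch of the virtual-clone pattern this relies on (all names hypothetical):

```cpp
#include <memory>

struct Params {
  int bdimx = 0;
  virtual ~Params() = default;
  // Each subclass copies its own dynamic type; no slicing through the base.
  virtual std::shared_ptr<Params> clone() const = 0;
};

struct ConcreteParams : Params {
  std::shared_ptr<Params> clone() const override {
    return std::make_shared<ConcreteParams>(*this);
  }
};

int main() {
  std::shared_ptr<Params> live = std::make_shared<ConcreteParams>();
  auto snapshot = live->clone(); // independent copy for the profiling log
  live->bdimx = 256;             // leaves snapshot->bdimx untouched
  return snapshot->bdimx;        // still 0
}
```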

torch/csrc/jit/codegen/cuda/kernel_cache.h (1 addition, 2 deletions)

@@ -25,8 +25,7 @@ class SchedulerRuntimeInfo;
 
 // Utilities for benchmarking and profiling
 struct ExecutorLog {
-  c10::optional<ReductionParams> reduction_params = c10::nullopt;
-  c10::optional<PointwiseParams> pointwise_params = c10::nullopt;
+  std::shared_ptr<HeuristicParams> params = nullptr;
   FusionExecutor* fusion_executor = nullptr;
 };

torch/csrc/jit/codegen/cuda/scheduler/heuristic.h (new file, 37 additions)

@@ -0,0 +1,37 @@
+#pragma once
+
+#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+
+#include <string>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+class HeuristicParams {
+ public:
+  std::string tag = "";
+
+  LaunchParams lparams;
+
+  virtual std::string toString() const {
+    return "Undefined Heuristic Params";
+  }
+
+  virtual size_t hash() const = 0;
+
+  virtual ~HeuristicParams() = default;
+
+  virtual bool sameAs(const std::shared_ptr<HeuristicParams>& other) const = 0;
+
+  virtual std::shared_ptr<HeuristicParams> clone() const = 0;
+
+  HeuristicParams() = default;
+  HeuristicParams(const std::string& tag) : tag(tag) {}
+};
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
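
The new header defines the abstract interface that the concrete heuristics (`ReductionParams`, `PointwiseParams`) plug into. To make the contract concrete, here is a hypothetical subclass showing what an implementation of the pure virtuals looks like; `ToyParams` and its field are illustrative only, and the extra `<functional>` / `<memory>` includes are added for self-containment:

```cpp
#include <functional>
#include <memory>
#include <string>

// Hypothetical subclass of the HeuristicParams class above.
class ToyParams : public HeuristicParams {
 public:
  int vectorize_factor = 1;

  ToyParams() : HeuristicParams("toy") {}

  std::string toString() const override {
    return "ToyParams: vectorize_factor=" + std::to_string(vectorize_factor);
  }

  size_t hash() const override {
    return std::hash<int>{}(vectorize_factor);
  }

  bool sameAs(const std::shared_ptr<HeuristicParams>& other) const override {
    auto o = std::dynamic_pointer_cast<ToyParams>(other);
    return o != nullptr && o->vectorize_factor == vectorize_factor;
  }

  std::shared_ptr<HeuristicParams> clone() const override {
    return std::make_shared<ToyParams>(*this);
  }
};
```

Note that `lparams` lives on the base class, which is what lets kernel_cache.cpp read `params()->lparams` without knowing the concrete heuristics type.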