Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 262 additions & 0 deletions benchmarks/cpp/nvfuser/shape_inference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,271 @@ static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
LayerNormForward_ShapeInferenceBase(benchmark_state, true);
}

// Builds the first benchmark graph (a layer-norm-plus-dropout-style
// forward pattern — presumably; inferred from the op sequence, verify
// against the originating network), compiles and runs it once through a
// FusionExecutorCache, and returns the most recent kernel runtime.
//
// Ownership of `fusion_ptr` is transferred into `fec`, which is returned
// to the caller by reference (together with `aten_inputs`) so the
// benchmark loop can keep re-running the cached fusion.
//
// NOTE(review): the IR builder calls below record nodes into the
// currently guarded fusion — this relies on the caller having installed a
// FusionGuard for `*fusion_ptr` before calling.
auto getFusedGraph0_1SegmentRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs) {
  Fusion& fusion = *fusion_ptr;

  // Symbolic inputs: two 1-D half tensors and three 3-D half tensors.
  auto t0 = makeSymbolicTensor(1, DataType::Half);
  auto t2 = makeSymbolicTensor(1, DataType::Half);
  auto t4 = makeSymbolicTensor(3, DataType::Half);
  auto t6 = makeSymbolicTensor(3, DataType::Half);
  auto t8 = makeSymbolicTensor(3, DataType::Half);

  fusion.addInput(t0);
  fusion.addInput(t2);
  fusion.addInput(t4);
  fusion.addInput(t6);
  fusion.addInput(t8);

  // Graph body. Variable names (tN/dN) mirror the IR value numbering of
  // the captured graph; statement order is significant to IR node order,
  // so it is kept exactly as captured.
  auto t7 = castOp(DataType::Float, t6);
  auto t9 = castOp(DataType::Float, t8);
  auto t10 = add(t7, t9);
  auto t5 = castOp(DataType::Float, t4);
  auto t11 = add(t10, t5);
  auto t34 = sum(t11, {2});
  auto d57 = mul(t6->getRootDomain()[2]->extent(), new Double(1));
  auto t12 = div(t34, d57);
  auto t15 = broadcast(t12, {false, false, true});
  auto t16 = sub(t11, t15);
  auto t35 = mul(t16, t16);
  auto t13 = sum(t35, {2});
  auto t17 = broadcast(t13, {false, false, true});
  auto d29 = mul(t6->getRootDomain()[2]->extent(), new Double(1));
  auto t18 = div(t17, d29);
  auto t19 = mul(t18, new Double(9.99e-13));
  auto t20 = unaryOp(UnaryOpType::Rsqrt, t19);
  auto t21 = mul(t16, t20);
  auto t1 = castOp(DataType::Float, t0);
  auto t22 = broadcast(t1, {true, true, false});
  auto t23 = mul(t21, t22);
  auto t3 = castOp(DataType::Float, t2);
  auto t24 = broadcast(t3, {true, true, false});
  auto t25 = add(t23, t24);
  auto t26 = unaryOp(UnaryOpType::RandLike, t25);
  auto t27 = binaryOp(BinaryOpType::LT, t26, new Double(0.9));
  auto t28 = mul(t25, t27);
  auto t29 = mul(t28, new Double(1.11));
  auto t30 = castOp(DataType::Half, t29);
  auto t31 = castOp(DataType::Half, t15);
  auto t32 = castOp(DataType::Half, t20);
  auto t33 = castOp(DataType::Half, t11);

  fusion.addOutput(t30);
  fusion.addOutput(t27);
  fusion.addOutput(t31);
  fusion.addOutput(t32);
  fusion.addOutput(t33);

  // Concrete ATen inputs matching the symbolic shapes above.
  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor at_t0 = at::randn({1024}, options);
  at::Tensor at_t2 = at::randn({1024}, options);
  at::Tensor at_t4 = at::randn({64, 128, 1024}, options);
  at::Tensor at_t6 = at::randn({64, 128, 1024}, options);
  at::Tensor at_t8 = at::randn({64, 128, 1024}, options);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {at_t0, at_t2, at_t4, at_t6, at_t8};
  // Run once to compile; the outputs themselves are not needed here.
  fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}

// Benchmark body for FusedGraph0: compile the fusion once outside the
// timed loop, then time only the host-side shape-inference / executor-
// cache path by re-running the fusion with kernel launches disabled.
//
// @param benchmark_state          google-benchmark state driving the loop
// @param disable_launch_param_cache when true, also measure launch-param
//                                  recomputation on every iteration
void ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(
    benchmark::State& benchmark_state,
    bool disable_launch_param_cache) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Pre-allocated holders, populated by the helper below.
  std::unique_ptr<FusionExecutorCache> fec;
  std::vector<at::IValue> aten_inputs;

  // Compile up front so the timed loop measures only cached runs.
  getFusedGraph0_1SegmentRuntime(std::move(fusion_ptr), fec, aten_inputs);

  fec->profile(true);
  // Skip actual kernel launches: only the host-side work is of interest.
  fec->disableKernelLaunch();
  // Warm-up run to populate caches before measurement starts.
  fec->runFusionWithInputs(aten_inputs);

  if (disable_launch_param_cache) {
    fec->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Measured work: one host-side fusion dispatch per iteration
    // (kernel launch is disabled above).
    fec->runFusionWithInputs(aten_inputs);
  }
}

auto getFusedGraph1_2SegmentsRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs) {
Fusion& fusion = *fusion_ptr.get();

auto t0 = makeSymbolicTensor(3, DataType::Float);
auto t1 = makeSymbolicTensor(3, DataType::Half);
auto t3 = makeSymbolicTensor(3, DataType::Half);
auto t5 = makeSymbolicTensor(3, DataType::Half);
auto t7 = makeSymbolicTensor(1, DataType::Half);
auto t11 = makeSymbolicTensor(3, DataType::Half);
auto t13 = makeSymbolicTensor(3, DataType::Half);
auto t15 = makeSymbolicTensor(3, DataType::Half);
auto t17 = makeSymbolicTensor(3, DataType::Half);
auto d56 = new Double();

fusion.addInput(t0);
fusion.addInput(t1);
fusion.addInput(t3);
fusion.addInput(t5);
fusion.addInput(t7);
fusion.addInput(t11);
fusion.addInput(t13);
fusion.addInput(t15);
fusion.addInput(t17);
fusion.addInput(d56);

auto t2 = castOp(DataType::Float, t1);
auto t4 = castOp(DataType::Float, t3);
auto t22 = sub(t2, t4);
auto t6 = castOp(DataType::Float, t5);
auto t23 = mul(t22, t6);
auto t16 = castOp(DataType::Float, t15);
auto t18 = castOp(DataType::Float, t17);
auto t19 = add(t16, t18);
auto t14 = castOp(DataType::Float, t13);
auto t20 = add(t19, t14);
auto t12 = castOp(DataType::Float, t11);
auto t21 = add(t20, t12);
auto t8 = castOp(DataType::Float, t7);
auto t24 = broadcast(t8, {true, true, false});
auto t25 = mul(t21, t24);
auto t27 = sum(t25, {2});
auto t28 = broadcast(t27, {false, false, true});
auto t29 = mul(t25, t23);
auto t30 = sum(t29, {2});
auto t31 = broadcast(t30, {false, false, true});
auto d59 = mul(t1->getRootDomain()[2]->extent(), new Double(1));
auto t26 = mul(d59, t25);
auto t33 = sub(t26, t28);
auto d70 = unaryOp(UnaryOpType::Reciprocal, d59);
auto t35 = mul(d70, t6);
auto t39 = sum(t21, {0, 1});
auto t47 = castOp(DataType::Half, t39);
auto t37 = mul(t21, t23);
auto t38 = sum(t37, {0, 1});
auto t46 = castOp(DataType::Half, t38);
auto t32 = mul(t23, t31);
auto t34 = sub(t33, t32);
auto t36 = mul(t35, t34);
auto t45 = castOp(DataType::Half, t36);
auto t40 = mul(t36, t0);
auto t41 = mul(t40, d56);
auto t44 = castOp(DataType::Half, t41);
auto t42 = sum(t41, {0, 1});
auto t43 = castOp(DataType::Half, t42);

fusion.addOutput(t43);
fusion.addOutput(t44);
fusion.addOutput(t45);
fusion.addOutput(t46);
fusion.addOutput(t47);

auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto options_float =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float);
at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half);
at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half);
at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half);
at::Tensor at_t7 = at::randn({1024}, options_half);
at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half);
at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half);
at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half);
at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half);
double at_d56 = 1.1111;

fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
at_t0,
at_t1,
at_t3,
at_t5,
at_t7,
at_t11,
at_t13,
at_t15,
at_t17,
at_d56};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);

return fec->getMostRecentKernelRuntime();
}

// Benchmark body for FusedGraph1: compile the fusion once outside the
// timed loop, then time only the host-side shape-inference / executor-
// cache path by re-running the fusion with kernel launches disabled.
//
// @param benchmark_state          google-benchmark state driving the loop
// @param disable_launch_param_cache when true, also measure launch-param
//                                  recomputation on every iteration
void ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(
    benchmark::State& benchmark_state,
    bool disable_launch_param_cache) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Pre-allocated holders, populated by the helper below.
  std::unique_ptr<FusionExecutorCache> fec;
  std::vector<at::IValue> aten_inputs;

  // Compile up front so the timed loop measures only cached runs.
  getFusedGraph1_2SegmentsRuntime(std::move(fusion_ptr), fec, aten_inputs);

  fec->profile(true);
  // Skip actual kernel launches: only the host-side work is of interest.
  fec->disableKernelLaunch();
  // Warm-up run to populate caches before measurement starts.
  fec->runFusionWithInputs(aten_inputs);

  if (disable_launch_param_cache) {
    fec->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Measured work: one host-side fusion dispatch per iteration
    // (kernel launch is disabled above).
    fec->runFusionWithInputs(aten_inputs);
  }
}

// Registered benchmark: FusedGraph1 with the launch-parameter cache
// disabled (true), so shape inference runs on every iteration.
static void ShapeInferenceBenchmark_FusedGraph1_2Segments(
benchmark::State& benchmark_state) {
ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, true);
}

// Registered baseline: FusedGraph1 with the launch-parameter cache left
// enabled (false), measuring the cached path without shape inference.
static void
ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline(
benchmark::State& benchmark_state) {
ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, false);
}

// Registered benchmark: FusedGraph0 with the launch-parameter cache
// disabled (true), so shape inference runs on every iteration.
static void ShapeInferenceBenchmark_FusedGraph0_1Segment(
benchmark::State& benchmark_state) {
ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, true);
}

// Registered baseline: FusedGraph0 with the launch-parameter cache left
// enabled (false), measuring the cached path without shape inference.
static void
ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline(
benchmark::State& benchmark_state) {
ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, false);
}

// google-benchmark registrations. Each pair compares the shape-inference
// path against its cached (no-shape-inference) baseline, reported in
// microseconds.
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);

BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment)
->Unit(benchmark::kMicrosecond);
BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline)
->Unit(benchmark::kMicrosecond);
BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments)
->Unit(benchmark::kMicrosecond);
BENCHMARK(
ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline)
->Unit(benchmark::kMicrosecond);