diff --git a/benchmarks/cpp/nvfuser/shape_inference.cpp b/benchmarks/cpp/nvfuser/shape_inference.cpp
index 33a9404b07390..2997751fac015 100644
--- a/benchmarks/cpp/nvfuser/shape_inference.cpp
+++ b/benchmarks/cpp/nvfuser/shape_inference.cpp
@@ -214,9 +214,271 @@ static void LayerNormForward_ShapeInference(
     benchmark::State& benchmark_state) {
   LayerNormForward_ShapeInferenceBase(benchmark_state, true);
 }
+// Builds a single-segment fused graph (layer-norm + dropout style), compiles
+// it once through FusionExecutorCache, and returns the compiled runtime.
+auto getFusedGraph0_1SegmentRuntime(
+    std::unique_ptr<Fusion> fusion_ptr,
+    std::unique_ptr<FusionExecutorCache>& fec,
+    std::vector<c10::IValue>& aten_inputs) {
+  Fusion& fusion = *fusion_ptr.get();
+
+  auto t0 = makeSymbolicTensor(1, DataType::Half);
+  auto t2 = makeSymbolicTensor(1, DataType::Half);
+  auto t4 = makeSymbolicTensor(3, DataType::Half);
+  auto t6 = makeSymbolicTensor(3, DataType::Half);
+  auto t8 = makeSymbolicTensor(3, DataType::Half);
+
+  fusion.addInput(t0);
+  fusion.addInput(t2);
+  fusion.addInput(t4);
+  fusion.addInput(t6);
+  fusion.addInput(t8);
+
+  auto t7 = castOp(DataType::Float, t6);
+  auto t9 = castOp(DataType::Float, t8);
+  auto t10 = add(t7, t9);
+  auto t5 = castOp(DataType::Float, t4);
+  auto t11 = add(t10, t5);
+  auto t34 = sum(t11, {2});
+  auto d57 = mul(t6->getRootDomain()[2]->extent(), new Double(1));
+  auto t12 = div(t34, d57);
+  auto t15 = broadcast(t12, {false, false, true});
+  auto t16 = sub(t11, t15);
+  auto t35 = mul(t16, t16);
+  auto t13 = sum(t35, {2});
+  auto t17 = broadcast(t13, {false, false, true});
+  auto d29 = mul(t6->getRootDomain()[2]->extent(), new Double(1));
+  auto t18 = div(t17, d29);
+  auto t19 = mul(t18, new Double(9.99e-13));
+  auto t20 = unaryOp(UnaryOpType::Rsqrt, t19);
+  auto t21 = mul(t16, t20);
+  auto t1 = castOp(DataType::Float, t0);
+  auto t22 = broadcast(t1, {true, true, false});
+  auto t23 = mul(t21, t22);
+  auto t3 = castOp(DataType::Float, t2);
+  auto t24 = broadcast(t3, {true, true, false});
+  auto t25 = add(t23, t24);
+  auto t26 = unaryOp(UnaryOpType::RandLike, t25);
+  auto t27 = binaryOp(BinaryOpType::LT, t26, new Double(0.9));
+  auto t28 = mul(t25, t27);
+  auto t29 = mul(t28, new Double(1.11));
+  auto t30 = castOp(DataType::Half, t29);
+  auto t31 = castOp(DataType::Half, t15);
+  auto t32 = castOp(DataType::Half, t20);
+  auto t33 = castOp(DataType::Half, t11);
+
+  fusion.addOutput(t30);
+  fusion.addOutput(t27);
+  fusion.addOutput(t31);
+  fusion.addOutput(t32);
+  fusion.addOutput(t33);
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor at_t0 = at::randn({1024}, options);
+  at::Tensor at_t2 = at::randn({1024}, options);
+  at::Tensor at_t4 = at::randn({64, 128, 1024}, options);
+  at::Tensor at_t6 = at::randn({64, 128, 1024}, options);
+  at::Tensor at_t8 = at::randn({64, 128, 1024}, options);
+
+  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+  aten_inputs = {at_t0, at_t2, at_t4, at_t6, at_t8};
+  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+
+  return fec->getMostRecentKernelRuntime();
+}
+
+// Compiles the fused graph once, then times repeated runFusionWithInputs
+// calls with kernel launches disabled, so the loop measures the host-side
+// shape-inference / launch-parameter path rather than kernel execution.
+void ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(
+    benchmark::State& benchmark_state,
+    bool disable_launch_param_cache) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  FusionGuard fg(fusion_ptr.get());
+
+  // PreAllocate
+  std::unique_ptr<FusionExecutorCache> fec;
+  std::vector<c10::IValue> aten_inputs;
+
+  auto runtime =
+      getFusedGraph0_1SegmentRuntime(std::move(fusion_ptr), fec, aten_inputs);
+
+  fec->profile(true);
+  fec->disableKernelLaunch();
+  fec->runFusionWithInputs(aten_inputs);
+
+  if (disable_launch_param_cache) {
+    fec->disableLaunchParamCache();
+  }
+
+  for (auto _ : benchmark_state) {
+    // Setup (not included in the measurement)
+    fec->runFusionWithInputs(aten_inputs);
+  }
+}
+
+// Builds the two-segment fused graph (layer-norm backward style), runs it
+// once through FusionExecutorCache, and returns the compiled runtime.
+auto getFusedGraph1_2SegmentsRuntime(
+    std::unique_ptr<Fusion> fusion_ptr,
+    std::unique_ptr<FusionExecutorCache>& fec,
+    std::vector<c10::IValue>& aten_inputs) {
+  Fusion& fusion = *fusion_ptr.get();
+
+  auto t0 = makeSymbolicTensor(3, DataType::Float);
+  auto t1 = makeSymbolicTensor(3, DataType::Half);
+  auto t3 = makeSymbolicTensor(3, DataType::Half);
+  auto t5 = makeSymbolicTensor(3, DataType::Half);
+  auto t7 = makeSymbolicTensor(1, DataType::Half);
+  auto t11 = makeSymbolicTensor(3, DataType::Half);
+  auto t13 = makeSymbolicTensor(3, DataType::Half);
+  auto t15 = makeSymbolicTensor(3, DataType::Half);
+  auto t17 = makeSymbolicTensor(3, DataType::Half);
+  auto d56 = new Double();
+
+  fusion.addInput(t0);
+  fusion.addInput(t1);
+  fusion.addInput(t3);
+  fusion.addInput(t5);
+  fusion.addInput(t7);
+  fusion.addInput(t11);
+  fusion.addInput(t13);
+  fusion.addInput(t15);
+  fusion.addInput(t17);
+  fusion.addInput(d56);
+
+  auto t2 = castOp(DataType::Float, t1);
+  auto t4 = castOp(DataType::Float, t3);
+  auto t22 = sub(t2, t4);
+  auto t6 = castOp(DataType::Float, t5);
+  auto t23 = mul(t22, t6);
+  auto t16 = castOp(DataType::Float, t15);
+  auto t18 = castOp(DataType::Float, t17);
+  auto t19 = add(t16, t18);
+  auto t14 = castOp(DataType::Float, t13);
+  auto t20 = add(t19, t14);
+  auto t12 = castOp(DataType::Float, t11);
+  auto t21 = add(t20, t12);
+  auto t8 = castOp(DataType::Float, t7);
+  auto t24 = broadcast(t8, {true, true, false});
+  auto t25 = mul(t21, t24);
+  auto t27 = sum(t25, {2});
+  auto t28 = broadcast(t27, {false, false, true});
+  auto t29 = mul(t25, t23);
+  auto t30 = sum(t29, {2});
+  auto t31 = broadcast(t30, {false, false, true});
+  auto d59 = mul(t1->getRootDomain()[2]->extent(), new Double(1));
+  auto t26 = mul(d59, t25);
+  auto t33 = sub(t26, t28);
+  auto d70 = unaryOp(UnaryOpType::Reciprocal, d59);
+  auto t35 = mul(d70, t6);
+  auto t39 = sum(t21, {0, 1});
+  auto t47 = castOp(DataType::Half, t39);
+  auto t37 = mul(t21, t23);
+  auto t38 = sum(t37, {0, 1});
+  auto t46 = castOp(DataType::Half, t38);
+  auto t32 = mul(t23, t31);
+  auto t34 = sub(t33, t32);
+  auto t36 = mul(t35, t34);
+  auto t45 = castOp(DataType::Half, t36);
+  auto t40 = mul(t36, t0);
+  auto t41 = mul(t40, d56);
+  auto t44 = castOp(DataType::Half, t41);
+  auto t42 = sum(t41, {0, 1});
+  auto t43 = castOp(DataType::Half, t42);
+
+  fusion.addOutput(t43);
+  fusion.addOutput(t44);
+  fusion.addOutput(t45);
+  fusion.addOutput(t46);
+  fusion.addOutput(t47);
+
+  auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  auto options_float =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float);
+  at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t7 = at::randn({1024}, options_half);
+  at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half);
+  double at_d56 = 1.1111;
+
+  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+  aten_inputs = {
+      at_t0,
+      at_t1,
+      at_t3,
+      at_t5,
+      at_t7,
+      at_t11,
+      at_t13,
+      at_t15,
+      at_t17,
+      at_d56};
+  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+
+  return fec->getMostRecentKernelRuntime();
+}
+
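+// Same measurement as the single-segment benchmark above: compile the graph
+// once, then time repeated runFusionWithInputs calls with kernel launches
+// disabled, so the loop times host-side work (shape inference and
+// launch-parameter binding) rather than kernel execution.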
+void ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(
+    benchmark::State& benchmark_state,
+    bool disable_launch_param_cache) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  FusionGuard fg(fusion_ptr.get());
+
+  // PreAllocate
+  std::unique_ptr<FusionExecutorCache> fec;
+  std::vector<c10::IValue> aten_inputs;
+
+  auto runtime =
+      getFusedGraph1_2SegmentsRuntime(std::move(fusion_ptr), fec, aten_inputs);
+
+  fec->profile(true);
+  fec->disableKernelLaunch();
+  fec->runFusionWithInputs(aten_inputs);
+
+  if (disable_launch_param_cache) {
+    fec->disableLaunchParamCache();
+  }
+
+  for (auto _ : benchmark_state) {
+    // Setup (not included in the measurement)
+    fec->runFusionWithInputs(aten_inputs);
+  }
+}
+
+static void ShapeInferenceBenchmark_FusedGraph1_2Segments(
+    benchmark::State& benchmark_state) {
+  ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, true);
+}
+
+static void
+ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline(
+    benchmark::State& benchmark_state) {
+  ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, false);
+}
+
+static void ShapeInferenceBenchmark_FusedGraph0_1Segment(
+    benchmark::State& benchmark_state) {
+  ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, true);
+}
+
+static void
+ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline(
+    benchmark::State& benchmark_state) {
+  ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, false);
+}
+
 BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
 BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
 BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
     ->Unit(benchmark::kMicrosecond);
 BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
     ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(
+    ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline)
+    ->Unit(benchmark::kMicrosecond);