From 2f134704f1714f247abea9088b54728b9155e9be Mon Sep 17 00:00:00 2001 From: shmsong Date: Tue, 7 Sep 2021 13:00:10 -0700 Subject: [PATCH 1/2] add perf benchmark --- benchmarks/cpp/nvfuser/shape_inference.cpp | 262 +++++++++++++++++++++ 1 file changed, 262 insertions(+) diff --git a/benchmarks/cpp/nvfuser/shape_inference.cpp b/benchmarks/cpp/nvfuser/shape_inference.cpp index 33a9404b0739..3eec18b38ce8 100644 --- a/benchmarks/cpp/nvfuser/shape_inference.cpp +++ b/benchmarks/cpp/nvfuser/shape_inference.cpp @@ -214,9 +214,271 @@ static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) { LayerNormForward_ShapeInferenceBase(benchmark_state, true); } +auto getFusedGraph0_1SegmentRuntime( + std::unique_ptr fusion_ptr, + std::unique_ptr& fec, + std::vector& aten_inputs) { + Fusion& fusion = *fusion_ptr.get(); + + auto t0 = makeSymbolicTensor(1,DataType::Half); + auto t2 = makeSymbolicTensor(1,DataType::Half); + auto t4 = makeSymbolicTensor(3,DataType::Half); + auto t6 = makeSymbolicTensor(3,DataType::Half); + auto t8 = makeSymbolicTensor(3,DataType::Half); + + fusion.addInput(t0); + fusion.addInput(t2); + fusion.addInput(t4); + fusion.addInput(t6); + fusion.addInput(t8); + + auto t7 = castOp(DataType::Float, t6); + auto t9 = castOp(DataType::Float, t8); + auto t10 = add(t7,t9); + auto t5 = castOp(DataType::Float, t4); + auto t11 = add(t10, t5); + auto t34 = sum(t11,{2}); + auto d57 = mul(t6->getRootDomain()[2]->extent(),new Double(1)); + auto t12 = div(t34,d57); + auto t15 = broadcast(t12,{false,false,true}); + auto t16 = sub(t11,t15); + auto t35 = mul(t16,t16); + auto t13 = sum(t35,{2}); + auto t17 = broadcast(t13,{false,false,true}); + auto d29 = mul(t6->getRootDomain()[2]->extent(),new Double(1)); + auto t18 = div(t17,d29); + auto t19 = mul(t18, new Double(9.99e-13)); + auto t20 = unaryOp(UnaryOpType::Rsqrt, t19); + auto t21 = mul(t16, t20); + auto t1 = castOp(DataType::Float, t0); + auto t22 = broadcast(t1,{true,true,false}); + auto t23 = mul(t21,t22); + auto t3 = castOp(DataType::Float, t2); + auto t24 = broadcast(t3,{true,true,false}); + auto t25 = add(t23,t24); + auto t26 = unaryOp(UnaryOpType::RandLike, t25); + auto t27 = binaryOp(BinaryOpType::LT,t26, new Double(0.9)); + auto t28 = mul(t25,t27); + auto t29 = mul(t28,new Double(1.11)); + auto t30 = castOp(DataType::Half, t29); + auto t31 = castOp(DataType::Half, t15); + auto t32 = castOp(DataType::Half, t20); + auto t33 = castOp(DataType::Half, t11); + + fusion.addOutput(t30); + fusion.addOutput(t27); + fusion.addOutput(t31); + fusion.addOutput(t32); + fusion.addOutput(t33); + + auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + at::Tensor at_t0 = at::randn({1024},options); + at::Tensor at_t2 = at::randn({1024},options); + at::Tensor at_t4 = at::randn({64, 128, 1024},options); + at::Tensor at_t6 = at::randn({64, 128, 1024},options); + at::Tensor at_t8 = at::randn({64, 128, 1024},options); + + fec = std::make_unique(std::move(fusion_ptr)); + aten_inputs = { + at_t0, + at_t2, + at_t4, + at_t6, + at_t8 + }; + auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + + return fec->getMostRecentKernelRuntime(); +} + +void ShapeInferenceBenchmark_FusedGraph0_1SegmentBase( + benchmark::State& benchmark_state, + bool disable_launch_param_cache) { + std::unique_ptr fusion_ptr = std::make_unique(); + FusionGuard fg(fusion_ptr.get()); + + // PreAllocate + std::unique_ptr fec; + std::vector aten_inputs; + + auto runtime = getFusedGraph0_1SegmentRuntime(std::move(fusion_ptr), fec, aten_inputs); + + fec->profile(true); + fec->disableKernelLaunch(); + fec->runFusionWithInputs(aten_inputs); + + if (disable_launch_param_cache) { + fec->disableLaunchParamCache(); + } + + for (auto _ : benchmark_state) { + // Setup (not included in the measurement) + fec->runFusionWithInputs(aten_inputs); + } +} + +auto getFusedGraph1_2SegmentsRuntime( + std::unique_ptr fusion_ptr, + std::unique_ptr& fec, + std::vector& aten_inputs) { + Fusion& fusion = *fusion_ptr.get(); + + auto t0 = makeSymbolicTensor(3,DataType::Float); + auto t1 = makeSymbolicTensor(3,DataType::Half); + auto t3 = makeSymbolicTensor(3,DataType::Half); + auto t5 = makeSymbolicTensor(3,DataType::Half); + auto t7 = makeSymbolicTensor(1,DataType::Half); + auto t11 = makeSymbolicTensor(3,DataType::Half); + auto t13 = makeSymbolicTensor(3,DataType::Half); + auto t15 = makeSymbolicTensor(3,DataType::Half); + auto t17 = makeSymbolicTensor(3,DataType::Half); + auto d56 = new Double(); + + fusion.addInput(t0); + fusion.addInput(t1); + fusion.addInput(t3); + fusion.addInput(t5); + fusion.addInput(t7); + fusion.addInput(t11); + fusion.addInput(t13); + fusion.addInput(t15); + fusion.addInput(t17); + fusion.addInput(d56); + + + auto t2 = castOp(DataType::Float, t1); + auto t4 = castOp(DataType::Float, t3); + auto t22 = sub(t2,t4); + auto t6 = castOp(DataType::Float, t5); + auto t23 = mul(t22,t6); + auto t16 = castOp(DataType::Float, t15); + auto t18 = castOp(DataType::Float, t17); + auto t19 = add(t16, t18); + auto t14 = castOp(DataType::Float, t13); + auto t20 = add(t19,t14); + auto t12 = castOp(DataType::Float, t11); + auto t21 = add(t20,t12); + auto t8 = castOp(DataType::Float, t7); + auto t24 = broadcast(t8,{true,true,false}); + auto t25 = mul(t21,t24); + auto t27 = sum(t25,{2}); + auto t28 = broadcast(t27,{false,false,true}); + auto t29 = mul(t25,t23); + auto t30 = sum(t29, {2}); + auto t31 = broadcast(t30, {false,false,true}); + auto d59 = mul(t1->getRootDomain()[2]->extent(),new Double(1)); + auto t26 = mul(d59,t25); + auto t33 = sub(t26,t28); + auto d70 = unaryOp(UnaryOpType::Reciprocal, d59); + auto t35 = mul(d70,t6); + auto t39 = sum(t21,{0,1}); + auto t47 = castOp(DataType::Half, t39); + auto t37 = mul(t21,t23); + auto t38 = sum(t37,{0,1}); + auto t46 = castOp(DataType::Half, t38); + auto t32 = mul(t23,t31); + auto t34 = sub(t33,t32); + auto t36 = mul(t35,t34); + auto t45 = castOp(DataType::Half, t36); + auto t40 = mul(t36,t0); + auto t41 = mul(t40,d56); + auto t44 = castOp(DataType::Half, t41); + auto t42 = sum(t41,{0,1}); + auto t43 = castOp(DataType::Half, t42); + + fusion.addOutput(t43); + fusion.addOutput(t44); + fusion.addOutput(t45); + fusion.addOutput(t46); + fusion.addOutput(t47); + + auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto options_float = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_t0 = at::randn({128, 64, 1024},options_float); + at::Tensor at_t1 = at::randn({128, 64, 1024},options_half); + at::Tensor at_t3 = at::randn({128, 64, 1024},options_half); + at::Tensor at_t5 = at::randn({128, 64, 1024},options_half); + at::Tensor at_t7 = at::randn({1024},options_half); + at::Tensor at_t11 = at::randn({128, 64, 1024},options_half); + at::Tensor at_t13 = at::randn({128, 64, 1024},options_half); + at::Tensor at_t15 = at::randn({128, 64, 1024},options_half); + at::Tensor at_t17 = at::randn({128, 64, 1024},options_half); + double at_d56 = 1.1111; + + fec = std::make_unique(std::move(fusion_ptr)); + aten_inputs = { + at_t0, + at_t1, + at_t3, + at_t5, + at_t7, + at_t11, + at_t13, + at_t15, + at_t17, + at_d56 + }; + auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + + return fec->getMostRecentKernelRuntime(); +} + +void ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase( + benchmark::State& benchmark_state, + bool disable_launch_param_cache) { + std::unique_ptr fusion_ptr = std::make_unique(); + FusionGuard fg(fusion_ptr.get()); + + // PreAllocate + std::unique_ptr fec; + std::vector aten_inputs; + + auto runtime = getFusedGraph1_2SegmentsRuntime(std::move(fusion_ptr), fec, aten_inputs); + + fec->profile(true); + fec->disableKernelLaunch(); + fec->runFusionWithInputs(aten_inputs); + + if (disable_launch_param_cache) { + fec->disableLaunchParamCache(); + } + + for (auto _ : benchmark_state) { + // Setup (not included in the measurement) + fec->runFusionWithInputs(aten_inputs); + } +} + +static void ShapeInferenceBenchmark_FusedGraph1_2Segments( + benchmark::State& benchmark_state) { + ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, true); +} + +static void ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline( + benchmark::State& benchmark_state) { + ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, false); +} + +static void ShapeInferenceBenchmark_FusedGraph0_1Segment( + benchmark::State& benchmark_state) { + ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, true); +} + +static void ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline( + benchmark::State& benchmark_state) { + ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, false); +} + BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond); BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond); BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond); BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond); + +BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment)->Unit(benchmark::kMicrosecond); +BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments)->Unit(benchmark::kMicrosecond); +BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline) + ->Unit(benchmark::kMicrosecond); From 8faa7d79894d34ba0779e7df681a448acf94ceee Mon Sep 17 00:00:00 2001 From: shmsong Date: Thu, 9 Sep 2021 15:27:00 -0700 Subject: [PATCH 2/2] format --- benchmarks/cpp/nvfuser/shape_inference.cpp | 196 ++++++++++----------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/benchmarks/cpp/nvfuser/shape_inference.cpp b/benchmarks/cpp/nvfuser/shape_inference.cpp index 3eec18b38ce8..2997751fac01 100644 --- a/benchmarks/cpp/nvfuser/shape_inference.cpp +++ b/benchmarks/cpp/nvfuser/shape_inference.cpp @@ -220,12 +220,12 @@ auto getFusedGraph0_1SegmentRuntime( std::vector& aten_inputs) { Fusion& fusion = *fusion_ptr.get(); - auto t0 = makeSymbolicTensor(1,DataType::Half); - auto t2 = makeSymbolicTensor(1,DataType::Half); - auto t4 = makeSymbolicTensor(3,DataType::Half); - auto t6 = makeSymbolicTensor(3,DataType::Half); - auto t8 = makeSymbolicTensor(3,DataType::Half); - + auto t0 = makeSymbolicTensor(1, DataType::Half); + auto t2 = makeSymbolicTensor(1, DataType::Half); + auto t4 = makeSymbolicTensor(3, DataType::Half); + auto t6 = makeSymbolicTensor(3, DataType::Half); + auto t8 = makeSymbolicTensor(3, DataType::Half); + fusion.addInput(t0); fusion.addInput(t2); fusion.addInput(t4); @@ -234,32 +234,32 @@ auto getFusedGraph0_1SegmentRuntime( auto t7 = castOp(DataType::Float, t6); auto t9 = castOp(DataType::Float, t8); - auto t10 = add(t7,t9); + auto t10 = add(t7, t9); auto t5 = castOp(DataType::Float, t4); auto t11 = add(t10, t5); - auto t34 = sum(t11,{2}); - auto d57 = mul(t6->getRootDomain()[2]->extent(),new Double(1)); - auto t12 = div(t34,d57); - auto t15 = broadcast(t12,{false,false,true}); - auto t16 = sub(t11,t15); - auto t35 = mul(t16,t16); - auto t13 = sum(t35,{2}); - auto t17 = broadcast(t13,{false,false,true}); - auto d29 = mul(t6->getRootDomain()[2]->extent(),new Double(1)); - auto t18 = div(t17,d29); + auto t34 = sum(t11, {2}); + auto d57 = mul(t6->getRootDomain()[2]->extent(), new Double(1)); + auto t12 = div(t34, d57); + auto t15 = broadcast(t12, {false, false, true}); + auto t16 = sub(t11, t15); + auto t35 = mul(t16, t16); + auto t13 = sum(t35, {2}); + auto t17 = broadcast(t13, {false, false, true}); + auto d29 = mul(t6->getRootDomain()[2]->extent(), new Double(1)); + auto t18 = div(t17, d29); auto t19 = mul(t18, new Double(9.99e-13)); auto t20 = unaryOp(UnaryOpType::Rsqrt, t19); auto t21 = mul(t16, t20); auto t1 = castOp(DataType::Float, t0); - auto t22 = broadcast(t1,{true,true,false}); - auto t23 = mul(t21,t22); + auto t22 = broadcast(t1, {true, true, false}); + auto t23 = mul(t21, t22); auto t3 = castOp(DataType::Float, t2); - auto t24 = broadcast(t3,{true,true,false}); - auto t25 = add(t23,t24); + auto t24 = broadcast(t3, {true, true, false}); + auto t25 = add(t23, t24); auto t26 = unaryOp(UnaryOpType::RandLike, t25); - auto t27 = binaryOp(BinaryOpType::LT,t26, new Double(0.9)); - auto t28 = mul(t25,t27); - auto t29 = mul(t28,new Double(1.11)); + auto t27 = binaryOp(BinaryOpType::LT, t26, new Double(0.9)); + auto t28 = mul(t25, t27); + auto t29 = mul(t28, new Double(1.11)); auto t30 = castOp(DataType::Half, t29); auto t31 = castOp(DataType::Half, t15); auto t32 = castOp(DataType::Half, t20); @@ -272,20 +272,14 @@ auto getFusedGraph0_1SegmentRuntime( fusion.addOutput(t33); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor at_t0 = at::randn({1024},options); - at::Tensor at_t2 = at::randn({1024},options); - at::Tensor at_t4 = at::randn({64, 128, 1024},options); - at::Tensor at_t6 = at::randn({64, 128, 1024},options); - at::Tensor at_t8 = at::randn({64, 128, 1024},options); + at::Tensor at_t0 = at::randn({1024}, options); + at::Tensor at_t2 = at::randn({1024}, options); + at::Tensor at_t4 = at::randn({64, 128, 1024}, options); + at::Tensor at_t6 = at::randn({64, 128, 1024}, options); + at::Tensor at_t8 = at::randn({64, 128, 1024}, options); fec = std::make_unique(std::move(fusion_ptr)); - aten_inputs = { - at_t0, - at_t2, - at_t4, - at_t6, - at_t8 - }; + aten_inputs = {at_t0, at_t2, at_t4, at_t6, at_t8}; auto cg_outputs = fec->runFusionWithInputs(aten_inputs); return fec->getMostRecentKernelRuntime(); @@ -301,7 +295,8 @@ void ShapeInferenceBenchmark_FusedGraph0_1SegmentBase( std::unique_ptr fec; std::vector aten_inputs; - auto runtime = getFusedGraph0_1SegmentRuntime(std::move(fusion_ptr), fec, aten_inputs); + auto runtime = + getFusedGraph0_1SegmentRuntime(std::move(fusion_ptr), fec, aten_inputs); fec->profile(true); fec->disableKernelLaunch(); @@ -323,17 +318,17 @@ auto getFusedGraph1_2SegmentsRuntime( std::vector& aten_inputs) { Fusion& fusion = *fusion_ptr.get(); - auto t0 = makeSymbolicTensor(3,DataType::Float); - auto t1 = makeSymbolicTensor(3,DataType::Half); - auto t3 = makeSymbolicTensor(3,DataType::Half); - auto t5 = makeSymbolicTensor(3,DataType::Half); - auto t7 = makeSymbolicTensor(1,DataType::Half); - auto t11 = makeSymbolicTensor(3,DataType::Half); - auto t13 = makeSymbolicTensor(3,DataType::Half); - auto t15 = makeSymbolicTensor(3,DataType::Half); - auto t17 = makeSymbolicTensor(3,DataType::Half); + auto t0 = makeSymbolicTensor(3, DataType::Float); + auto t1 = makeSymbolicTensor(3, DataType::Half); + auto t3 = makeSymbolicTensor(3, DataType::Half); + auto t5 = makeSymbolicTensor(3, DataType::Half); + auto t7 = makeSymbolicTensor(1, DataType::Half); + auto t11 = makeSymbolicTensor(3, DataType::Half); + auto t13 = makeSymbolicTensor(3, DataType::Half); + auto t15 = makeSymbolicTensor(3, DataType::Half); + auto t17 = makeSymbolicTensor(3, DataType::Half); auto d56 = new Double(); - + fusion.addInput(t0); fusion.addInput(t1); fusion.addInput(t3); @@ -345,45 +340,44 @@ auto getFusedGraph1_2SegmentsRuntime( fusion.addInput(t17); fusion.addInput(d56); - auto t2 = castOp(DataType::Float, t1); auto t4 = castOp(DataType::Float, t3); - auto t22 = sub(t2,t4); + auto t22 = sub(t2, t4); auto t6 = castOp(DataType::Float, t5); - auto t23 = mul(t22,t6); + auto t23 = mul(t22, t6); auto t16 = castOp(DataType::Float, t15); auto t18 = castOp(DataType::Float, t17); auto t19 = add(t16, t18); auto t14 = castOp(DataType::Float, t13); - auto t20 = add(t19,t14); + auto t20 = add(t19, t14); auto t12 = castOp(DataType::Float, t11); - auto t21 = add(t20,t12); + auto t21 = add(t20, t12); auto t8 = castOp(DataType::Float, t7); - auto t24 = broadcast(t8,{true,true,false}); - auto t25 = mul(t21,t24); - auto t27 = sum(t25,{2}); - auto t28 = broadcast(t27,{false,false,true}); - auto t29 = mul(t25,t23); + auto t24 = broadcast(t8, {true, true, false}); + auto t25 = mul(t21, t24); + auto t27 = sum(t25, {2}); + auto t28 = broadcast(t27, {false, false, true}); + auto t29 = mul(t25, t23); auto t30 = sum(t29, {2}); - auto t31 = broadcast(t30, {false,false,true}); - auto d59 = mul(t1->getRootDomain()[2]->extent(),new Double(1)); - auto t26 = mul(d59,t25); - auto t33 = sub(t26,t28); + auto t31 = broadcast(t30, {false, false, true}); + auto d59 = mul(t1->getRootDomain()[2]->extent(), new Double(1)); + auto t26 = mul(d59, t25); + auto t33 = sub(t26, t28); auto d70 = unaryOp(UnaryOpType::Reciprocal, d59); - auto t35 = mul(d70,t6); - auto t39 = sum(t21,{0,1}); + auto t35 = mul(d70, t6); + auto t39 = sum(t21, {0, 1}); auto t47 = castOp(DataType::Half, t39); - auto t37 = mul(t21,t23); - auto t38 = sum(t37,{0,1}); + auto t37 = mul(t21, t23); + auto t38 = sum(t37, {0, 1}); auto t46 = castOp(DataType::Half, t38); - auto t32 = mul(t23,t31); - auto t34 = sub(t33,t32); - auto t36 = mul(t35,t34); + auto t32 = mul(t23, t31); + auto t34 = sub(t33, t32); + auto t36 = mul(t35, t34); auto t45 = castOp(DataType::Half, t36); - auto t40 = mul(t36,t0); - auto t41 = mul(t40,d56); + auto t40 = mul(t36, t0); + auto t41 = mul(t40, d56); auto t44 = castOp(DataType::Half, t41); - auto t42 = sum(t41,{0,1}); + auto t42 = sum(t41, {0, 1}); auto t43 = castOp(DataType::Half, t42); fusion.addOutput(t43); @@ -393,31 +387,31 @@ auto getFusedGraph1_2SegmentsRuntime( fusion.addOutput(t47); auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - auto options_float = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::randn({128, 64, 1024},options_float); - at::Tensor at_t1 = at::randn({128, 64, 1024},options_half); - at::Tensor at_t3 = at::randn({128, 64, 1024},options_half); - at::Tensor at_t5 = at::randn({128, 64, 1024},options_half); - at::Tensor at_t7 = at::randn({1024},options_half); - at::Tensor at_t11 = at::randn({128, 64, 1024},options_half); - at::Tensor at_t13 = at::randn({128, 64, 1024},options_half); - at::Tensor at_t15 = at::randn({128, 64, 1024},options_half); - at::Tensor at_t17 = at::randn({128, 64, 1024},options_half); + auto options_float = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float); + at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half); + at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half); + at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half); + at::Tensor at_t7 = at::randn({1024}, options_half); + at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half); + at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half); + at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half); + at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half); double at_d56 = 1.1111; fec = std::make_unique(std::move(fusion_ptr)); aten_inputs = { - at_t0, - at_t1, - at_t3, - at_t5, - at_t7, - at_t11, - at_t13, - at_t15, - at_t17, - at_d56 - }; + at_t0, + at_t1, + at_t3, + at_t5, + at_t7, + at_t11, + at_t13, + at_t15, + at_t17, + at_d56}; auto cg_outputs = fec->runFusionWithInputs(aten_inputs); return fec->getMostRecentKernelRuntime(); @@ -433,7 +427,8 @@ void ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase( std::unique_ptr fec; std::vector aten_inputs; - auto runtime = getFusedGraph1_2SegmentsRuntime(std::move(fusion_ptr), fec, aten_inputs); + auto runtime = + getFusedGraph1_2SegmentsRuntime(std::move(fusion_ptr), fec, aten_inputs); fec->profile(true); fec->disableKernelLaunch(); @@ -454,7 +449,8 @@ static void ShapeInferenceBenchmark_FusedGraph1_2Segments( ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, true); } -static void ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline( +static void +ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline( benchmark::State& benchmark_state) { ShapeInferenceBenchmark_FusedGraph1_2SegmentsBase(benchmark_state, false); } @@ -464,7 +460,8 @@ static void ShapeInferenceBenchmark_FusedGraph0_1Segment( ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, true); } -static void ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline( +static void +ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline( benchmark::State& benchmark_state) { ShapeInferenceBenchmark_FusedGraph0_1SegmentBase(benchmark_state, false); } @@ -476,9 +473,12 @@ BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline) BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond); -BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment)->Unit(benchmark::kMicrosecond); +BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment) + ->Unit(benchmark::kMicrosecond); BENCHMARK(ShapeInferenceBenchmark_FusedGraph0_1Segment_NoShapeInferenceBaseline) ->Unit(benchmark::kMicrosecond); -BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments)->Unit(benchmark::kMicrosecond); -BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline) +BENCHMARK(ShapeInferenceBenchmark_FusedGraph1_2Segments) + ->Unit(benchmark::kMicrosecond); +BENCHMARK( + ShapeInferenceBenchmark_FusedGraph1_2Segments_NoShapeInferenceBaseline) ->Unit(benchmark::kMicrosecond);