@@ -37,7 +37,7 @@ void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t
   std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
   float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
   float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());
-
+
   std::vector<float> fps_diff(runtimes.size());
   std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
   float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
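The hunk above computes the runtime standard deviation with the transform-then-inner_product idiom. For readers skimming the diff, here is a self-contained sketch of the same statistics; the function name `print_stats` and the use of `std::accumulate` for the mean are illustrative, not from this file.

```cpp
// Mean and population standard deviation of per-iteration runtimes,
// computed the same way as in the hunk above. print_stats is hypothetical.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

void print_stats(const std::vector<float>& runtimes) {
  float avg = std::accumulate(runtimes.begin(), runtimes.end(), 0.f) / runtimes.size();
  std::vector<float> diff(runtimes.size());
  std::transform(runtimes.begin(), runtimes.end(), diff.begin(),
                 [avg](float x) { return x - avg; });
  // Sum of squared deviations, then the population standard deviation.
  float sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.f);
  float std_dev = std::sqrt(sq_sum / runtimes.size());
  std::cout << "avg: " << avg << " ms  std dev: " << std_dev << " ms\n";
}
```

Note that the original passes `0.0` (a double) as the initial value to `std::inner_product`, so the squared sum accumulates in double before the assignment narrows it back to float; passing `0.f` as above keeps everything in float.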
@@ -62,7 +62,7 @@ std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector
     cudaDeviceSynchronize();

   }
-
+
   for (uint64_t i = 0; i < NUM_RUNS; i++) {
     std::vector<torch::jit::IValue> inputs_ivalues;
     auto in = at::rand(shape, {at::kCUDA});
@@ -71,7 +71,7 @@ std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector
 #endif
     inputs_ivalues.push_back(in.clone());
     cudaDeviceSynchronize();
-
+
     execution_timer.start();
     mod.forward(inputs_ivalues);
     cudaDeviceSynchronize();
@@ -80,7 +80,7 @@ std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector
     auto time = execution_timer.milliseconds();
     execution_timer.reset();
     execution_runtimes.push_back(time);
-
+
     c10::cuda::CUDACachingAllocator::emptyCache();
   }
   return execution_runtimes;
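`benchmark_module` times each forward pass between two `cudaDeviceSynchronize()` calls: the first drains previously queued GPU work, the second makes the timer observe the kernels' completion rather than just their launch. A reduced sketch of that pattern, substituting `std::chrono` for the file's `execution_timer` (whose implementation is not part of this diff):

```cpp
// Host-side timing of asynchronous CUDA work: synchronize before starting
// the clock and again before stopping it, otherwise only the launch
// overhead is measured. time_forward_ms is an illustrative name.
#include <chrono>
#include <vector>
#include <cuda_runtime.h>
#include <torch/script.h>

float time_forward_ms(torch::jit::Module& mod, std::vector<torch::jit::IValue> inputs) {
  cudaDeviceSynchronize();  // drain any previously queued GPU work
  auto start = std::chrono::steady_clock::now();
  mod.forward(inputs);      // enqueues kernels; returns before they finish
  cudaDeviceSynchronize();  // block until the forward pass is done
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<float, std::milli>(end - start).count();
}
```

The `c10::cuda::CUDACachingAllocator::emptyCache()` call at the end of each iteration presumably keeps the allocator's cached blocks from varying between runs, at the cost of making the loop itself slower, which is acceptable in a benchmark harness.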
@@ -91,9 +91,9 @@ int main(int argc, const char* argv[]) {
     std::cerr << "usage: benchmark <path-to-exported-script-module> <input-size>\n" << std::endl;
     return -1;
   }
-
-
-  torch::jit::script::Module mod;
+
+
+  torch::jit::Module mod;
   try {
     // Deserialize the ScriptModule from a file using torch::jit::load().
     mod = torch::jit::load(argv[1]);
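This hunk carries the one substantive change in the commit: the module is declared as `torch::jit::Module` rather than `torch::jit::script::Module`. Recent libtorch releases moved `Module` out of the `script` namespace, and the old qualified name appears to be retained as a compatibility alias, which would explain why `benchmark_module` (see the hunk headers above) can still take a `torch::jit::script::Module&` unchanged. A minimal sketch of the loading pattern used here:

```cpp
// Deserialize a TorchScript module, as main() does above. torch::jit::load
// throws c10::Error if the file is missing or malformed, so the benchmark
// wraps the call in try/catch. load_or_die is a hypothetical name.
#include <cstdlib>
#include <iostream>
#include <torch/script.h>

torch::jit::Module load_or_die(const char* path) {
  try {
    return torch::jit::load(path);  // was torch::jit::script::Module before the rename
  } catch (const c10::Error& e) {
    std::cerr << "error loading the model\n";
    std::exit(-1);
  }
}
```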
@@ -104,16 +104,16 @@ int main(int argc, const char* argv[]) {
   }

   mod.to(at::kCUDA);
-
+
 #ifdef HALF
   mod.to(torch::kHalf);
   for (auto layer : mod.named_modules()) {
     if (layer.name.find(".bn") != std::string::npos) {
       layer.value.to(torch::kFloat);
     }
   }
-#endif
-
+#endif
+
   std::vector<std::vector<int64_t>> dims;
   for (int i = 2; i < argc; i++) {
     auto arg = std::string(argv[i]);
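Under the HALF build flag the whole module is cast to fp16 and then any submodule whose name contains ".bn" is cast back to fp32; batch-norm statistics are fragile in half precision, and the substring match is tied to this particular model's layer-naming convention. The same pattern as a standalone helper (hypothetical name, same naming assumption):

```cpp
// Cast a module to half precision but keep batch-norm submodules in fp32.
// Matching on ".bn" mirrors the benchmark above and assumes the model names
// its batch-norm layers that way; to_half_keep_bn_fp32 is a hypothetical name.
#include <string>
#include <torch/script.h>

void to_half_keep_bn_fp32(torch::jit::Module& mod) {
  mod.to(torch::kHalf);
  for (auto layer : mod.named_modules()) {
    if (layer.name.find(".bn") != std::string::npos) {
      layer.value.to(torch::kFloat);  // BN statistics stay in float
    }
  }
}
```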
@@ -128,7 +128,7 @@ int main(int argc, const char* argv[]) {
   }

   at::globalContext().setBenchmarkCuDNN(true);
-
+
 #ifdef JIT
   auto jit_runtimes = benchmark_module(mod, dims[0]);
   print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
@@ -140,11 +140,11 @@ int main(int argc, const char* argv[]) {
 #ifdef HALF
   extra_info.op_precision = at::kHalf;
 #endif
-
+
   auto trt_mod = trtorch::CompileGraph(mod, extra_info);
   auto trt_runtimes = benchmark_module(trt_mod, dims[0]);
   print_avg_std_dev("JIT/TRT", trt_runtimes, dims[0][0]);
 #endif
-
+
   std::cout << "ok\n";
 }
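For reference, the TRT path above compiles the TorchScript module with TRTorch before benchmarking it through the same `benchmark_module` path as the uncompiled module. The construction of `extra_info` itself lies outside this diff; the sketch below follows the TRTorch 0.x API as I understand it (an `ExtraInfo` built from a list of fixed input shapes), so treat it as an assumption to be checked against the version in use.

```cpp
// Hypothetical helper around the TRT path above. Only op_precision and
// CompileGraph appear in the diff; the ExtraInfo constructor taking fixed
// input shapes is an assumption about the TRTorch 0.x API.
#include <vector>
#include <torch/script.h>
#include <trtorch/trtorch.h>

torch::jit::Module compile_for_trt(torch::jit::Module& mod,
                                   std::vector<int64_t> shape,
                                   bool half) {
  trtorch::ExtraInfo extra_info({shape});  // one static input range
  if (half) {
    extra_info.op_precision = at::kHalf;   // matches the #ifdef HALF branch
  }
  return trtorch::CompileGraph(mod, extra_info);
}
```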