from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
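
# Benchmarks a model across three backends (vanilla Torch/TorchScript,
# Torch-TensorRT, and raw TensorRT built from an ONNX file), timing
# `iterations` runs per backend after a fixed 20-iteration warm-up and
# collecting per-backend latency/FPS statistics into a pandas summary table.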
import argparse
import timeit
import numpy as np
import torch.backends.cudnn as cudnn
import yaml
import os
import pandas as pd

# Backend
import torch
import torch_tensorrt as torchtrt
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda


TRT_LOGGER = trt.Logger()
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

results = []

def run_torch(model, input_tensors, params, precision):
    print("Running Torch for precision: ", precision)

    iters = params.get('iterations', 20)

    # Warm up
    with torch.no_grad():
        for _ in range(20):
            features = model(*input_tensors)

    torch.cuda.synchronize()

    timings = []
    with torch.no_grad():
        for i in range(iters):
            start_time = timeit.default_timer()
            features = model(*input_tensors)
            torch.cuda.synchronize()
            end_time = timeit.default_timer()
            meas_time = end_time - start_time
            timings.append(meas_time)
            print("Iteration {}: {:.6f} s".format(i, meas_time))

    printStats("Torch", timings, precision)

def onnx_to_trt_engine(onnx_model, precision):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(EXPLICIT_BATCH) as network, \
         builder.create_builder_config() as config, \
         trt.OnnxParser(network, TRT_LOGGER) as parser, \
         trt.Runtime(TRT_LOGGER) as runtime:
        config.max_workspace_size = 1 << 28  # 256 MiB
        builder.max_batch_size = 1

        # Parse the serialized ONNX model into the TensorRT network. Without
        # this the network is empty and the build fails.
        if not parser.parse(onnx_model):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

        if precision == 'int8':
            config.set_flag(trt.BuilderFlag.INT8)
        elif precision == 'fp16' or precision == 'half':
            config.set_flag(trt.BuilderFlag.HALF)

        plan = builder.build_serialized_network(network, config)
        engine = runtime.deserialize_cuda_engine(plan)
        return engine
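
# Example usage (illustrative; the path and precision are placeholders):
#   with open("models/model.onnx", "rb") as f:
#       engine = onnx_to_trt_engine(f.read(), "fp16")
#   context = engine.create_execution_context()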

def run_torch_tensorrt(model, input_tensors, params, precision):
    print("Running Torch-TensorRT")

    # Compile the Torch-TensorRT module
    compile_settings = {
        "inputs": input_tensors,
        "enabled_precisions": {precision_to_dtype(precision)}
    }

    model = torchtrt.compile(model, **compile_settings)

    iters = params.get('iterations', 20)
    # Warm up
    with torch.no_grad():
        for _ in range(20):
            features = model(*input_tensors)

    torch.cuda.synchronize()

    timings = []
    with torch.no_grad():
        for i in range(iters):
            start_time = timeit.default_timer()
            features = model(*input_tensors)
            torch.cuda.synchronize()
            end_time = timeit.default_timer()
            meas_time = end_time - start_time
            timings.append(meas_time)
            print("Iteration {}: {:.6f} s".format(i, meas_time))

    printStats("Torch-TensorRT", timings, precision)

def run_tensorrt(model, input_tensors, params, precision):
    print("Running TensorRT")
    bindings = []
    stream = cuda.Stream()
    iters = params.get('iterations', 20)
    batch_size = params.get('batch_size', 1)

    with onnx_to_trt_engine(model, precision) as engine, engine.create_execution_context() as context:
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate a page-locked host buffer and a matching device buffer
            # for each binding; only the device pointers go into `bindings`.
            # No real input data is copied in: the loop below times raw engine
            # execution, not host-to-device transfers.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))

        # Warm up
        for _ in range(20):
            context.execute_async(batch_size, bindings, stream.handle)

        stream.synchronize()

        timings = []
        for i in range(iters):
            start_time = timeit.default_timer()
            context.execute_async(batch_size, bindings, stream.handle)
            stream.synchronize()
            end_time = timeit.default_timer()
            meas_time = end_time - start_time
            timings.append(meas_time)
            print("Iteration {}: {:.6f} s".format(i, meas_time))

    printStats("TensorRT", timings, precision)

def run(model, input_tensors, params, precision):
    for backend in params['backend']:
        if backend == 'all':
            run_torch(model, input_tensors, params, precision)
            run_torch_tensorrt(model, input_tensors, params, precision)
            run_tensorrt(model, input_tensors, params, precision)

        elif backend == "torch":
            run_torch(model, input_tensors, params, precision)

        elif backend == "torch_tensorrt":
            run_torch_tensorrt(model, input_tensors, params, precision)

        elif backend == "tensorrt":
            run_tensorrt(model, input_tensors, params, precision)


def printStats(backend, timings, precision, batch_size=1):
    times = np.array(timings)
    steps = len(times)
    speeds = batch_size / times
    time_mean = np.mean(times)
    time_med = np.median(times)
    time_99th = np.percentile(times, 99)
    time_std = np.std(times, ddof=0)
    speed_mean = np.mean(speeds)
    speed_med = np.median(speeds)

    msg = ("\n%s =================================\n"
           "batch size=%d, num iterations=%d\n"
           "  Median FPS: %.1f, mean: %.1f\n"
           "  Median latency: %.6f s, mean: %.6f s, 99th_p: %.6f s, std_dev: %.6f s\n"
           ) % (backend,
                batch_size, steps,
                speed_med, speed_mean,
                time_med, time_mean, time_99th, time_std)
    print(msg)
    # Timings are recorded in seconds, so the latency columns are labeled (s).
    meas = {
        'Backend': backend,
        'precision': precision,
        'Median(FPS)': speed_med,
        'Mean(FPS)': speed_mean,
        'Median-Latency(s)': time_med,
        'Mean-Latency(s)': time_mean,
        '99th_p(s)': time_99th,
        'std_dev(s)': time_std
    }
    results.append(meas)

def read_config(config_file):
    params = None
    with open(config_file, "r") as stream:
        try:
            params = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return params
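
# Illustrative configuration layout, inferred from how `params` is read in
# this script; the values are placeholders, so treat this as a sketch:
#
#   backend: ['all']            # or any of: torch, torch_tensorrt, tensorrt
#   model:
#     filename: model.jit.pt    # TorchScript module under models/
#     onnx_file: model.onnx     # ONNX file under models/ (tensorrt backend)
#   runtime:
#     device: 0
#     precision: ['fp32', 'fp16']
#   input:
#     num_of_input: 1
#     input0: [1, 3, 224, 224]  # shape of each input tensor
#   iterations: 20
#   batch_size: 1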

def precision_to_dtype(pr):
    if pr == 'fp32':
        return torch.float
    elif pr == 'fp16' or pr == 'half':
        return torch.half
    else:
        return torch.int8

def load_model(params):
    model = None
    # Load the traced TorchScript model
    if "torch" in params['backend'] or "torch_tensorrt" in params['backend']:
        model_path = os.path.join("models", params['model']['filename'])
        model = torch.jit.load(model_path).cuda()

    # For the raw TensorRT backend, return the serialized ONNX bytes instead
    elif "tensorrt" in params['backend']:
        onnx_model_file = os.path.join("models", params['model']['onnx_file'])
        with open(onnx_model_file, 'rb') as onnx_model:
            print('Beginning ONNX file parsing')
            model = onnx_model.read()

    return model

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run inference on a model with random input values")
    parser.add_argument("--config", help="Load YAML based configuration file to run the inference. If this is used, other params will be ignored")
    args = parser.parse_args()

    # Load YAML params
    params = read_config(args.config)

    print("Loading model: ", params['model']['filename'])

    if "device" in params['runtime']:
        torch.cuda.set_device(params['runtime']['device'])

    model = load_model(params)

    cudnn.benchmark = True

    # Create random input tensors of the configured shapes
    torch.manual_seed(12345)

    num_input = params['input']['num_of_input']
    for precision in params['runtime']['precision']:
        input_tensors = []
        for i in range(num_input):
            inp_shape = params['input']['input' + str(i)]
            input_tensors.append(torch.randint(0, 2, tuple(inp_shape), dtype=precision_to_dtype(precision)).cuda())

        if precision == "fp16" or precision == "half":
            model = model.half()

        run(model, input_tensors, params, precision)

    print('Model Summary:')
    summary = pd.DataFrame(results)
    print(summary)