
Optimization of fast-reid inference using tensorrt #4243

Open
smrutiranjanmohapatra opened this issue Nov 11, 2024 · 3 comments
Labels
triaged Issue has been triaged by maintainers

Comments

@smrutiranjanmohapatra

Reading engine from file /content/engine/yolo11x_fp16.engine
Total Inference Time : 17.17
Total Frame processed : 750
Average Inference FPS : 43.69
Total Feature Time : 75.53
Average feature FPS : 9.93
Total Tracking Time : 14.76
Average Tracking FPS : 50.83

How can I reduce the inference time of the fast-reid model using a TensorRT engine? I have given the code below. I need help.

def load_and_inference_fastreid(fastreid_batch_images, engine, fastreid_inputs: np.ndarray, fastreid_outputs: np.ndarray, bindings, stream):

    # Load the TensorRT engine
    if not os.path.exists(fastreid_engine_path):
        raise FileNotFoundError(f"Engine file {fastreid_engine_path} not found. Please ensure the path is correct.")

    with trt.Runtime(TRT_LOGGER) as runtime, open(fastreid_engine_path, "rb") as f:
        fastreid_engine_data = f.read()
        engine = runtime.deserialize_cuda_engine(fastreid_engine_data)

    fastreid_img = cv2.resize(frame, (256, 128), interpolation=cv2.INTER_LINEAR)
    fastreid_input_img = preprocess_image(fastreid_img)
    fastreid_batch_images = np.concatenate([fastreid_input_img], axis=0)

    fastreid_inputs, fastreid_outputs, bindings, stream = allocate_buffers(engine, fastreid_output_shape, profile_idx=0)

    # Create execution context
    context = engine.create_execution_context()

    # List to hold extracted features for each image
    extracted_features = []

    # Loop over each image in the batch and process it sequentially
    for image in fastreid_batch_images:
        # Set the image as input to the model
        fastreid_inputs[0].host = image

        # Set the input shape for the context (batch size of 1)
        context.set_input_shape('input', image.shape)

        # Perform inference on the single image
        features = do_inference(context, engine, bindings, fastreid_inputs, fastreid_outputs, stream)

        # Flatten the features and append to the list
        extracted_features.append(np.array(features[0]).flatten())

    fastreid_inputs[0].host = fastreid_batch_images
    context.set_input_shape('input', fastreid_batch_images.shape)

    # Perform feature extraction
    # extracted_features = get_feature_fastreid(engine, context, fastreid_inputs, fastreid_outputs, bindings, stream)
    extracted_features = do_inference(context, engine, bindings, fastreid_inputs, fastreid_outputs, stream)

    # Print type and structure of extracted_features
    # print(f"Extracted features type: {type(extracted_features)}")
    # print(f"Extracted features content: {extracted_features}")

    # Reshape the extracted features to a 2D array (1, 2048)
    extracted_features = np.array(extracted_features[0]).flatten()
    # print(f"Reshaped extracted features shape: {extracted_features.shape}")

    return extracted_features

def _do_inference_base(inputs, outputs, stream, execute_async_func):
    # Transfer input data to the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]

    # Run inference.
    execute_async_func()

    # Transfer predictions back from the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]

    # Synchronize the stream.
    cuda_call(cudart.cudaStreamSynchronize(stream))

    # Return only the host outputs.
    return [out.host for out in outputs]

def do_inference(context, engine, bindings, inputs, outputs, stream):
    def execute_async_func():
        context.execute_async_v3(stream_handle=stream)

    # Set up the context tensor addresses.
    num_io = engine.num_io_tensors
    for i in range(num_io):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
    return _do_inference_base(inputs, outputs, stream, execute_async_func)

# Define target classes
target_classes = ['car', 'bus', 'truck', 'motorcycle']

def main():

    # Input
    video_file_name = '/content/30.mp4'
    video_path = os.path.join(current_directory, 'data', video_file_name)
    cap = cv2.VideoCapture(video_path)  # Load video

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    original_size = (frame_width, frame_height)
    # original_size = (1200, 800)
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Model and engine paths
    model_name = "yolo11x"
    precision = "fp16"  # int8 or fp32 or fp16
    # quantization_method = ''
    onnx_model_path = os.path.join(current_directory, 'onnx', f'{model_name}_{device.type}.onnx')
    engine_file_path = os.path.join(current_directory, 'engine', f'{model_name}_{precision}.engine')

    fastreid_onnx_path = "/content/fastreid_model.onnx"
    fastreid_engine_path = "/content/fastreid_model.trt"
    os.makedirs(os.path.dirname(engine_file_path), exist_ok=True)

    # Output shapes expected
    output_shapes = [(1, 84, 8400)]
    output_shape_ndarray = np.array(output_shapes[0], dtype=np.int32)

    # fastreid output shapes expected
    # fastreid_output_shapes = (1, 256)  # (1, 2048)
    fastreid_output_shapes = (1, 2048)
    fastreid_output_shape_ndarray = np.array(fastreid_output_shapes, dtype=np.int32)
    # print(fastreid_output_shape_ndarray)

    # Load or build the YOLO TensorRT engine and do inference
    with get_engine(onnx_model_path, engine_file_path, precision) as engine, engine.create_execution_context() as context:

        # inputs, outputs, bindings, stream = allocate_buffers(engine, output_shapes[0], profile_idx=0)
        inputs, outputs, bindings, stream = allocate_buffers(engine, output_shape_ndarray, profile_idx=0)

        # Video writer to save the output
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter('output_video.mp4', fourcc, fps, original_size)

        # Variable to store the start time of processing for each frame
        start_time = time.time()
        total_inference_time = 0
        total_feature_time = 0
        total_tracking_time = 0
        frame_count = 0

        # Load FastReID engine once at the beginning
        fastreid_engine = load_fastreid_engine(fastreid_engine_path)
        fastreid_inputs, fastreid_outputs, fastreid_bindings, fastreid_stream = allocate_buffers(fastreid_engine, fastreid_output_shape_ndarray, profile_idx=0)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            original_frame = frame.copy()
            original_frame = cv2.resize(original_frame, original_size)

            # Input setup for YOLO TensorRT
            img = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_LINEAR)
            input_image = preprocess_image(img)  # Preprocess image
            batch_images = np.concatenate([input_image], axis=0)

            inputs[0].host = batch_images
            context.set_input_shape('input', batch_images.shape)

            # # Input setup for FastReID TensorRT
            # fastreid_img = cv2.resize(frame, (256, 128), interpolation=cv2.INTER_LINEAR)
            # fastreid_input_img = preprocess_image(fastreid_img)
            # fastreid_batch_images = np.concatenate([fastreid_input_img], axis=0)

            # Inference
            start_inference_time = time.time()
            trt_outputs = do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
            torch.cuda.synchronize()

            inference_time = time.time() - start_inference_time
            total_inference_time += inference_time

            # Reshape and post-process the output
            t_outputs = np.array(trt_outputs).reshape(output_shapes[0])
            results = non_max_suppression(t_outputs, original_size)

            # Filter results to include only target classes
            filtered_results = []
            for img_idx, (boxes, scores, classes) in enumerate(results):
                filtered_boxes = []
                filtered_scores = []
                filtered_classes = []
                for box, score, classe in zip(boxes, scores, classes):
                    if coco_labels[classe] in target_classes:
                        filtered_boxes.append(box)
                        filtered_scores.append(score)
                        filtered_classes.append(classe)
                filtered_results.append((filtered_boxes, filtered_scores, filtered_classes))

            # Initialize lists for DeepSORT
            results_deepsort = []
            embeddings = []
            fastreid_batch_images = []
            appearance_features_batch = []

            # Loop over filtered results to extract appearance features and prepare for tracking
            for img_idx, (boxes, scores, classes) in enumerate(filtered_results):
                for box, score, classe in zip(boxes, scores, classes):
                    # Get coordinates of the bounding box
                    xmin, ymin, xmax, ymax = map(int, box)

                    # Ensure coordinates are within image boundaries
                    height, width = frame.shape[:2]
                    ymin = max(0, ymin)
                    ymax = min(height, ymax)
                    xmin = max(0, xmin)
                    xmax = min(width, xmax)

                    # Calculate the center of the bounding box
                    center_x = (xmin + xmax) // 2
                    center_y = (ymin + ymax) // 2

                    # Line crossing condition: check if center_y lies between the start and end y-coordinates of the line
                    if startTrackingPoints[0][1] <= center_y <= endTrackingPoints[0][1]:
                        # print(f"ymin,ymax,xmin,xmax :{ymin,ymax,xmin,xmax}")

                        # Crop the image from the frame according to the bounding box
                        cropped_image = frame[ymin:ymax, xmin:xmax]

                        # Preprocess cropped image for FastReID
                        fastreid_input_img = preprocess_image(cv2.resize(cropped_image, (256, 128), interpolation=cv2.INTER_LINEAR))
                        # fastreid_batch_images = np.concatenate([fastreid_input_img], axis=0)
                        fastreid_batch_images.append(fastreid_input_img)

                        # Prepare the detection data (bounding box + confidence + class name)
                        results_deepsort.append([[xmin, ymin, xmax - xmin, ymax - ymin], score, coco_labels[classe]])

            # Get the appearance features using FastReID
            start_feature_time = time.time()
            # appearance_features = get_appearance_features(original_frame, [xmin, ymin, xmax, ymax])
            if len(fastreid_batch_images) > 0:
                appearance_features_batch = load_and_inference_fastreid(fastreid_batch_images, fastreid_engine, fastreid_inputs, fastreid_outputs, fastreid_bindings, fastreid_stream)

            feature_time = time.time() - start_feature_time
            total_feature_time += feature_time
            # embeddings.append(appearance_features)

            if len(appearance_features_batch) > 0:
                # Each 'feature' corresponds to an appearance feature for a single object in the batch
                for idx, feature in enumerate(appearance_features_batch):
                    embeddings.append(feature)

            start_tracking_time = time.time()

            # Perform DeepSORT tracking
            tracked_objects = tracker.update_tracks(results_deepsort, embeds=embeddings, frame=original_frame)
            # tracked_objects = tracker.update_tracks(results_deepsort, frame=original_frame)

            tracking_time = time.time() - start_tracking_time
            total_tracking_time += tracking_time

            # Draw bounding boxes and track IDs
            for track in tracked_objects:
                if not track.is_confirmed():
                    continue

                track_id = track.track_id
                bbox = track.to_tlbr()  # Get top-left to bottom-right coordinates of the bounding box

                # Draw the bounding box
                x1, y1, x2, y2 = map(int, bbox)
                cv2.rectangle(original_frame, (x1, y1), (x2, y2), (0, 0, 0), 5)

                # Label the object with its track ID
                cv2.putText(original_frame, f'ID: {track_id}', (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 3)

            # # Draw detection bounding boxes on the frame
            # for img_idx, (boxes, scores, classes) in enumerate(filtered_results):
            #     labels = [coco_labels[classe] for classe in classes]
            #     output_frame = draw_bounding_boxes(original_frame, boxes, labels, scores)

            # Calculate FPS for the current frame
            end_time = time.time()
            FPS = 1 / (end_time - start_time)
            start_time = end_time

            # Display FPS on the frame
            cv2.putText(original_frame, f"FPS: {FPS:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 3)

            frame_count += 1
            # Write the frame with detections to the output video
            out.write(original_frame)
            # cv2.imshow("Frame", original_frame)
            # Exit the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        cap.release()
        # out.release()
        cv2.destroyAllWindows()

        # Calculate average inference FPS
        avg_inference_fps = frame_count / total_inference_time
        print(f"Total Inference Time : {total_inference_time: .2f}")
        print(f"Total Frame processed : {frame_count}")
        print(f"Average Inference FPS : {avg_inference_fps: .2f}")

        # Calculate average feature FPS
        avg_feature_fps = frame_count / total_feature_time
        print(f"Total Feature Time : {total_feature_time: .2f}")
        print(f"Average feature FPS : {avg_feature_fps: .2f}")

        # Calculate average tracking FPS
        avg_tracking_fps = frame_count / total_tracking_time
        print(f"Total Tracking Time : {total_tracking_time: .2f}")
        print(f"Average Tracking FPS : {avg_tracking_fps: .2f}")
@lix19937

Try using PTQ (post-training quantization) and CUDA Graphs.
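For the CUDA Graphs part, here is a minimal sketch. It assumes the cuda-python bindings for CUDA 12 (the graph APIs differ slightly across versions) and reuses the context, stream, and cuda_call helper from the code above:

from cuda import cudart

# Warm up once outside capture so TensorRT finishes lazy initialization.
context.execute_async_v3(stream_handle=stream)
cuda_call(cudart.cudaStreamSynchronize(stream))

# Capture a single enqueue into a CUDA graph.
cuda_call(cudart.cudaStreamBeginCapture(
    stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal))
context.execute_async_v3(stream_handle=stream)
graph = cuda_call(cudart.cudaStreamEndCapture(stream))
graph_exec = cuda_call(cudart.cudaGraphInstantiate(graph, 0))

# Per frame: copy new inputs into the same device buffers that were
# bound during capture, then replay the graph instead of re-enqueueing.
cuda_call(cudart.cudaGraphLaunch(graph_exec, stream))
cuda_call(cudart.cudaStreamSynchronize(stream))

Note that graph replay requires fixed input shapes and fixed tensor addresses, so it pairs best with padding every FastReID batch to a constant size.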

@smrutiranjanmohapatra
Author

I want help with running batch inference with TensorRT in the load_and_inference_fastreid function. I am storing the detected objects in a batch and sending them for feature extraction, where I currently use a for loop over individual objects. Instead, I want to run inference on the whole batch. How can I do that?

@poweiw
Collaborator

poweiw commented Nov 18, 2024

First, make sure the FastReID model you've exported supports dynamic batch sizes; see the quick check sketched below.
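A sketch of that check against the TensorRT Python API, assuming 'input' is the input tensor name as elsewhere in this thread:

import tensorrt as trt

def has_dynamic_batch(engine, name='input'):
    # A -1 in the engine's tensor shape marks a dynamic dimension.
    shape = engine.get_tensor_shape(name)  # e.g. (-1, 3, 256, 128)
    # The optimization profile bounds the batch sizes you may request.
    mn, opt, mx = engine.get_tensor_profile_shape(name, 0)
    print(f"shape={shape}, min={mn}, opt={opt}, max={mx}")
    return shape[0] == -1

If the batch dimension is dynamic, then in this function: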

 # Loop over each image in the batch and process it sequentially
for image in fastreid_batch_images:
    # Set the image as input to the model
    fastreid_inputs[0].host = image

    # Set the input shape for the context (batch size of 1)
    context.set_input_shape('input', image.shape)

    # Perform inference on the single image
    features = do_inference(context, engine, bindings, fastreid_inputs, fastreid_outputs, stream)

    # Flatten the features and append to the list
    extracted_features.append(np.array(features[0]).flatten())

Split the images in fastreid_batch_images into batches along the batch axis (axis 0 under most circumstances) and pass each batch to do_inference, as sketched below.
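For example, a hypothetical helper along these lines. It assumes the buffers returned by allocate_buffers are sized for the engine's maximum profile shape and that the model emits one 2048-dim embedding per image:

import numpy as np

def infer_in_batches(context, engine, bindings, inputs, outputs, stream,
                     images, max_batch=16, feat_dim=2048):
    # `images` is the list of (1, 3, 256, 128) preprocessed crops that
    # load_and_inference_fastreid currently loops over one by one.
    features = []
    for start in range(0, len(images), max_batch):
        chunk = np.ascontiguousarray(
            np.concatenate(images[start:start + max_batch], axis=0))
        n = chunk.shape[0]
        inputs[0].host = chunk                          # stage the whole chunk
        context.set_input_shape('input', chunk.shape)   # (n, 3, 256, 128)
        out = do_inference(context, engine, bindings, inputs, outputs, stream)
        # The host output buffer is sized for the max profile shape,
        # so keep only the first n embeddings.
        feats = np.asarray(out[0])[: n * feat_dim].reshape(n, feat_dim)
        features.extend(feats)
    return features

This amortizes the per-launch overhead (memcpys, enqueue, stream synchronization) over a whole chunk instead of paying it once per crop, which is likely where much of your per-frame feature time is going.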

Also, please increase readability by using code blocks.

@poweiw added the triaged (Issue has been triaged by maintainers) label on Nov 18, 2024