diff --git a/gpumat/benchmark.py b/gpumat/benchmark.py new file mode 100755 index 000000000..572ffdec5 --- /dev/null +++ b/gpumat/benchmark.py @@ -0,0 +1,432 @@ +#!/usr/bin/env python3 +import platform +import random +import statistics +import sys +import time +from dataclasses import dataclass +from typing import List, Dict, Callable, Tuple, Optional + +import numpy as np + +from savant.deepstream.opencv_utils import ( + nvds_to_gpu_mat, + alpha_comp, + draw_rect, + apply_cuda_filter, +) +from savant.deepstream.utils import nvds_frame_meta_iterator, get_nvds_buf_surface + +sys.path.append('../../') + +import cv2 +import gi + +gi.require_version('Gst', '1.0') +from gi.repository import GLib, Gst +import pyds + +scale = 10**6 # milliseconds +RECT_COLOR = (127, 127, 127, 255) # gray +RECT_N = 20 +RECT_WIDTH = 100 +RECT_HEIGHT = 100 +FACE_WIDTH = 30 +FACE_HEIGHT = 40 + + +@dataclass +class BenchmarkData: + overlay: np.ndarray + overlay_mat: cv2.cuda.GpuMat + points: List[Tuple[int, int]] + cuda_blur_filter: cv2.cuda.Filter + + +def benchmark_cpu_overlay( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with get_nvds_buf_surface(gst_buffer, nvds_frame_meta) as np_frame: + height, width, _ = data.overlay.shape + np_frame[:height, :width] = data.overlay + + +def benchmark_gpu_overlay( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with nvds_to_gpu_mat(gst_buffer, nvds_frame_meta) as frame_mat: + alpha_comp(frame_mat, data.overlay, (0, 0)) + + +def benchmark_gpu_overlay_single( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with nvds_to_gpu_mat(gst_buffer, nvds_frame_meta) as frame_mat: + alpha_comp(frame_mat, data.overlay_mat, (0, 0)) + + +def benchmark_cpu_draw_rectangles( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with get_nvds_buf_surface(gst_buffer, nvds_frame_meta) as np_frame: + for x, y in data.points: + cv2.rectangle( + np_frame, + (x, y), + (x + RECT_WIDTH, y + RECT_HEIGHT), + RECT_COLOR, + 4, + ) + + +def benchmark_gpu_draw_rectangles( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with nvds_to_gpu_mat(gst_buffer, nvds_frame_meta) as frame_mat: + for x, y in data.points: + draw_rect( + frame_mat, + (x, y, x + RECT_WIDTH, y + RECT_HEIGHT), + RECT_COLOR, + 4, + ) + + +def benchmark_cpu_blur_faces( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with get_nvds_buf_surface(gst_buffer, nvds_frame_meta) as np_frame: + for x, y in data.points: + np_frame[y : y + FACE_HEIGHT, x : x + FACE_WIDTH] = cv2.GaussianBlur( + np_frame[y : y + FACE_HEIGHT, x : x + FACE_WIDTH], + (31, 31), + 100, + 100, + ) + + +def benchmark_gpu_blur_faces( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with nvds_to_gpu_mat(gst_buffer, nvds_frame_meta) as frame_mat: + for x, y in data.points: + apply_cuda_filter( + data.cuda_blur_filter, frame_mat, (x, y, FACE_WIDTH, FACE_HEIGHT) + ) + + +def benchmark_gpu_blur_faces_in_cpu( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with nvds_to_gpu_mat(gst_buffer, nvds_frame_meta) as frame_mat: + for x, y in data.points: + roi = cv2.cuda.GpuMat(frame_mat, (x, y, FACE_WIDTH, FACE_HEIGHT)) + roi.upload( + cv2.GaussianBlur( + roi.download(), + (31, 31), + 100, + 100, + ) + ) + + +def benchmark_gpu_download_upload( + gst_buffer: Gst.Buffer, + nvds_frame_meta: pyds.NvDsFrameMeta, + data: BenchmarkData, +): + with nvds_to_gpu_mat(gst_buffer, nvds_frame_meta) as frame_mat: + for x, y in data.points: + roi = cv2.cuda.GpuMat(frame_mat, (x, y, RECT_WIDTH, RECT_HEIGHT)) + part = roi.download() + roi.upload(part) + + +BenchmarkFunc = Callable[[Gst.Buffer, pyds.NvDsFrameMeta, BenchmarkData], None] +BENCHMARK_FUNCS: Dict[str, Tuple[Optional[BenchmarkFunc], Optional[BenchmarkFunc]]] = { + 'overlay': (benchmark_cpu_overlay, benchmark_gpu_overlay), + 'overlay-single': (None, benchmark_gpu_overlay_single), + 'draw-rectangles': (benchmark_cpu_draw_rectangles, benchmark_gpu_draw_rectangles), + 'blur-faces': (benchmark_cpu_blur_faces, benchmark_gpu_blur_faces), + 'blur-faces-in-cpu': (None, benchmark_gpu_blur_faces_in_cpu), + 'download-upload': (None, benchmark_gpu_download_upload), +} + + +def pad_buffer_probe( + pad: Gst.Pad, + info: Gst.PadProbeInfo, + benchmark_func: BenchmarkFunc, + data: BenchmarkData, + measurements: List[float], +): + data.points = [ + (random.randint(0, 1900 - RECT_WIDTH), random.randint(0, 1000 - RECT_HEIGHT)) + for _ in range(RECT_N) + ] + gst_buffer: Gst.Buffer = info.get_buffer() + nvds_batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(gst_buffer)) + for nvds_frame_meta in nvds_frame_meta_iterator(nvds_batch_meta): + ts1 = time.time() + benchmark_func(gst_buffer, nvds_frame_meta, data) + ts2 = time.time() + measurements.append((ts2 - ts1) * scale) + + return Gst.PadProbeReturn.OK + + +def is_aarch64(): + return platform.uname()[4] == 'aarch64' + + +def bus_call(bus, message, loop): + t = message.type + if t == Gst.MessageType.EOS: + sys.stdout.write("End-of-stream\n") + loop.quit() + elif t == Gst.MessageType.WARNING: + err, debug = message.parse_warning() + sys.stderr.write("Warning: %s: %s\n" % (err, debug)) + elif t == Gst.MessageType.ERROR: + err, debug = message.parse_error() + sys.stderr.write("Error: %s: %s\n" % (err, debug)) + loop.quit() + return True + + +def main(args): + assert ( + len(args) > 2 + ), 'Usage: ./benchmark.py [n-frames] [output-filename]' + benchmark_name = args[1] + is_gpu = args[2] == 'gpu' + assert ( + benchmark_name in BENCHMARK_FUNCS + ), f'Available benchmark names: {", ".join(BENCHMARK_FUNCS.keys())}' + benchmark_func = BENCHMARK_FUNCS[benchmark_name][int(is_gpu)] + assert benchmark_func is not None, 'Benchmark not implemented' + + output_filename = None + if len(args) > 3: + n_frames = int(args[3]) + if len(args) > 4: + output_filename = args[4] + else: + n_frames = 1 + + Gst.init(None) + + print("Creating Pipeline") + pipeline = Gst.Pipeline() + is_live = False + + print("Creating streammux") + streammux = Gst.ElementFactory.make("nvstreammux", "streammux") + pipeline.add(streammux) + + print("Creating source") + source = Gst.ElementFactory.make("videotestsrc", "source") + pipeline.add(source) + + print("Creating source converter") + source_converter = Gst.ElementFactory.make("nvvideoconvert", "source-converter") + pipeline.add(source_converter) + + print("Creating source capsfilter") + source_capsfilter = Gst.ElementFactory.make("capsfilter", "source-capsfilter") + pipeline.add(source_capsfilter) + + print("Creating workload") + workload = Gst.ElementFactory.make("identity", "workload") + pipeline.add(workload) + + print("Creating streamdemux") + streamdemux = Gst.ElementFactory.make("nvstreamdemux", "streamdemux") + pipeline.add(streamdemux) + + print("Creating queue") + queue = Gst.ElementFactory.make("queue", "queue") + pipeline.add(queue) + + if output_filename: + print("Creating converter") + converter = Gst.ElementFactory.make("nvvideoconvert", "converter") + pipeline.add(converter) + + print("Creating sink_capsfilter") + sink_capsfilter = Gst.ElementFactory.make("capsfilter", "sink_capsfilter") + pipeline.add(sink_capsfilter) + + print("Creating encoder") + encoder = Gst.ElementFactory.make("nvv4l2h264enc", "encoder") + pipeline.add(encoder) + + print("Creating parser") + parser = Gst.ElementFactory.make("h264parse", "parser") + pipeline.add(parser) + + print("Creating sink") + sink = Gst.ElementFactory.make("filesink", "sink") + pipeline.add(sink) + else: + print("Creating sink") + sink = Gst.ElementFactory.make("fakesink", "sink") + pipeline.add(sink) + + source.set_property('num-buffers', n_frames) + + if is_live: + streammux.set_property('live-source', 1) + streammux.set_property('width', 1920) + streammux.set_property('height', 1080) + streammux.set_property('batch-size', 1) + streammux.set_property('batched-push-timeout', 4000000) + + sink.set_property("sync", 0) + sink.set_property("qos", 0) + sink.set_property("enable-last-sample", 0) + if output_filename: + sink.set_property("location", output_filename) + + if not is_aarch64(): + nv_buf_memory_type = int(pyds.NVBUF_MEM_CUDA_UNIFIED) + source_converter.set_property("nvbuf-memory-type", nv_buf_memory_type) + streammux.set_property("nvbuf-memory-type", nv_buf_memory_type) + if output_filename: + converter.set_property("nvbuf-memory-type", nv_buf_memory_type) + + source_capsfilter.set_property( + 'caps', + Gst.Caps.from_string( + 'video/x-raw(memory:NVMM), format=RGBA, width=1920, height=1080' + ), + ) + if output_filename: + sink_capsfilter.set_property( + 'caps', + Gst.Caps.from_string( + 'video/x-raw(memory:NVMM), format=RGBA, width=1920, height=1080' + ), + ) + + print("Linking elements in the Pipeline") + + assert source.link(source_converter) + assert source_converter.link(source_capsfilter) + + assert ( + source_capsfilter.get_static_pad('src').link( + streammux.get_request_pad('sink_0') + ) + == Gst.PadLinkReturn.OK + ) + + assert streammux.link(workload) + assert workload.link(streamdemux) + + streamdemux_src_pad = streamdemux.get_request_pad('src_0') + streamdemux.get_request_pad('src_1') + streamdemux.get_request_pad('src_2') + streamdemux.get_request_pad('src_3') + queue_sink_pad = queue.get_static_pad('sink') + assert streamdemux_src_pad.link(queue_sink_pad) == Gst.PadLinkReturn.OK + + if output_filename: + assert queue.link(converter) + assert converter.link(encoder) + assert encoder.link(parser) + assert parser.link(sink) + else: + assert queue.link(sink) + + # create an event loop and feed gstreamer bus messages to it + loop = GLib.MainLoop() + bus = pipeline.get_bus() + bus.add_signal_watch() + bus.connect("message", bus_call, loop) + + sink_pad = workload.get_static_pad("sink") + measurements = [] + if not sink_pad: + sys.stderr.write("Unable to get sink pad") + else: + overlay = cv2.imread('logo.png', cv2.IMREAD_UNCHANGED) + benchmark_data = BenchmarkData( + overlay=overlay, + overlay_mat=cv2.cuda.GpuMat(overlay), + points=[], + cuda_blur_filter=cv2.cuda.createGaussianFilter( + cv2.CV_8UC4, + cv2.CV_8UC4, + (31, 31), + 100, + 100, + ), + ) + sink_pad.add_probe( + Gst.PadProbeType.BUFFER, + pad_buffer_probe, + benchmark_func, + benchmark_data, + measurements, + ) + + print("Starting pipeline") + ts1 = time.time() + pipeline.set_state(Gst.State.PLAYING) + try: + loop.run() + except: + pass + print("Exiting app\n") + pipeline.set_state(Gst.State.NULL) + ts2 = time.time() + elapsed = ts2 - ts1 + print(f"Elapsed: {elapsed:.2f}, framerate: {n_frames / elapsed:.2f}") + metrics = [ + ('min', min(measurements)), + ('max', max(measurements)), + ('mean', statistics.mean(measurements)), + ('median', statistics.median(measurements)), + ('80%', statistics.quantiles(measurements, n=5)[-1]), + ('90%', statistics.quantiles(measurements, n=10)[-1]), + ('95%', statistics.quantiles(measurements, n=20)[-1]), + ('99%', statistics.quantiles(measurements, n=100)[-1]), + ('stdev', statistics.stdev(measurements)), + ] + for name, val in metrics: + print(f'{name}: {val:.3f}') + device_name = "gpu" if is_gpu else "cpu" + with open('metrics.csv', 'a') as f: + f.write( + ','.join( + [benchmark_name, device_name, str(n_frames)] + + [f'{val:.3f}' for _, val in metrics] + ) + ) + f.write('\n') + measurements_filename = f'measurements-{benchmark_name}-{device_name}.txt' + with open(measurements_filename, 'w') as f: + for x in measurements: + f.write(f'{x}\n') + + +if __name__ == '__main__': + sys.exit(main(sys.argv)) diff --git a/gpumat/logo.png b/gpumat/logo.png new file mode 100644 index 000000000..3a4f7e021 Binary files /dev/null and b/gpumat/logo.png differ diff --git a/gpumat/run_benchmarks.sh b/gpumat/run_benchmarks.sh new file mode 100755 index 000000000..77b2db635 --- /dev/null +++ b/gpumat/run_benchmarks.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +if [[ -n "${1}" ]]; then + FRAME_NUM="${1}" +else + FRAME_NUM=1000 +fi + +GPU_BENCHMARK_NAMES=( + "overlay" + "overlay-single" + "draw-rectangles" + "blur-faces" + "blur-faces-in-cpu" + "download-upload" +) +CPU_BENCHMARK_NAMES=( + "overlay" + "draw-rectangles" + "blur-faces" +) + +echo "name,device,frame_num,min,max,mean,median,80%,90%,95%,99%,stdev" >metrics.csv + +for BENCHMARK_NAME in "${GPU_BENCHMARK_NAMES[@]}"; do + echo + date + echo "Running GPU benchmark ${BENCHMARK_NAME}" + docker run \ + --name test \ + --rm -it \ + --gpus all \ + -e GST_DEBUG=1 \ + -e LOGLEVEL=INFO \ + -e PYTHONUNBUFFERED=1 \ + --workdir /gpumat \ + --entrypoint ./benchmark.py \ + -v "$(pwd):/gpumat" \ + savant-deepstream:0.1.1-6.1.1-base "${BENCHMARK_NAME}" "gpu" "${FRAME_NUM}" +done + +for BENCHMARK_NAME in "${CPU_BENCHMARK_NAMES[@]}"; do + echo + date + echo "Running CPU benchmark ${BENCHMARK_NAME}" + docker run \ + --name test \ + --rm -it \ + --gpus all \ + -e GST_DEBUG=1 \ + -e LOGLEVEL=INFO \ + -e PYTHONUNBUFFERED=1 \ + --workdir /gpumat \ + --entrypoint ./benchmark.py \ + -v "$(pwd):/gpumat" \ + savant-deepstream:0.1.1-6.1.1-base "${BENCHMARK_NAME}" "cpu" "${FRAME_NUM}" +done +date