FFDNet_TensorRT: Add CUDA Graph support

WolframRhodium · Aug 30, 2021 · 2dd22dc · 2dd22dc
1 parent 8931f1a
commit 2dd22dc
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 17 deletions.
diff --git a/Collections/examples/FFDNet_TensorRT/README.md b/Collections/examples/FFDNet_TensorRT/README.md
@@ -11,5 +11,7 @@
 
 5. (Optionally) Run `benchmark.py` or `trtexec --loadEngine="ffdnet.engine" --useCudaGraph` to test the engine's raw performance.
 
+ When `use_cuda_graph=True`, `benchmark.py` also writes a DOT file, "ffdnet.dot", describing the structure of the captured inference graph. The DOT file can be visualized with Graphviz by running `dot -Tsvg ffdnet.dot > ffdnet.svg`.
+
 6. Run `ffdnet_test.vpy` to test in VapourSynth.
 
diff --git a/Collections/examples/FFDNet_TensorRT/benchmark.py b/Collections/examples/FFDNet_TensorRT/benchmark.py
@@ -20,6 +20,7 @@ def benchmark(
  width: int,
  height: int,
  iter: int = 5,
+ use_cuda_graph: bool = False,
  logger: trt.Logger = trt.Logger(trt.Logger.VERBOSE)
 ) -> None:
 
@@ -44,11 +45,31 @@ def benchmark(
  end = checkError(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT.value))
  end = UniqueResource(end, cuda.cuEventDestroy, end)
 
+ def execute():
+ execution_context.execute_async_v2(bindings, stream_handle=stream.obj)
+
+ if use_cuda_graph:
+ checkError(cuda.cuStreamBeginCapture(
+ stream.obj, cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))
+
+ execute()
+
+ graph = checkError(cuda.cuStreamEndCapture(stream.obj))
+ graphexec, error_node = checkError(cuda.cuGraphInstantiate(
+ graph, logBuffer=b"", bufferSize=0))
+ graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy, graphexec)
+ checkError(cuda.cuGraphDebugDotPrint(
+ graph, b"ffdnet.dot",
+ cuda.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE.value))
+ checkError(cuda.cuGraphDestroy(graph))
+
  for _ in range(iter):
  checkError(cuda.cuEventRecord(start.obj, stream.obj))
 
- # execution_context.execute_v2(bindings)
- execution_context.execute_async_v2(bindings, stream_handle=stream.obj)
+ if use_cuda_graph:
+ checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
+ else:
+ execute()
 
  checkError(cuda.cuEventRecord(end.obj, stream.obj))
  checkError(cuda.cuEventSynchronize(end.obj))
@@ -59,5 +80,4 @@ def benchmark(
 
 
 if __name__ == "__main__":
- benchmark(width=1920, height=1080, iter=10)
-
+ benchmark(width=1920, height=1080, iter=10, use_cuda_graph=False)
diff --git a/Collections/examples/FFDNet_TensorRT/build_engine.py b/Collections/examples/FFDNet_TensorRT/build_engine.py
@@ -7,6 +7,7 @@ def build_engine(
  width: int,
  height: int,
  args_dict: Dict,
+ max_workspace_size: int = int(1.6 * 1024 ** 3)
  logger: trt.Logger = trt.Logger(trt.Logger.VERBOSE)
 ) -> None:
 
@@ -51,12 +52,9 @@ def build_engine(
  network.mark_output(output)
 
  config = builder.create_builder_config()
- config.max_workspace_size = int(1.6 * 1024 ** 3)
- try:
- with open("timing_cache.buffer", "rb") as cache_f:
- cache = config.create_timing_cache(cache_f.read())
- except FileNotFoundError:
- cache = config.create_timing_cache(b"")
+ config.max_workspace_size = max_workspace_size
+ with open("timing_cache.buffer", "rb") as cache_f:
+ cache = config.create_timing_cache(cache_f.read())
  config.set_timing_cache(cache=cache, ignore_mismatch=False)
 
  output = builder.build_serialized_network(network, config)
@@ -75,4 +73,3 @@ def build_engine(
  args_dict = torch.load("ffdnet_color.pth")
 
  build_engine(width=1920, height=1080, args_dict=args_dict)
-
diff --git a/Collections/examples/FFDNet_TensorRT/ffdnet_test.vpy b/Collections/examples/FFDNet_TensorRT/ffdnet_test.vpy
@@ -7,7 +7,7 @@ import vs_ffdnet
 
 src = core.lsmas.LWLibavSource(r'PV02.mkv')
 src = core.resize.Bicubic(src, 1920, 1080, format=vs.RGBS, matrix_in_s="709")
-res = vs_ffdnet.FFDNet(src, sigma=5.0)
+res = vs_ffdnet.FFDNet(src, sigma=5.0, use_cuda_graph=False)
 
 res.set_output()
 
diff --git a/Collections/examples/FFDNet_TensorRT/vs_ffdnet.py b/Collections/examples/FFDNet_TensorRT/vs_ffdnet.py
@@ -15,6 +15,7 @@
 def FFDNet(
  clip: vs.VideoNode,
  sigma: float = 5.0,
+ use_cuda_graph: bool = False,
  logger: trt.Logger = trt.Logger(trt.Logger.WARNING)
 ) -> vs.VideoNode:
 
@@ -76,10 +77,7 @@ def FFDNet(
  checkError(cuda.cuMemcpyHtoDAsync(
  d_sigma.obj, h_sigma.obj, sigma_size, stream.obj))
 
- def inference_core(n, f):
- for i in range(3):
- h_input_array[0, i, :, :] = np.asarray(f.get_read_array(i))
-
+ def execute():
  checkError(cuda.cuMemcpyHtoDAsync(
  d_input.obj, h_input.obj, input_size, stream.obj))
 
@@ -90,6 +88,27 @@ def inference_core(n, f):
  checkError(cuda.cuMemcpyDtoHAsync(
  h_output.obj, d_output.obj, output_size, stream.obj))
 
+ if use_cuda_graph:
+ checkError(cuda.cuStreamBeginCapture(
+ stream.obj, cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))
+
+ execute()
+
+ graph = checkError(cuda.cuStreamEndCapture(stream.obj))
+ graphexec, error_node = checkError(cuda.cuGraphInstantiate(
+ graph, logBuffer=b"", bufferSize=0))
+ graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy, graphexec)
+ checkError(cuda.cuGraphDestroy(graph))
+
+ def inference_core(n, f):
+ for i in range(3):
+ h_input_array[0, i, :, :] = np.asarray(f.get_read_array(i))
+
+ if use_cuda_graph:
+ checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
+ else:
+ execute()
+
  fout = f.copy()
  fout.get_write_array(0) # triggers COW
  checkError(cuda.cuStreamSynchronize(stream.obj))
@@ -100,4 +119,3 @@ def inference_core(n, f):
  return fout
 
  return core.std.ModifyFrame(clip, clips=[clip], selector=inference_core)
-