Skip to content

Commit

Permalink
FFDNet_TensorRT: Add CUDA Graph support
Browse files Browse the repository at this point in the history
  • Loading branch information
WolframRhodium committed Aug 30, 2021
1 parent 8931f1a commit 2dd22dc
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 17 deletions.
2 changes: 2 additions & 0 deletions Collections/examples/FFDNet_TensorRT/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,7 @@

5. (Optionally) Run `benchmark.py` or `trtexec --loadEngine="ffdnet.engine" --useCudaGraph` to test the engine's raw performance.

When run with `use_cuda_graph=True`, `benchmark.py` also writes a DOT file, "ffdnet.dot", describing the structure of the captured inference graph. The DOT file can be rendered with Graphviz by running `dot -Tsvg ffdnet.dot > ffdnet.svg`.

6. Run `ffdnet_test.vpy` to test in VapourSynth.

28 changes: 24 additions & 4 deletions Collections/examples/FFDNet_TensorRT/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def benchmark(
width: int,
height: int,
iter: int = 5,
use_cuda_graph: bool = False,
logger: trt.Logger = trt.Logger(trt.Logger.VERBOSE)
) -> None:

Expand All @@ -44,11 +45,31 @@ def benchmark(
end = checkError(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT.value))
end = UniqueResource(end, cuda.cuEventDestroy, end)

def execute():
execution_context.execute_async_v2(bindings, stream_handle=stream.obj)

if use_cuda_graph:
checkError(cuda.cuStreamBeginCapture(
stream.obj, cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))

execute()

graph = checkError(cuda.cuStreamEndCapture(stream.obj))
graphexec, error_node = checkError(cuda.cuGraphInstantiate(
graph, logBuffer=b"", bufferSize=0))
graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy, graphexec)
checkError(cuda.cuGraphDebugDotPrint(
graph, b"ffdnet.dot",
cuda.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE.value))
checkError(cuda.cuGraphDestroy(graph))

for _ in range(iter):
checkError(cuda.cuEventRecord(start.obj, stream.obj))

# execution_context.execute_v2(bindings)
execution_context.execute_async_v2(bindings, stream_handle=stream.obj)
if use_cuda_graph:
checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
else:
execute()

checkError(cuda.cuEventRecord(end.obj, stream.obj))
checkError(cuda.cuEventSynchronize(end.obj))
Expand All @@ -59,5 +80,4 @@ def benchmark(


if __name__ == "__main__":
benchmark(width=1920, height=1080, iter=10)

benchmark(width=1920, height=1080, iter=10, use_cuda_graph=False)
11 changes: 4 additions & 7 deletions Collections/examples/FFDNet_TensorRT/build_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def build_engine(
width: int,
height: int,
args_dict: Dict,
max_workspace_size: int = int(1.6 * 1024 ** 3),
logger: trt.Logger = trt.Logger(trt.Logger.VERBOSE)
) -> None:

Expand Down Expand Up @@ -51,12 +52,9 @@ def build_engine(
network.mark_output(output)

config = builder.create_builder_config()
config.max_workspace_size = int(1.6 * 1024 ** 3)
try:
with open("timing_cache.buffer", "rb") as cache_f:
cache = config.create_timing_cache(cache_f.read())
except FileNotFoundError:
cache = config.create_timing_cache(b"")
config.max_workspace_size = max_workspace_size
with open("timing_cache.buffer", "rb") as cache_f:
cache = config.create_timing_cache(cache_f.read())
config.set_timing_cache(cache=cache, ignore_mismatch=False)

output = builder.build_serialized_network(network, config)
Expand All @@ -75,4 +73,3 @@ def build_engine(
args_dict = torch.load("ffdnet_color.pth")

build_engine(width=1920, height=1080, args_dict=args_dict)

2 changes: 1 addition & 1 deletion Collections/examples/FFDNet_TensorRT/ffdnet_test.vpy
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import vs_ffdnet

src = core.lsmas.LWLibavSource(r'PV02.mkv')
src = core.resize.Bicubic(src, 1920, 1080, format=vs.RGBS, matrix_in_s="709")
res = vs_ffdnet.FFDNet(src, sigma=5.0)
res = vs_ffdnet.FFDNet(src, sigma=5.0, use_cuda_graph=False)

res.set_output()

28 changes: 23 additions & 5 deletions Collections/examples/FFDNet_TensorRT/vs_ffdnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
def FFDNet(
clip: vs.VideoNode,
sigma: float = 5.0,
use_cuda_graph: bool = False,
logger: trt.Logger = trt.Logger(trt.Logger.WARNING)
) -> vs.VideoNode:

Expand Down Expand Up @@ -76,10 +77,7 @@ def FFDNet(
checkError(cuda.cuMemcpyHtoDAsync(
d_sigma.obj, h_sigma.obj, sigma_size, stream.obj))

def inference_core(n, f):
for i in range(3):
h_input_array[0, i, :, :] = np.asarray(f.get_read_array(i))

def execute():
checkError(cuda.cuMemcpyHtoDAsync(
d_input.obj, h_input.obj, input_size, stream.obj))

Expand All @@ -90,6 +88,27 @@ def inference_core(n, f):
checkError(cuda.cuMemcpyDtoHAsync(
h_output.obj, d_output.obj, output_size, stream.obj))

if use_cuda_graph:
checkError(cuda.cuStreamBeginCapture(
stream.obj, cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))

execute()

graph = checkError(cuda.cuStreamEndCapture(stream.obj))
graphexec, error_node = checkError(cuda.cuGraphInstantiate(
graph, logBuffer=b"", bufferSize=0))
graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy, graphexec)
checkError(cuda.cuGraphDestroy(graph))

def inference_core(n, f):
for i in range(3):
h_input_array[0, i, :, :] = np.asarray(f.get_read_array(i))

if use_cuda_graph:
checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
else:
execute()

fout = f.copy()
fout.get_write_array(0) # triggers COW
checkError(cuda.cuStreamSynchronize(stream.obj))
Expand All @@ -100,4 +119,3 @@ def inference_core(n, f):
return fout

return core.std.ModifyFrame(clip, clips=[clip], selector=inference_core)

0 comments on commit 2dd22dc

Please sign in to comment.