diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 5a5c1ad83d..410ea4b6f1 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -99,6 +99,9 @@ TRTEngine::TRTEngine( exec_ctx = make_trt(cuda_engine->createExecutionContext()); TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context"); + runtime_states.old_cudagraphs = CUDAGRAPHS_MODE; + runtime_states.old_pre_allocated_outputs = false; + if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) { uint64_t inputs = 0; uint64_t outputs = 0; diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 88fb7ab275..80b3a5ed5e 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -30,6 +30,33 @@ using FlattenedState = std::tuple< std::tuple, // serialized metadata std::tuple>; // Platform +struct TorchTRTRuntimeStates { + // Indicates whether CUDAGraphs were enabled in the previous execute_engine + bool old_cudagraphs; + // Indicates whether pre-allocated output was enabled in the previous execute_engine + bool old_pre_allocated_outputs; + + // Evaluates whether certain conditions are met to enable CUDA Graph recording or to reuse pre-allocated outputs + // based on the current and previous states, as well as input shape has changed + std::tuple set_runtime_states(bool new_cudagraphs, bool new_pre_allocated_output, bool shape_changed) { + bool need_cudagraphs_record = false; + bool can_use_pre_allocated_outputs = false; + + // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change + if (new_cudagraphs && (!old_cudagraphs || shape_changed)) { + need_cudagraphs_record = true; + } + // Pre-allocated output can be used when previous and current state are true without shape change + if (old_pre_allocated_outputs && new_pre_allocated_output && !shape_changed) { + can_use_pre_allocated_outputs = true; + } + old_cudagraphs = new_cudagraphs; + old_pre_allocated_outputs = new_pre_allocated_output; + + return {need_cudagraphs_record, can_use_pre_allocated_outputs}; + } +}; + struct TRTEngine : torch::CustomClassHolder { // Each engine needs it's own runtime object std::shared_ptr rt; @@ -88,6 +115,8 @@ struct TRTEngine : torch::CustomClassHolder { int64_t get_streamable_device_memory_budget(); int64_t get_automatic_device_memory_budget(); std::vector infer_outputs(std::vector> input_shapes); + void set_pre_allocated_outputs(bool enable); + TorchTRTRuntimeStates runtime_states; friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); static const char BINDING_DELIM = '%'; @@ -101,7 +130,9 @@ struct TRTEngine : torch::CustomClassHolder { at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream(); std::vector input_buffers = {}; std::vector output_buffers = {}; - std::string shape_key; + std::string shape_key = "None"; + bool use_pre_allocated_outputs = false; + std::vector pre_allocated_outputs; // TODO: Implement a call method // c10::List Run(c10::List inputs); diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 280c805295..e871cd3467 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -60,9 +60,8 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_de return new_target_device_opt.value(); } -bool _cudagraphs_validate_shapes(std::vector inputs, c10::intrusive_ptr compiled_engine) { - // Validate whether the current input shapes to the engine - // invalidate the existing 
cudagraphs object +bool _validate_shapes(std::vector inputs, c10::intrusive_ptr compiled_engine) { + // Validate whether the current input shapes to the engine has changed // Populate the shape key for the inputs // x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5) @@ -83,15 +82,102 @@ bool _cudagraphs_validate_shapes(std::vector inputs, c10::intrusive_ auto new_shape_key = new_shape_key_ss.str(); - // Compare the shape key to the original key and invalidate shapes if they do not match + // Compare the shape key to the original key if (new_shape_key != compiled_engine->shape_key) { - LOG_DEBUG("Resetting Cudagraph on New Shape Key " << new_shape_key); + LOG_DEBUG("Input shape changed " << compiled_engine->shape_key << " -> " << new_shape_key); compiled_engine->shape_key = new_shape_key; - compiled_engine->cudagraph.reset(); - return false; + return true; + } + + return false; +} +void setup_input_tensors( + std::vector inputs, + c10::intrusive_ptr compiled_engine, + bool need_cudagraphs_record) { + // this is a buffer to store shape tensor input addresses throughout the runtime scope + std::list> inputShapeTensorValues; + std::list formatted_inputs(compiled_engine->num_io.first); + + for (size_t i = 0; i < inputs.size(); i++) { + std::string name = compiled_engine->in_binding_names[i]; + + TORCHTRT_CHECK( + inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device()); + + auto expected_type = + util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); + TORCHTRT_CHECK( + inputs[i].dtype() == expected_type, + "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype()); + + auto dims = core::util::toDims(inputs[i].sizes()); + auto shape = core::util::toVec(dims); + LOG_DEBUG("Input Name: " << name << " Shape: " << dims); + + if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { + // Shape tensor inputs are casted to int64 explicitly. 
+ // Refer to + // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 + auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64); + std::vector inputs_cpu_vec( + input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); + inputShapeTensorValues.emplace_back(inputs_cpu_vec); + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), + "Error while setting the tensor address for shape inputs"); + + if (CUDAGRAPHS_MODE) { + // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers + compiled_engine->input_buffers[i] = input_cpu; + } + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), + "Error while setting the tensor address for shape inputs"); + + } else { + at::Tensor contig_input = inputs[i].view(shape).contiguous(); + formatted_inputs.emplace_back(std::move(contig_input)); + + if (need_cudagraphs_record) { + // Create a new persistent input buffer + compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone()); + } + + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape"); + + if (CUDAGRAPHS_MODE) { + // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer + compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true); + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()), + "Error while setting the input tensor address for inputs"); + } else { + // Otherwise use the formatted buffer directly + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()), + "Error while setting the input tensor address for inputs"); + } + } + } +} +std::vector create_output_tensors(c10::intrusive_ptr compiled_engine) { + std::vector outputs(compiled_engine->num_io.second); + for (auto output_indices : compiled_engine->out_binding_map) { + // out_binding_map stores TRT_IDX: PYT_IDX + auto pyt_idx = output_indices.second; + + std::string name = compiled_engine->out_binding_names[pyt_idx]; + auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str()); + LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape); + + auto dims = core::util::toVec(out_shape); + auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); + outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); } - return true; + return outputs; } std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { @@ -116,18 +202,20 @@ std::vector execute_engine(std::vector inputs, c10::intr compiled_engine->cudagraph.enable_debug_mode(); } + bool shape_changed = _validate_shapes(inputs, compiled_engine); + // Whether cudagraphs needs to record the graph on this pass - bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine))); + auto result = compiled_engine->runtime_states.set_runtime_states( + CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed); - if (!CUDAGRAPHS_MODE) { + bool need_cudagraphs_record = std::get<0>(result); + bool can_use_pre_allocated_outputs = std::get<1>(result); + + if (!CUDAGRAPHS_MODE || shape_changed) { compiled_engine->cudagraph.reset(); } - // this is a 
buffer to store shape tensor input addresses throughout the runtime scope - std::list> inputShapeTensorValues; - // Intialize inputs and outputs to be available throughout the succeeding scopes - std::list formatted_inputs(compiled_engine->num_io.first); std::vector outputs(compiled_engine->num_io.second); if (MULTI_DEVICE_SAFE_MODE) { @@ -185,68 +273,7 @@ std::vector execute_engine(std::vector inputs, c10::intr std::make_unique(compiled_engine->input_profile_path); } - for (size_t i = 0; i < inputs.size(); i++) { - std::string name = compiled_engine->in_binding_names[i]; - - TORCHTRT_CHECK( - inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device()); - - auto expected_type = - util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); - TORCHTRT_CHECK( - inputs[i].dtype() == expected_type, - "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype()); - - auto dims = core::util::toDims(inputs[i].sizes()); - auto shape = core::util::toVec(dims); - LOG_DEBUG("Input Name: " << name << " Shape: " << dims); - - if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { - // Shape tensor inputs are casted to int64 explicitly. - // Refer to - // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 - auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64); - std::vector inputs_cpu_vec( - input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); - inputShapeTensorValues.emplace_back(inputs_cpu_vec); - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), - "Error while setting the tensor address for shape inputs"); - - if (CUDAGRAPHS_MODE) { - // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers - compiled_engine->input_buffers[i] = input_cpu; - } - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), - "Error while setting the tensor address for shape inputs"); - - } else { - at::Tensor contig_input = inputs[i].view(shape).contiguous(); - formatted_inputs.emplace_back(std::move(contig_input)); - - if (need_cudagraphs_record) { - // Create a new persistent input buffer - compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone()); - } - - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape"); - - if (CUDAGRAPHS_MODE) { - // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer - compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true); - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()), - "Error while setting the input tensor address for inputs"); - } else { - // Otherwise use the formatted buffer directly - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()), - "Error while setting the input tensor address for inputs"); - } - } - } + setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record); // Check if input shapes can be inferred. 
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; @@ -265,19 +292,15 @@ std::vector execute_engine(std::vector inputs, c10::intr output_profiler_guard = std::make_unique(compiled_engine->output_profile_path); } + if (can_use_pre_allocated_outputs) { + outputs = compiled_engine->pre_allocated_outputs; + } else { + outputs = create_output_tensors(compiled_engine); + } for (auto output_indices : compiled_engine->out_binding_map) { - // out_binding_map stores TRT_IDX: PYT_IDX auto pyt_idx = output_indices.second; - std::string name = compiled_engine->out_binding_names[pyt_idx]; - auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str()); - LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape); - - auto dims = core::util::toVec(out_shape); - auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); - outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); - if (need_cudagraphs_record) { // If we are recording the cuda graph then we need to update the persistent output buffer compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone()); @@ -344,6 +367,11 @@ std::vector execute_engine(std::vector inputs, c10::intr } } // End engine exeuction (resets to caller stream) + // Create output buffer for next execution of graph or trt context. + if (compiled_engine->use_pre_allocated_outputs) { + compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine); + } + // Block caller stream until engine execution is complete at::cuda::CUDAEvent trt_exec_complete; trt_exec_complete.record(compiled_engine->engine_stream); diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 042bf085c8..e5edcf9729 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -88,6 +88,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info) .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info) .def("infer_outputs", &TRTEngine::infer_outputs) + .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs) .def_property( "device_memory_budget", &TRTEngine::get_device_memory_budget, diff --git a/docsrc/index.rst b/docsrc/index.rst index fdcaacf4c8..e7d5250e52 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -67,6 +67,7 @@ Tutorials * :ref:`custom_kernel_plugins` * :ref:`mutable_torchtrt_module_example` * :ref:`weight_streaming_example` +* :ref:`pre_allocated_output_example` .. toctree:: :caption: Tutorials @@ -85,6 +86,7 @@ Tutorials tutorials/_rendered_examples/dynamo/auto_generate_converters tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example tutorials/_rendered_examples/dynamo/weight_streaming_example + tutorials/_rendered_examples/dynamo/pre_allocated_output_example Dynamo Frontend ---------------- diff --git a/examples/dynamo/pre_allocated_output_example.py b/examples/dynamo/pre_allocated_output_example.py new file mode 100644 index 0000000000..d938034758 --- /dev/null +++ b/examples/dynamo/pre_allocated_output_example.py @@ -0,0 +1,113 @@ +""" +.. _pre_allocated_output_example: + +Pre-allocated output buffer +====================================================== + +The TensorRT runtime module acts as a wrapper around a PyTorch model (or subgraph) that has been compiled and optimized into a TensorRT engine. 
+ +When the compiled module is executed, input and output tensors are set on the TensorRT execution context for processing. +If output buffer allocation is moved to after the execution of the TensorRT context and the buffer is reused for the next inference, GPU tasks and memory allocation tasks can operate concurrently. This overlap allows for more efficient use of GPU resources, potentially improving the performance of inference. + +This optimization is particularly effective in the following cases: + +1. Small inference time + - The allocation of output buffers typically requires minimal CPU cycles, as the caching mechanism efficiently handles memory reuse. The time taken for this allocation is relatively constant compared to the overall inference time, leading to noticeable performance improvements, especially in scenarios involving small inference workloads. This is because the reduced allocation time contributes to faster execution when the computational workload is not large enough to overshadow these savings. +2. Multiple graph breaks + - If the module contains operations that are not supported by TensorRT, the unsupported parts are handled by PyTorch, and this fallback results in a graph break. The cumulative effect of optimized buffer allocations across multiple subgraphs can enhance overall inference performance. + - While optimizing output buffers can mitigate some of this overhead, reducing or removing graph breaks should be prioritized, as it enables more comprehensive optimizations. +3. Static input or infrequent input shape change + - If the input shape changes, the pre-allocated buffer cannot be used for the next inference, and a new allocation takes place before the TensorRT context is executed. This feature is therefore not suitable for use cases with frequent input shape changes. +""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +import timeit + +import numpy as np +import torch +import torch_tensorrt +from transformers import BertModel + +# %% +# Define function to measure inference performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +def test_module_perf(model, *input): + timings = [] + + # Warm-up phase to ensure consistent and accurate performance measurements. 
+ with torch.no_grad(): + for _ in range(3): + model(*input) + torch.cuda.synchronize() + + # Timing phase to measure inference performance + with torch.no_grad(): + for i in range(10): + start_time = timeit.default_timer() + model(*input) + torch.cuda.synchronize() + end_time = timeit.default_timer() + timings.append(end_time - start_time) + times = np.array(timings) + time_med = np.median(times) + + # Return the median time as a representative performance metric + return time_med + + +# %% +# Load model and compile +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Load bert model +model = ( + BertModel.from_pretrained("bert-base-uncased", torchscript=True) + .eval() + .half() + .to("cuda") +) +# Define sample inputs +inputs = [ + torch.randint(0, 5, (1, 128), dtype=torch.int32).to("cuda"), + torch.randint(0, 5, (1, 128), dtype=torch.int32).to("cuda"), +] +# Next, we compile the model using torch_tensorrt.compile +optimized_model = torch_tensorrt.compile( + model, + ir="dynamo", + enabled_precisions={torch.half}, + inputs=inputs, +) + +# %% +# Enable/Disable pre-allocated output buffer feature using runtime api +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Enable pre-allocated output buffer using a context manager +with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model): + out_trt = optimized_model(*inputs) + # Subsequent inferences can use the pre-allocated output buffer (no shape change) + out_trt = optimized_model(*inputs) + +# Alternatively, we can enable the feature using a context object +pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs( + optimized_model +) +pre_allocated_output_ctx.set_pre_allocated_output(True) +time_opt = test_module_perf(optimized_model, *inputs) + +# Disable the pre-allocated output buffer feature and perform inference normally +pre_allocated_output_ctx.set_pre_allocated_output(False) +out_trt = optimized_model(*inputs) +time_normal = test_module_perf(optimized_model, *inputs) + +time_opt_ms = time_opt * 1000 +time_normal_ms = time_normal * 1000 + +print(f"normal trt model time: {time_normal_ms:.3f} ms") +print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms") diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index ffe7e9e03a..e70d90086e 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -23,6 +23,42 @@ logger = logging.getLogger(__name__) +class TorchTRTRuntimeStates: + def __init__(self, new_cudagraphs: bool, new_pre_allocated_output: bool): + # Indicates whether CUDAGraphs were enabled in the previous execute_engine + self.old_cudagraphs = new_cudagraphs + # Indicates whether pre-allocated output was enabled in the previous execute_engine + self.old_pre_allocated_outputs = new_pre_allocated_output + + def validate_states( + self, + new_cudagraphs: bool, + new_pre_allocated_output: bool, + shape_changed: bool, + ) -> Tuple[bool, bool]: + # Evaluates whether certain conditions are met to enable CUDA Graph recording or to reuse pre-allocated outputs + # based on the current and previous states, as well as input shape has changed + need_cudagraphs_record = False + can_use_pre_allocated_outputs = False + + # Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change + if new_cudagraphs and (not self.old_cudagraphs or shape_changed): + need_cudagraphs_record = True + + # Pre-allocated output can 
be used when previous and current state are true without shape change + if ( + self.old_pre_allocated_outputs + and new_pre_allocated_output + and (not shape_changed) + ): + can_use_pre_allocated_outputs = True + + self.old_cudagraphs = new_cudagraphs + self.old_pre_allocated_outputs = new_pre_allocated_output + + return need_cudagraphs_record, can_use_pre_allocated_outputs + + class PythonTorchTensorRTModule(Module): # type: ignore[misc] """PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. @@ -108,6 +144,11 @@ def __init__( self.engine = None self.weight_name_map = weight_name_map self.target_platform = Platform.current_platform() + self.runtime_states = TorchTRTRuntimeStates( + torch_tensorrt.runtime.get_cudagraphs_mode(), False + ) + self.pre_allocated_outputs: List[torch.Tensor] = [] + self.use_pre_allocated_outputs = False if self.serialized_engine is not None and not self.settings.lazy_engine_init: self.setup_engine() @@ -172,7 +213,7 @@ def setup_engine(self) -> None: self.engine.get_tensor_shape(input_name) for input_name in self.input_names ] self.output_dtypes = [ - dtype._from(self.engine.get_tensor_dtype(output_name)) + dtype._from(self.engine.get_tensor_dtype(output_name)).to(torch.dtype) for output_name in self.output_names ] self.output_shapes = [ @@ -233,6 +274,73 @@ def __del__(self) -> None: if self.cudagraph: self.cudagraph.reset() + def setup_input_tensors( + self, + contiguous_inputs: List[torch.Tensor], + cudagraphs_enabled: bool, + need_cudagraphs_record: bool, + ) -> None: + for i, input_name in enumerate(self.input_names): + if not contiguous_inputs[i].is_cuda: + logger.warning( + f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. " + "This tensor is being moved by the runtime but for performance considerations, " + "ensure your inputs are all on GPU and open an issue here " + "(https://github.com/pytorch/TensorRT/issues) if this warning persists." + ) + contiguous_inputs = ( + contiguous_inputs[:i] + + [contiguous_inputs[i].cuda()] + + contiguous_inputs[i + 1 :] + ) + + assert ( + contiguous_inputs[i].dtype == self.input_dtypes[i] + ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." 
+ + if need_cudagraphs_record: + # If cudagraphs is enabled, this memory is reserved for future cudagraph runs + # Clone is required to avoid re-using user-provided GPU memory + self._input_buffers[i] = contiguous_inputs[i].clone() + + # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers + # as per TensorRT requirements + if self.engine.is_shape_inference_io(input_name): + # Shape tensor inputs are casted to int64 explicitly + # Currently Torch CPU pointers are not working; numpy pointers are used instead + # to refer to underlying memory + inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() + self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data) + else: + self.context.set_input_shape( + input_name, tuple(contiguous_inputs[i].shape) + ) + if cudagraphs_enabled: + self._input_buffers[i].copy_(contiguous_inputs[i]) + self.context.set_tensor_address( + input_name, self._input_buffers[i].data_ptr() + ) + else: + self.context.set_tensor_address( + input_name, contiguous_inputs[i].data_ptr() + ) + + def create_output_tensors(self) -> List[torch.Tensor]: + # create output tensors + outputs: List[torch.Tensor] = [] + + for o, _ in enumerate(self.output_names): + output = torch.empty( + size=self.output_shapes[o], + dtype=self.output_dtypes[o], + device=torch.cuda.current_device(), + ) + outputs.append(output) + return outputs + + def set_pre_allocated_outputs(self, enable: bool) -> None: + self.use_pre_allocated_outputs = enable + def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: # Ensure inputs are available in all scopes and cast symbolic integers to Tensors contiguous_inputs: List[torch.Tensor] = [ @@ -248,11 +356,16 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self._check_initialized() cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() - need_cudagraphs_record = ( - cudagraphs_enabled and not self.cudagraphs_validate_shapes(inputs) + shape_changed = self.validate_input_shapes(inputs) + need_cudagraphs_record, can_use_pre_allocated_outputs = ( + self.runtime_states.validate_states( + cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed + ) ) if need_cudagraphs_record: + if self.cudagraph: + self.cudagraph.reset() self._input_buffers = [None] * len(self.input_names) self._output_buffers = [None] * len(self.output_names) @@ -260,7 +373,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self.cudagraph.reset() self.cudagraph = None - # If in safe mode, check at each iteration for for whether a switch is required + # If in safe mode, check at each iteration for whether a switch is required if ( torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE ): @@ -303,62 +416,18 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self.input_names ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." - for i, input_name in enumerate(self.input_names): - if not contiguous_inputs[i].is_cuda: - logger.warning( - f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. " - "This tensor is being moved by the runtime but for performance considerations, " - "ensure your inputs are all on GPU and open an issue here " - "(https://github.com/pytorch/TensorRT/issues) if this warning persists." 
- ) - contiguous_inputs = ( - contiguous_inputs[:i] - + [contiguous_inputs[i].cuda()] - + contiguous_inputs[i + 1 :] - ) - - assert ( - contiguous_inputs[i].dtype == self.input_dtypes[i] - ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." + self.setup_input_tensors( + contiguous_inputs, cudagraphs_enabled, need_cudagraphs_record + ) - if need_cudagraphs_record: - # If cudagraphs is enabled, this memory is reserved for future cudagraph runs - # Clone is required to avoid re-using user-provided GPU memory - self._input_buffers[i] = contiguous_inputs[i].clone() - - # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers - # as per TensorRT requirements - if self.engine.is_shape_inference_io(input_name): - # Shape tensor inputs are casted to int64 explicitly - # Currently Torch CPU pointers are not working; numpy pointers are used instead - # to refer to underlying memory - inputs_cpu = ( - contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() - ) - self.context.set_tensor_address( - input_name, inputs_cpu.ctypes.data - ) - else: - self.context.set_input_shape( - input_name, tuple(contiguous_inputs[i].shape) + if shape_changed: + # Check if input shapes can be inferred. + uninferred_input_names = self.context.infer_shapes() + if uninferred_input_names: + logger.warning( + f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \ + This could happen if the input tensor addresses/shapes haven't been configured correctly" ) - if cudagraphs_enabled: - self._input_buffers[i].copy_(contiguous_inputs[i]) - self.context.set_tensor_address( - input_name, self._input_buffers[i].data_ptr() - ) - else: - self.context.set_tensor_address( - input_name, contiguous_inputs[i].data_ptr() - ) - - # Check if input shapes can be inferred. - uninferred_input_names = self.context.infer_shapes() - if uninferred_input_names: - logger.warning( - f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \ - This could happen if the input tensor addresses/shapes haven't been configured correctly" - ) with ( torch.autograd.profiler.record_function( @@ -367,24 +436,20 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . if self.profiling_enabled else nullcontext() ): - # create output tensors - outputs: List[torch.Tensor] = [] - - for o, output_name in enumerate(self.output_names): - shape = tuple(self.context.get_tensor_shape(output_name)) - - if DYNAMIC_DIM in shape: + if can_use_pre_allocated_outputs: + outputs = self.pre_allocated_outputs + else: + self.output_shapes = [ + tuple(self.context.get_tensor_shape(output_name)) + for output_name in self.output_names + ] + if DYNAMIC_DIM in self.output_shapes: raise ValueError( "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." ) + outputs = self.create_output_tensors() - output = torch.empty( - size=shape, - dtype=self.output_dtypes[o].to(torch.dtype), - device=torch.cuda.current_device(), - ) - - outputs.append(output) + for o, output_name in enumerate(self.output_names): if need_cudagraphs_record: self._output_buffers[o] = outputs[o].clone() @@ -445,6 +510,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
self._caller_stream.wait_stream(self._engine_stream) + if self.use_pre_allocated_outputs: + self.pre_allocated_outputs = self.create_output_tensors() + if cudagraphs_enabled: for idx, o in enumerate(outputs): o.copy_(self._output_buffers[idx]) @@ -486,10 +554,9 @@ def get_layer_info(self) -> str: ) return engine_json - def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: + def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: """ - Validates the input shapes of the forward function - versus the version currently active for the + Validates whether the input shapes of the forward function have changed """ # Representation of input shapes to a given model # Shapes are concatenated as so: @@ -499,10 +566,8 @@ def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: # If the new shape key differs from the existing one, # invalidate the old shape key and remove the CUDAGraph if new_shape_key != self.shape_key: - logger.debug(f"Resetting Cudagraph on new shape key {new_shape_key}") + logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}") self.shape_key = new_shape_key - if self.cudagraph: - self.cudagraph.reset() - return False + return True - return True + return False diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index d7cfc6608b..b809e70ddf 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -272,6 +272,9 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.input_binding_names = state[2] self.output_binding_names = state[3] + def set_pre_allocated_outputs(self, enable: bool) -> None: + self.engine.use_pre_allocated_outputs = enable + def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: """Implementation of the forward pass for a TensorRT engine diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index 77b4401222..9960460b60 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -8,4 +8,5 @@ set_cudagraphs_mode, ) from torch_tensorrt.runtime._multi_device_safe_mode import set_multi_device_safe_mode +from torch_tensorrt.runtime._pre_allocated_outputs import enable_pre_allocated_outputs from torch_tensorrt.runtime._weight_streaming import weight_streaming diff --git a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py new file mode 100644 index 0000000000..c392c38838 --- /dev/null +++ b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py @@ -0,0 +1,41 @@ +import logging +from typing import Any + +import torch +from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule + +logger = logging.getLogger(__name__) + + +class _PreAllocatedOutputContextManager(object): + """ + Helper class used to enable the pre-allocated output feature in runtime modules + """ + + def __init__(self, module: torch.fx.GraphModule) -> None: + rt_mods = [] + for name, rt_mod in module.named_children(): + if "_run_on_acc" in name and isinstance( + rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) + ): + rt_mods.append(rt_mod) + self.rt_mods = rt_mods + + def set_pre_allocated_output(self, enable: bool) -> None: + for mod in self.rt_mods: + mod.set_pre_allocated_outputs(enable) + + def __enter__(self) -> "_PreAllocatedOutputContextManager": + # Enable pre-allocated output 
+ self.set_pre_allocated_output(True) + return self + + def __exit__(self, *args: Any) -> None: + # Disable pre-allocated output + self.set_pre_allocated_output(False) + + +def enable_pre_allocated_outputs( + module: torch.fx.GraphModule, +) -> _PreAllocatedOutputContextManager: + return _PreAllocatedOutputContextManager(module) diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py index a017eaabca..8649ca8e84 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py @@ -17,6 +17,9 @@ "Torch-TensorRT runtime is not available", ) class TestCudagraphsCPP(TestCase): + def tearDown(self): + # Reset to default cuda graph mode after each test + torch_tensorrt.runtime.set_cudagraphs_mode(False) def test_cudagraphs_on(self): torch_tensorrt.runtime.set_cudagraphs_mode(True) diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py index 4f962083a8..4bdcfbbef4 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py @@ -13,6 +13,10 @@ class TestCudagraphsPython(TestCase): + def tearDown(self): + # Reset to default cuda graph mode after each test + torch_tensorrt.runtime.set_cudagraphs_mode(False) + def test_cudagraphs_on(self): torch_tensorrt.runtime.set_cudagraphs_mode(True) self.assertTrue(torch_tensorrt.runtime.get_cudagraphs_mode()) diff --git a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py new file mode 100644 index 0000000000..b8c7b61fb3 --- /dev/null +++ b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py @@ -0,0 +1,130 @@ +import torch +import torch_tensorrt as torchtrt +from parameterized import parameterized +from torch.testing._internal.common_utils import TestCase, run_tests + +INPUT_SIZE = (3, 16, 16) +TRIALS = 5 + + +class TestPreAllocatedOutputs(TestCase): + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_pre_allocated_outputs_default(self, _, use_python_runtime): + class SampleModel(torch.nn.Module): + def forward(self, x): + return torch.softmax((x + 2) * 7, dim=0) + + model = SampleModel().eval().cuda() + inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)] + fx_graph = torch.fx.symbolic_trace(model) + + # Validate that the results between Torch and Torch-TRT are similar + optimized_model = torchtrt.compile( + fx_graph, + "torch_compile", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + use_python_runtime=use_python_runtime, + ) + + ref_out_list = [] + trt_out_list = [] + with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): + for i in inputs: + ref_out_list.append(fx_graph(i).detach().cpu()) + trt_out_list.append(optimized_model(i).detach().cpu()) + + for torch_model_results, optimized_model_results in zip( + ref_out_list, trt_out_list + ): + torch.testing.assert_close( + torch_model_results, + optimized_model_results, + rtol=5e-03, + atol=5e-03, + equal_nan=True, + check_dtype=True, + ) + + torch._dynamo.reset() + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_pre_allocated_outputs_dynamic(self, _, use_python_runtime): + class SampleModel(torch.nn.Module): + def forward(self, x): + return torch.relu((x + 2) * 0.5) + + inputs = torchtrt.Input( + min_shape=(1, 3, 128, 224), + opt_shape=(8, 3, 192, 224), + max_shape=(16, 
3, 224, 224), + dtype=torch.float, + name="x", + ) + fx_graph = torch.fx.symbolic_trace(SampleModel()) + + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs, + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={"torch.ops.aten.mul.Tensor"}, + use_python_runtime=use_python_runtime, + ) + + input_list = [] + ref_out_list = [] + trt_out_list = [] + # Build inputs whose shapes vary across iterations so that cuda graph mode and the pre-allocated output setting are exercised across shape changes. + for i in [1, 3, 8, 11, 16]: + for j in [128, 128, 222, 222, 224]: + input_list.append(torch.randn((i, 3, j, 224)).cuda()) + + pre_allocated_output_ctx = torchtrt.runtime.enable_pre_allocated_outputs( + optimized_model + ) + pre_allocated_output = False + for enable_cuda_graphs in [False, True]: + for i in range(len(input_list)): + # Use the selected cuda graph mode at one index within each group of TRIALS iterations, and the opposite mode otherwise + if i % TRIALS == i // TRIALS: + cuda_graphs = enable_cuda_graphs + else: + cuda_graphs = not enable_cuda_graphs + if i % 3 == 0: + pre_allocated_output = not pre_allocated_output + + torchtrt.runtime.set_cudagraphs_mode(cuda_graphs) + pre_allocated_output_ctx.set_pre_allocated_output(pre_allocated_output) + + ref_out_list.append(fx_graph(input_list[i])) + trt_out_list.append(optimized_model(input_list[i])) + + for torch_model_results, optimized_model_results in zip( + ref_out_list, trt_out_list + ): + torch.testing.assert_close( + torch_model_results, + optimized_model_results, + rtol=5e-03, + atol=5e-03, + equal_nan=True, + check_dtype=True, + ) + torch._dynamo.reset() + + +if __name__ == "__main__": + run_tests()
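Note on the runtime state handling introduced above: the CUDA graph recording and output-reuse decisions are driven by a small state machine implemented twice, once in C++ (TorchTRTRuntimeStates::set_runtime_states) and once in Python (TorchTRTRuntimeStates.validate_states). The standalone Python sketch below mirrors that logic outside the runtime so the transitions can be checked in isolation; the class name and the driver lines at the bottom are illustrative and not part of the patch.

from typing import Tuple


class RuntimeStatesSketch:
    # Mirrors TorchTRTRuntimeStates.validate_states from the patch above.
    def __init__(self, cudagraphs: bool, pre_allocated: bool) -> None:
        self.old_cudagraphs = cudagraphs
        self.old_pre_allocated_outputs = pre_allocated

    def validate_states(
        self, new_cudagraphs: bool, new_pre_allocated: bool, shape_changed: bool
    ) -> Tuple[bool, bool]:
        # A new CUDA graph recording is needed when cudagraphs is switched on,
        # or when the input shape changes while cudagraphs stays enabled.
        need_cudagraphs_record = new_cudagraphs and (
            not self.old_cudagraphs or shape_changed
        )
        # Pre-allocated outputs are reusable only if the feature was enabled on
        # both the previous and the current pass and the shape did not change.
        can_use_pre_allocated_outputs = (
            self.old_pre_allocated_outputs and new_pre_allocated and not shape_changed
        )
        self.old_cudagraphs = new_cudagraphs
        self.old_pre_allocated_outputs = new_pre_allocated
        return need_cudagraphs_record, can_use_pre_allocated_outputs


states = RuntimeStatesSketch(cudagraphs=False, pre_allocated=False)
print(states.validate_states(True, True, shape_changed=False))  # (True, False): record graph, allocate outputs
print(states.validate_states(True, True, shape_changed=False))  # (False, True): replay graph, reuse outputs
print(states.validate_states(True, True, shape_changed=True))   # (True, False): shape change resets both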
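The enable_pre_allocated_outputs helper added in _pre_allocated_outputs.py toggles the feature on every TensorRT submodule of the compiled graph (it walks named_children() and picks up the "_run_on_acc" partitions), which is why the documentation above highlights modules with multiple graph breaks. The sketch below is a hypothetical inspection helper, written under the same assumptions as the patch, that lists which submodules the context manager would affect before it is entered.

from typing import List

import torch
from torch_tensorrt.dynamo.runtime import (
    PythonTorchTensorRTModule,
    TorchTensorRTModule,
)


def list_pre_allocatable_submodules(module: torch.fx.GraphModule) -> List[str]:
    # Hypothetical helper: reuses the same name/type filter as
    # _PreAllocatedOutputContextManager above.
    return [
        name
        for name, submod in module.named_children()
        if "_run_on_acc" in name
        and isinstance(submod, (PythonTorchTensorRTModule, TorchTensorRTModule))
    ]


# Usage (assuming optimized_model was produced by torch_tensorrt.compile with ir="dynamo"):
# print(list_pre_allocatable_submodules(optimized_model))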