diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 3e10fd7c7d..9f93fe4b4e 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -30,6 +30,29 @@ std::vector split(const std::string& str, char delim) { return strings; } +DynamicOutputAllocator::DynamicOutputAllocator(const std::unordered_map& output_dtypes) + : dtypes(output_dtypes) {} + +void* DynamicOutputAllocator::reallocateOutputAsync( + char const* tensorName, + void* currentMemory, + uint64_t size, + uint64_t alignment, + cudaStream_t stream) { + std::vector shape = {static_cast(size)}; + auto it = buffers.find(tensorName); + if (it == buffers.end() || it->second.sizes() != shape) { + buffers[tensorName] = at::empty(shape, at::TensorOptions().dtype(dtypes.at(tensorName)).device(at::kCUDA)); + return buffers[tensorName].data_ptr(); + } else { + return it->second.data_ptr(); + } +} + +void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept { + shapes[tensorName] = dims; +} + TRTEngine::TRTEngine( const std::string& serialized_engine, const RTDevice& cuda_device, @@ -37,6 +60,7 @@ TRTEngine::TRTEngine( const std::vector& _out_binding_names, const Platform& target_platform, bool hardware_compatible, + bool requires_output_allocator, const std::string& serialized_metadata) : TRTEngine( "deserialized_trt", @@ -46,6 +70,7 @@ TRTEngine::TRTEngine( _out_binding_names, target_platform, hardware_compatible, + requires_output_allocator, serialized_metadata) {} TRTEngine::TRTEngine(std::vector serialized_info) @@ -57,6 +82,7 @@ TRTEngine::TRTEngine(std::vector serialized_info) split(serialized_info[OUTPUT_BINDING_NAMES_IDX], BINDING_DELIM), Platform(serialized_info[TARGET_PLATFORM_IDX]), static_cast(std::stoi(serialized_info[HW_COMPATIBLE_IDX])), + static_cast(std::stoi(serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX])), serialized_info[SERIALIZED_METADATA_IDX]) {} TRTEngine::TRTEngine( @@ -67,6 +93,7 @@ TRTEngine::TRTEngine( const std::vector& _out_binding_names, const Platform& target_platform, bool hardware_compatible, + bool requires_output_allocator, const std::string& serialized_metadata) { TORCHTRT_CHECK( is_supported_on_current_platform(target_platform), @@ -79,6 +106,7 @@ TRTEngine::TRTEngine( TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine"); this->serialized_metadata = serialized_metadata; + this->requires_output_allocator = requires_output_allocator; device_info = most_compatible_device.value(); multi_gpu_device_check(); set_rt_device(device_info); @@ -397,6 +425,7 @@ FlattenedState TRTEngine::__obj_flatten__() { std::tuple("out_binding_names", serialized_info[OUTPUT_BINDING_NAMES_IDX]), std::tuple("hardware_compatible", serialized_info[HW_COMPATIBLE_IDX]), std::tuple("serialized_metadata", serialized_info[SERIALIZED_METADATA_IDX]), + std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]), std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX])); } @@ -417,6 +446,7 @@ std::vector TRTEngine::serialize() { serialized_info[INPUT_BINDING_NAMES_IDX] = serialize_bindings(this->in_binding_names); serialized_info[OUTPUT_BINDING_NAMES_IDX] = serialize_bindings(this->out_binding_names); serialized_info[HW_COMPATIBLE_IDX] = this->hardware_compatible ? "1" : "0"; + serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = this->requires_output_allocator ? 
"1" : "0"; serialized_info[SERIALIZED_METADATA_IDX] = this->serialized_metadata; serialized_info[TARGET_PLATFORM_IDX] = this->target_platform.serialize(); diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index e1d8ba5471..e9b1905610 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -27,6 +27,7 @@ using FlattenedState = std::tuple< std::tuple, // input binding names std::tuple, // output binding names std::tuple, // HW compatibility + std::tuple, // requires_output_allocator std::tuple, // serialized metadata std::tuple>; // Platform @@ -69,6 +70,33 @@ struct TorchTRTRuntimeStates { } }; +class DynamicOutputAllocator : public nvinfer1::IOutputAllocator { + public: + DynamicOutputAllocator(const std::unordered_map& output_dtypes); + + void* reallocateOutputAsync( + char const* tensorName, + void* currentMemory, + uint64_t size, + uint64_t alignment, + cudaStream_t stream) override; + + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override; + + const std::unordered_map& getBuffers() const { + return buffers; + } + + const std::unordered_map& getShapes() const { + return shapes; + } + + private: + std::unordered_map dtypes; + std::unordered_map buffers; + std::unordered_map shapes; +}; + struct TRTEngine : torch::CustomClassHolder { // Each engine needs it's own runtime object std::shared_ptr rt; @@ -99,6 +127,7 @@ struct TRTEngine : torch::CustomClassHolder { const std::vector& out_binding_names, const Platform& target_platform = get_current_platform(), bool hardware_compatible = false, + bool requires_output_allocator = false, const std::string& serialized_metadata = ""); TRTEngine(std::vector serialized_info); @@ -111,6 +140,7 @@ struct TRTEngine : torch::CustomClassHolder { const std::vector& out_binding_names, const Platform& target_platform = get_current_platform(), bool hardware_compatible = false, + bool requires_output_allocator = false, const std::string& serialized_metadata = ""); TRTEngine& operator=(const TRTEngine& other); @@ -146,6 +176,11 @@ struct TRTEngine : torch::CustomClassHolder { bool use_pre_allocated_outputs = false; std::vector pre_allocated_outputs; + // Output Allocator-Related Functionality + bool requires_output_allocator = false; // engine requires output allocator + bool use_output_allocator_outputs = false; // users specify to use output allocator + std::shared_ptr output_allocator; + // TODO: Implement a call method // c10::List Run(c10::List inputs); diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 5348ade8c4..64b111750f 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -13,7 +13,7 @@ namespace torch_tensorrt { namespace core { namespace runtime { -// Checks if the context switch requred for device ID +// Checks if the context switch required for device ID bool is_switch_required(const RTDevice& curr_device, const RTDevice& engine_device) { // If SM capability is not the same as configured then switch if ((curr_device.major != engine_device.major) || (curr_device.minor != engine_device.minor)) { @@ -91,6 +91,7 @@ bool _validate_shapes(std::vector inputs, c10::intrusive_ptr inputs, c10::intrusive_ptr compiled_engine, @@ -163,6 +164,7 @@ void setup_input_tensors( } } } + std::vector create_output_tensors(c10::intrusive_ptr compiled_engine) { std::vector outputs(compiled_engine->num_io.second); for (auto output_indices : compiled_engine->out_binding_map) { @@ -181,7 +183,270 @@ std::vector 
create_output_tensors(c10::intrusive_ptr comp return outputs; } +void create_output_allocator(c10::intrusive_ptr compiled_engine) { + if (compiled_engine->output_allocator == nullptr) { + std::unordered_map output_dtypes_dict; + for (size_t o = 0; o < compiled_engine->out_binding_names.size(); ++o) { + auto name = compiled_engine->out_binding_names[o]; + output_dtypes_dict[name] = + util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); + } + compiled_engine->output_allocator = std::make_shared(output_dtypes_dict); + } + for (const auto& output_name : compiled_engine->out_binding_names) { + if (!compiled_engine->exec_ctx->setOutputAllocator(output_name.c_str(), compiled_engine->output_allocator.get())) { + TORCHTRT_THROW_ERROR("Failed to set output allocator for " + output_name); + } + } +} + std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { + auto run_standard_execution = [&]() { + bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS); + bool shape_changed = _validate_shapes(inputs, compiled_engine); + + // Whether cudagraphs needs to record the graph on this pass + auto result = compiled_engine->runtime_states.set_runtime_states( + cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed); + + bool need_cudagraphs_record = std::get<0>(result); + bool can_use_pre_allocated_outputs = std::get<1>(result); + bool need_cudagraphs_reset = std::get<2>(result); + + if (need_cudagraphs_reset) { + compiled_engine->cudagraph.reset(); + } + + std::vector outputs(compiled_engine->num_io.second); + + // Intialize inputs and outputs to be available throughout the succeeding scopes + { // Input Setup + std::unique_ptr input_profiler_guard; + if (compiled_engine->profile_execution) { + input_profiler_guard = + std::make_unique(compiled_engine->input_profile_path); + } + + setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record); + // Check if input shapes can be inferred. + int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; + std::vector names(io_size); + int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data()); + TORCHTRT_CHECK( + nbNames == 0, + "The shapes of the inputs: " + << names + << " cannot be inferred. 
This could happen if the input tensor addresses/shapes haven't been configured correctly"); + } + + { // Output Setup + std::unique_ptr output_profiler_guard; + if (compiled_engine->profile_execution) { + output_profiler_guard = + std::make_unique(compiled_engine->output_profile_path); + } + if (can_use_pre_allocated_outputs) { + outputs = compiled_engine->pre_allocated_outputs; + } else { + outputs = create_output_tensors(compiled_engine); + } + + for (auto output_indices : compiled_engine->out_binding_map) { + auto pyt_idx = output_indices.second; + std::string name = compiled_engine->out_binding_names[pyt_idx]; + if (need_cudagraphs_record) { + // If we are recording the cuda graph then we need to update the persistent output buffer + compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone()); + } + + if (cudagraphs_enabled) { + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress( + name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()), + "Error while setting the output tensor address"); + } else { + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()), + "Error while setting the output tensor address"); + } + } + } + + auto current_device_id = -1; + if (inputs.size() > 0) { + current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart + } else if (outputs.size() > 0) { + current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart + } + + compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id); + if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) { + // Create a new stream if the engine stream is the default stream + compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id); + } + + { // Engine Execution (execute on engine stream) + c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream); + + std::unique_ptr enqueue_profiler_guard; + if (compiled_engine->profile_execution) { + enqueue_profiler_guard = + std::make_unique(compiled_engine->enqueue_profile_path); + } + + // Block engine stream until results are available on caller stream + at::cuda::CUDAEvent caller_exec_complete; + caller_exec_complete.record(compiled_engine->caller_stream); + caller_exec_complete.block(compiled_engine->engine_stream); + + if (!cudagraphs_enabled) { + // Direct execution uses the caller buffers directly + compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream); + } else { + if (need_cudagraphs_record) { + // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph + c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream; + compiled_engine->cudagraph.capture_begin(); + compiled_engine->exec_ctx->enqueueV3(recording_stream); + compiled_engine->cudagraph.capture_end(); + + if (compiled_engine->profile_execution) { + compiled_engine->cudagraph.debug_dump(compiled_engine->cuda_graph_debug_path); + } + } + + // Replay the CUDAGraph + compiled_engine->cudagraph.replay(); // Has a cudaDeviceSynchronize internally + } + } // End engine exeuction (resets to caller stream) + + // Create output buffer for next execution of graph or trt context. 
+ if (compiled_engine->use_pre_allocated_outputs) { + compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine); + } + + // Block caller stream until engine execution is complete + at::cuda::CUDAEvent trt_exec_complete; + trt_exec_complete.record(compiled_engine->engine_stream); + trt_exec_complete.block(compiled_engine->caller_stream); + + if (cudagraphs_enabled) { + // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream) + for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) { + outputs[o].copy_(compiled_engine->output_buffers[o], false); + } + } + + if (compiled_engine->profile_execution) { + LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler); + dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler); + compiled_engine->dump_engine_layer_info(); + } + + return outputs; + }; + + auto run_output_allocator = [&]() { + { // Input Setup + std::unique_ptr input_profiler_guard; + if (compiled_engine->profile_execution) { + input_profiler_guard = + std::make_unique(compiled_engine->input_profile_path); + } + + setup_input_tensors(inputs, compiled_engine, false, false); + // Check if input shapes can be inferred. + int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; + std::vector names(io_size); + int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data()); + TORCHTRT_CHECK( + nbNames == 0, + "The shapes of the inputs: " + << names + << " cannot be inferred. This could happen if the input tensor addresses/shapes haven't been configured correctly"); + } + + { // OutputAllocator Setup + std::unique_ptr output_allocator_profiler_guard; + if (compiled_engine->profile_execution) { + output_allocator_profiler_guard = + std::make_unique(compiled_engine->output_profile_path); + } + create_output_allocator(compiled_engine); + } + + auto current_device_id = -1; + if (inputs.size() > 0) { + current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart + } else { + current_device_id = at::cuda::current_device(); + } + + compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id); + if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) { + // Create a new stream if the engine stream is the default stream + compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id); + } + + { // Engine Execution (execute on engine stream) + c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream); + + std::unique_ptr enqueue_profiler_guard; + if (compiled_engine->profile_execution) { + enqueue_profiler_guard = + std::make_unique(compiled_engine->enqueue_profile_path); + } + + // Block engine stream until results are available on caller stream + at::cuda::CUDAEvent caller_exec_complete; + caller_exec_complete.record(compiled_engine->caller_stream); + caller_exec_complete.block(compiled_engine->engine_stream); + + // Direct execution uses the caller buffers directly + compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream); + + } // End engine exeuction (resets to caller stream) + + // Block caller stream until engine execution is complete + at::cuda::CUDAEvent trt_exec_complete; + trt_exec_complete.record(compiled_engine->engine_stream); + trt_exec_complete.block(compiled_engine->caller_stream); + + std::unique_ptr output_profiler_guard; + if (compiled_engine->profile_execution) { + output_profiler_guard = + 
std::make_unique(compiled_engine->output_profile_path); + } + std::vector outputs; + for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) { + auto name = compiled_engine->out_binding_names[i]; + auto dims = compiled_engine->output_allocator->getShapes().at(name); + auto dtype = + util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); + at::Tensor output = compiled_engine->output_allocator->getBuffers().at(name).clone().detach(); + int64_t prod = 1; + for (int i = 0; i < dims.nbDims; ++i) { + prod *= dims.d[i]; + } + std::vector shape(dims.nbDims); + for (int i = 0; i < dims.nbDims; ++i) { + shape[i] = dims.d[i]; + } + // When using the OutputAllocator, the allocated buffer might be larger than the size of the output, + // so we need to reshape the buffer to the output shape + output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(shape); + outputs.push_back(output); + } + + if (compiled_engine->profile_execution) { + LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler); + dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler); + compiled_engine->dump_engine_layer_info(); + } + + return outputs; + }; + LOG_DEBUG( "Attempting to run engine (ID: " << compiled_engine->name << "); Hardware Compatible: " << compiled_engine->hardware_compatible); @@ -203,22 +468,6 @@ std::vector execute_engine(std::vector inputs, c10::intr compiled_engine->cudagraph.enable_debug_mode(); } bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS); - bool shape_changed = _validate_shapes(inputs, compiled_engine); - - // Whether cudagraphs needs to record the graph on this pass - auto result = compiled_engine->runtime_states.set_runtime_states( - cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed); - - bool need_cudagraphs_record = std::get<0>(result); - bool can_use_pre_allocated_outputs = std::get<1>(result); - bool need_cudagraphs_reset = std::get<2>(result); - - if (need_cudagraphs_reset) { - compiled_engine->cudagraph.reset(); - } - - // Intialize inputs and outputs to be available throughout the succeeding scopes - std::vector outputs(compiled_engine->num_io.second); if (MULTI_DEVICE_SAFE_MODE) { std::unique_ptr device_profiler_guard; @@ -268,130 +517,26 @@ std::vector execute_engine(std::vector inputs, c10::intr } } - { // Input Setup - std::unique_ptr input_profiler_guard; - if (compiled_engine->profile_execution) { - input_profiler_guard = - std::make_unique(compiled_engine->input_profile_path); - } - - setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record); - // Check if input shapes can be inferred. - int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; - std::vector names(io_size); - int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data()); - TORCHTRT_CHECK( - nbNames == 0, - "The shapes of the inputs: " - << names - << " cannot be inferred. 
This could happen if the input tensor addresses/shapes haven't been configured correctly"); - } - - { // Output Setup - std::unique_ptr output_profiler_guard; - if (compiled_engine->profile_execution) { - output_profiler_guard = - std::make_unique(compiled_engine->output_profile_path); - } - if (can_use_pre_allocated_outputs) { - outputs = compiled_engine->pre_allocated_outputs; - } else { - outputs = create_output_tensors(compiled_engine); + if (compiled_engine->requires_output_allocator) { // engine requires OA + if (cudagraphs_enabled) { + TORCHTRT_THROW_ERROR( + "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs."); } - - for (auto output_indices : compiled_engine->out_binding_map) { - auto pyt_idx = output_indices.second; - std::string name = compiled_engine->out_binding_names[pyt_idx]; - if (need_cudagraphs_record) { - // If we are recording the cuda graph then we need to update the persistent output buffer - compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone()); - } - + LOG_DEBUG("Using the dynamic allocator runtime mode."); + return run_output_allocator(); + } else { + if (compiled_engine->use_output_allocator_outputs) { // users call OA context manager if (cudagraphs_enabled) { - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress( - name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()), - "Error while setting the output tensor address"); - } else { - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()), - "Error while setting the output tensor address"); + TORCHTRT_THROW_ERROR( + "Both CUDA Graphs and dynamic output allocation are enabled, which are incompatible runtime modes. 
Please disable one of the two."); } - } - } - - auto current_device_id = -1; - if (inputs.size() > 0) { - current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart - } else if (outputs.size() > 0) { - current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart - } - - compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id); - if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) { - // Create a new stream if the engine stream is the default stream - compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id); - } - - { // Engine Execution (execute on engine stream) - c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream); - - std::unique_ptr enqueue_profiler_guard; - if (compiled_engine->profile_execution) { - enqueue_profiler_guard = - std::make_unique(compiled_engine->enqueue_profile_path); - } - - // Block engine stream until results are available on caller stream - at::cuda::CUDAEvent caller_exec_complete; - caller_exec_complete.record(compiled_engine->caller_stream); - caller_exec_complete.block(compiled_engine->engine_stream); - - if (!cudagraphs_enabled) { - // Direct execution uses the caller buffers directly - compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream); + LOG_DEBUG("Using the dynamic allocator runtime mode."); + return run_output_allocator(); } else { - if (need_cudagraphs_record) { - // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph - c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream; - compiled_engine->cudagraph.capture_begin(); - compiled_engine->exec_ctx->enqueueV3(recording_stream); - compiled_engine->cudagraph.capture_end(); - - if (compiled_engine->profile_execution) { - compiled_engine->cudagraph.debug_dump(compiled_engine->cuda_graph_debug_path); - } - } - - // Replay the CUDAGraph - compiled_engine->cudagraph.replay(); // Has a cudaDeviceSynchronize internally + LOG_DEBUG("Using the standard execution runtime mode with cudagraphs=" << cudagraphs_enabled << "."); + return run_standard_execution(); } - } // End engine exeuction (resets to caller stream) - - // Create output buffer for next execution of graph or trt context. 
- if (compiled_engine->use_pre_allocated_outputs) { - compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine); } - - // Block caller stream until engine execution is complete - at::cuda::CUDAEvent trt_exec_complete; - trt_exec_complete.record(compiled_engine->engine_stream); - trt_exec_complete.block(compiled_engine->caller_stream); - - if (cudagraphs_enabled) { - // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream) - for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) { - outputs[o].copy_(compiled_engine->output_buffers[o], false); - } - } - - if (compiled_engine->profile_execution) { - LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler); - dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler); - compiled_engine->dump_engine_layer_info(); - } - - return outputs; } } // namespace runtime diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 3ded080b1d..c05be4e8aa 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -89,6 +89,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info) .def("infer_outputs", &TRTEngine::infer_outputs) .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs) + .def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs) .def_property( "device_memory_budget", &TRTEngine::get_device_memory_budget, @@ -130,6 +131,7 @@ TORCH_LIBRARY(tensorrt, m) { m.def("HW_COMPATIBLE_IDX", []() -> int64_t { return HW_COMPATIBLE_IDX; }); m.def("SERIALIZED_METADATA_IDX", []() -> int64_t { return SERIALIZED_METADATA_IDX; }); m.def("TARGET_PLATFORM_IDX", []() -> int64_t { return TARGET_PLATFORM_IDX; }); + m.def("REQUIRES_OUTPUT_ALLOCATOR_IDX", []() -> int64_t { return REQUIRES_OUTPUT_ALLOCATOR_IDX; }); m.def("SERIALIZATION_LEN", []() -> int64_t { return SERIALIZATION_LEN; }); m.def("_platform_linux_x86_64", []() -> std::string { auto it = get_platform_name_map().find(Platform::PlatformEnum::kLINUX_X86_64); diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index 6f1436c745..894df55bfe 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -16,7 +16,7 @@ namespace core { namespace runtime { using EngineID = int64_t; -const std::string ABI_VERSION = "6"; +const std::string ABI_VERSION = "7"; extern bool MULTI_DEVICE_SAFE_MODE; typedef enum { @@ -37,6 +37,7 @@ typedef enum { HW_COMPATIBLE_IDX, SERIALIZED_METADATA_IDX, TARGET_PLATFORM_IDX, + REQUIRES_OUTPUT_ALLOCATOR_IDX, SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO } SerializedInfoIndex; diff --git a/docsrc/py_api/runtime.rst b/docsrc/py_api/runtime.rst index 4ece390816..719d8f6555 100644 --- a/docsrc/py_api/runtime.rst +++ b/docsrc/py_api/runtime.rst @@ -19,12 +19,14 @@ Functions .. autofunction:: get_whole_cudagraphs_mode -.. autofunction:: set_cudagraphs_modue +.. autofunction:: set_cudagraphs_mode .. autofunction:: enable_pre_allocated_outputs .. autofunction:: weight_streaming +.. 
autofunction:: enable_output_allocator
+
 Classes
---------
diff --git a/docsrc/user_guide/runtime.rst b/docsrc/user_guide/runtime.rst index 8672fdebe4..5ca842514e 100644 --- a/docsrc/user_guide/runtime.rst +++ b/docsrc/user_guide/runtime.rst
@@ -92,3 +92,44 @@ Cudagraphs can accelerate certain models by reducing kernel overheads, as docume
 In the current implementation, use of a new input shape (for instance in dynamic shape cases), will cause the cudagraph to be re-recorded.
 Cudagraph recording is generally not latency intensive, and future improvements include caching cudagraphs for multiple input shapes.
+
+Dynamic Output Allocation Mode
+------------------------------
+
+Dynamic output allocation is a feature in Torch-TensorRT which allows the output buffer of TensorRT engines to be
+dynamically allocated. This is useful for models with dynamic output shapes, especially ops with data-dependent shapes.
+Dynamic output allocation mode cannot be used in conjunction with CUDA Graphs or the pre-allocated outputs feature.
+Without dynamic output allocation, the output buffer is allocated based on the output shape inferred from the input size.
+
+There are two scenarios in which dynamic output allocation is enabled:
+
+1. The model has been identified at compile time to require dynamic output allocation for at least one TensorRT subgraph.
+These models will engage the runtime mode automatically (with logging) and are incompatible with other runtime modes
+such as CUDA Graphs.
+
+Converters can declare that the subgraphs they produce will require the output allocator using `requires_output_allocator=True`,
+thereby forcing any model which utilizes the converter to automatically use the output allocator runtime mode. For example:
+
+.. code-block:: python
+
+    @dynamo_tensorrt_converter(
+        torch.ops.aten.nonzero.default,
+        supports_dynamic_shapes=True,
+        requires_output_allocator=True,
+    )
+    def aten_ops_nonzero(
+        ctx: ConversionContext,
+        target: Target,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        name: str,
+    ) -> Union[TRTTensor, Sequence[TRTTensor]]:
+        ...
+
+2. Users may manually enable dynamic output allocation mode via the ``torch_tensorrt.runtime.enable_output_allocator`` context manager.
+
+.. code-block:: python
+
+    # Enables Dynamic Output Allocation Mode, then resets the mode to its prior setting
+    with torch_tensorrt.runtime.enable_output_allocator(trt_module):
+        ...
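As a point of reference for scenario 1, a minimal end-to-end sketch might look like the following. The ``NonZeroExample`` module, the input tensor, and the ``min_block_size=1`` setting are illustrative assumptions (chosen only so the tiny graph is converted), not part of this change; the sketch simply assumes ``aten.nonzero`` is handled by the converter registered above.

.. code-block:: python

    import torch
    import torch_tensorrt

    class NonZeroExample(torch.nn.Module):  # hypothetical example model
        def forward(self, x):
            # torch.nonzero produces a data-dependent output shape
            return torch.nonzero(x > 0)

    model = NonZeroExample().eval().cuda()
    inputs = [torch.randn(8, 8).cuda()]

    # min_block_size=1 only ensures this small graph is converted to TensorRT
    trt_module = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs, min_block_size=1)

    # Because aten.nonzero requires an output allocator, the compiled module is
    # expected to engage the dynamic output allocation runtime mode automatically.
    print(trt_module(*inputs).shape)

The manual path (scenario 2) is the ``enable_output_allocator`` context manager shown above.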
diff --git a/examples/dynamo/converter_overloading.py b/examples/dynamo/converter_overloading.py index dc25a18287..e27c53cb50 100644 --- a/examples/dynamo/converter_overloading.py +++ b/examples/dynamo/converter_overloading.py @@ -58,12 +58,11 @@ def forward(self, x): from typing import Dict, Sequence, Tuple, Union +import tensorrt as trt from torch.fx.node import Argument, Node, Target from torch_tensorrt.dynamo import CompilationSettings from torch_tensorrt.dynamo.conversion import ConversionContext -import tensorrt as trt - # %% # Converter Metadata # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -80,6 +79,8 @@ def forward(self, x): supports_dynamic_shapes=True, # Set the priority of the converter to supersede the default one priority=torch_tensorrt.dynamo.conversion.ConverterPriority.HIGH, + # Whether the converter requires a dynamic output allocator to run (e.g. data dependent ops) + requires_output_allocator=True, ) # %% @@ -98,7 +99,7 @@ def forward(self, x): # # Finally there is the ``priority`` argument, which is an enum from the ``torch_tensorrt.dynamo.conversion.ConverterPriority`` class that defines the priority of the converter. The two options are ``HIGH`` and ``STANDARD``. # Converters registered with ``STANDARD`` will be appended to the converter list for a given operation, while converters registered with ``HIGH`` will be prepended to the list. -# Candidate converters are evalated for their suitablity in this priority order and the first converter that passes the validator is used. +# Candidate converters are evaluated for their suitability in this priority order and the first converter that passes the validator is used. # %%
diff --git a/py/torch_tensorrt/dynamo/_engine_cache.py b/py/torch_tensorrt/dynamo/_engine_cache.py index 83f75dc4e9..a6d9a1face 100644 --- a/py/torch_tensorrt/dynamo/_engine_cache.py +++ b/py/torch_tensorrt/dynamo/_engine_cache.py @@ -25,6 +25,7 @@ Sequence[Input], CompilationSettings, Optional[Dict[str, Any]], + bool, ] @@ -106,6 +107,7 @@ def pack( input_specs: Sequence[Input], compilation_settings: CompilationSettings, weight_name_map: Optional[Dict[Any, Any]], + requires_output_allocator: bool, ) -> bytes: """Pack serialized engine, input names, output names, and weight map into a single blob @@ -116,7 +118,7 @@ def pack( input_specs (Sequence[Input]): input specs of TRT engine compilation_settings (CompilationSettings): compilation settings of TRT engine weight_name_map (Optional[Dict[Any, Any]]): weight name map for refitting - + requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators) Returns: bytes: packed blob """ @@ -130,6 +132,7 @@ def pack( "input_specs": input_specs, "compilation_settings": settings, "weight_name_map": weight_name_map, + "requires_output_allocator": requires_output_allocator, } ) @@ -151,6 +154,7 @@ def unpack(packed_obj: bytes) -> UnpackedCacheHit: unpacked["input_specs"], unpacked["compilation_settings"], unpacked["weight_name_map"], + unpacked["requires_output_allocator"], ) def insert(
diff --git a/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py b/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py index 37581f76cd..0dbdb2a8f4 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py @@ -11,9 +11,11 @@ class ConversionContext: Args: net: TensorRT Network being built compilation_settings: Settings selected by the user for compilation + requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. 
data dependent operators) """ net: TRTNetwork compilation_settings: CompilationSettings = field( default_factory=CompilationSettings ) + requires_output_allocator: bool = False diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 1efacea619..eb1692e392 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -18,6 +18,7 @@ cast, ) +import tensorrt as trt import torch from torch import SymBool, SymFloat, SymInt from torch._ops import OpOverloadPacket @@ -26,8 +27,6 @@ from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.fx.converter_registry import CONVERTERS as FX_CONVERTERS -import tensorrt as trt - logger = logging.getLogger(__name__) LegacyConverterImplSignature = Callable[ @@ -81,6 +80,7 @@ class ConverterSupport: whether that node can be supported by its companion converter. Note that this function must not modify the node or its graph supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic inputs. + requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators). """ converter_implementation: ConverterImplSignature @@ -88,6 +88,7 @@ class ConverterSupport: default=lambda node, compilation_settings: True ) supports_dynamic_shapes: bool = False + requires_output_allocator: bool = False # Dictionary representing Dynamo aten-only converters @@ -197,6 +198,7 @@ def dynamo_tensorrt_converter( capability_validator: Optional[Callable[[Node, CompilationSettings], bool]] = None, priority: ConverterPriority = ConverterPriority.STANDARD, supports_dynamic_shapes: bool = False, + requires_output_allocator: bool = False, ) -> Callable[[ConverterImplSignature], ConverterImplSignature]: """Decorator for Dynamo TensorRT Converter @@ -212,6 +214,8 @@ def dynamo_tensorrt_converter( this means all nodes of "key" kind can be supported by this converter priority: Converter's level of priority relative to other converters with the same target + supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic shapes. + requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators). 
Returns: The converter being decorated """ @@ -225,6 +229,7 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat converter_support = ConverterSupport( converter_implementation=converter, supports_dynamic_shapes=supports_dynamic_shapes, + requires_output_allocator=requires_output_allocator, ) else: assert callable( @@ -234,6 +239,7 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat converter_implementation=converter, capability_validator=capability_validator, supports_dynamic_shapes=supports_dynamic_shapes, + requires_output_allocator=requires_output_allocator, ) # OpOverloadPackets are only valid if they have a single overload, or @@ -404,7 +410,7 @@ def __getitem_without_validation__( def __getitem__( self, node: Node ) -> Tuple[ - Any, CallingConvention + Any, CallingConvention, Dict[str, bool] ]: # TODO: Narrow to ConverterImplSignature this when we can remove FX converters """Get the first-found validated converter in any registry @@ -462,6 +468,10 @@ def __getitem__( return ( candidate.converter_implementation, calling_convention, + { + "supports_dynamic_shapes": candidate.supports_dynamic_shapes, + "requires_output_allocator": candidate.requires_output_allocator, + }, ) else: logger.debug( @@ -471,7 +481,14 @@ def __getitem__( else: # Assuming FX converters don't have dynamic shapes supported if not node_has_dynamic_shapes(node): - return converters, calling_convention + return ( + converters, + calling_convention, + { + "supports_dynamic_shapes": False, + "requires_output_allocator": False, + }, + ) raise KeyError( f"None of the converter registries have a validated entry for {key}, with node {node}" @@ -495,7 +512,7 @@ def get_unvalidated( def get( self, node: Node, value: Optional[ConverterImplSignature] = None ) -> Union[ - Any, Tuple[Any, CallingConvention] + Any, Tuple[Any, CallingConvention, Dict[str, bool]] ]: # TODO: Narrow to ConverterImplSignature this when we can remove FX converters """Get validated converter for input node with a default return""" try: diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 2f35a6d124..7f26a7c3e6 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -64,6 +64,7 @@ class TRTInterpreterResult(NamedTuple): input_names: Sequence[str] output_names: Sequence[str] weight_name_map: Optional[dict[Any, Any]] + requires_output_allocator: bool class TRTInterpreter(torch.fx.Interpreter): # type: ignore[misc] @@ -582,6 +583,7 @@ def _insert_engine_to_cache(self, hash_val: str, serialized_engine: bytes) -> No self.input_specs, self.compilation_settings, self.weight_name_map, + self.ctx.requires_output_allocator, ), ) @@ -596,6 +598,7 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]: cached_engine_input_specs, engine_compilation_settings, self.weight_name_map, + self.ctx.requires_output_allocator, ) = cached_data setting_compatiblity, incompattible_settings = settings_are_compatible( @@ -657,6 +660,7 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]: self._input_names, self._output_names, self.weight_name_map, + self.ctx.requires_output_allocator, ) return None @@ -742,6 +746,7 @@ def run( self._input_names, self._output_names, self.weight_name_map, + self.ctx.requires_output_allocator, ) def run_node(self, n: torch.fx.Node) -> torch.fx.Node: @@ -835,7 +840,7 @@ def 
call_module( f"Conversion of module of type {submod_type} not currently supported!" ) - converter, calling_convention = converter_packet + converter, calling_convention, _ = converter_packet assert self._cur_node_name is not None @@ -852,7 +857,10 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any: f"Conversion of function {torch.typename(target)} not currently supported!" ) - converter, calling_convention = converter_packet + converter, calling_convention, converter_info = converter_packet + if converter_info.get("requires_output_allocator", False): + self.ctx.requires_output_allocator = True + _LOGGER.debug(f"{target} requires output allocator") if calling_convention is CallingConvention.LEGACY: return converter(self.ctx.net, target, args, kwargs, self._cur_node_name) @@ -882,7 +890,7 @@ def call_method(self, target: str, args: Any, kwargs: Any) -> Any: raise UnsupportedOperatorException( f"Conversion of method {target} not currently supported!" ) - converter, calling_convention = converter_packet + converter, calling_convention, _ = converter_packet if calling_convention is CallingConvention.LEGACY: return converter(self.ctx.net, target, args, kwargs, self._cur_node_name) diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 1dad18989c..adb7039e7e 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -30,7 +30,7 @@ def infer_module_output_dtypes( """ outputs = [node for node in module.graph.nodes if node.op == "output"] outputs = outputs[0].args - return get_output_dtypes(outputs, truncate_double) + return get_output_dtypes(outputs, truncate_double) # type: ignore[no-any-return] def interpret_module_to_result( @@ -112,4 +112,5 @@ def convert_module( name=name, settings=settings, weight_name_map=interpreter_result.weight_name_map, + requires_output_allocator=interpreter_result.requires_output_allocator, ) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 2a9255ed68..e4b5d732b6 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -3552,3 +3552,24 @@ def aten_ops_full( fill_value=args[1], dtype=kwargs.get("dtype", None), ) + + +@dynamo_tensorrt_converter( + torch.ops.aten.nonzero.default, + supports_dynamic_shapes=True, + requires_output_allocator=True, +) +def aten_ops_nonzero( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.unary.nonzero( + ctx, + target, + SourceIR.ATEN, + name, + args[0], + ) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py index 34b667acf1..89e490392d 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py @@ -624,3 +624,18 @@ def native_dropout( mask = np.ones(input_val.shape, dtype=bool) mask = get_trt_tensor(ctx, mask, f"{name}_mask") return identity_layer.get_output(0), mask + + +def nonzero( + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR], + name: str, + input_val: TRTTensor, +) -> TRTTensor: + non_zero_layer = ctx.net.add_non_zero(input_val) + set_layer_name(non_zero_layer, target, f"{name}_non_zero", source_ir) + shuffle_layer = 
ctx.net.add_shuffle(non_zero_layer.get_output(0)) + shuffle_layer.first_transpose = trt.Permutation([1, 0]) + set_layer_name(shuffle_layer, target, f"{name}_transpose", source_ir) + return shuffle_layer.get_output(0)
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index b66f36c11e..2ecc45ecf3 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -13,6 +13,7 @@ from .remove_assert_nodes import remove_assert_nodes from .remove_detach import remove_detach from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones +from .remove_num_users_is_0_nodes import remove_num_users_is_0_nodes from .repair_input_as_output import repair_input_as_output from .replace_max_pool_with_indices import replace_max_pool_with_indices @@ -24,6 +25,7 @@ replace_max_pool_with_indices, remove_assert_nodes, accumulate_fp32_matmul, + remove_num_users_is_0_nodes, ] if not is_tegra_platform():
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_num_users_is_0_nodes.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_num_users_is_0_nodes.py new file mode 100644 index 0000000000..2a2c8e9d5e --- /dev/null +++ b/py/torch_tensorrt/dynamo/lowering/passes/remove_num_users_is_0_nodes.py @@ -0,0 +1,30 @@ +import logging + +import torch +from torch_tensorrt.dynamo._settings import CompilationSettings +from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( + clean_up_graph_after_modifications, +) + +logger = logging.getLogger(__name__) + + +def remove_num_users_is_0_nodes( + gm: torch.fx.GraphModule, settings: CompilationSettings +) -> torch.fx.GraphModule: + """Remove nodes that have no users ([num_users=0]) from the graph""" + nodes = list(gm.graph.nodes) + output_node = nodes[-1] + + for node in nodes[::-1]: + if ( + node != output_node + and len(node.users) == 0 + and len(node.all_input_nodes) > 0 + ): + gm.graph.erase_node(node) + gm = clean_up_graph_after_modifications(gm) + + logger.debug(f"Removed nodes with no users ([num_users=0]). Graph after removal:\n{gm.graph}") + + return gm
diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 45c3508458..685ec6ebef 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -31,8 +31,10 @@ def construct_dynamic_input( if isinstance(dim, torch.SymInt): min_max_opt = extract_var_range_info(dim) min_shape.append(min_max_opt["min"]) - # opt might not exist - opt_shape.append(min_max_opt.get("opt")) + # if opt does not exist, set it to the mean of min and max + opt_shape.append( + min_max_opt.get("opt", int((min_max_opt["min"] + min_max_opt["max"]) / 2)) + ) max_shape.append(min_max_opt["max"]) else: min_shape.append(dim)
diff --git a/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py index 1cc6d6c785..b3ac25bc3a 100644 --- a/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py @@ -32,6 +32,7 @@ def __init__( self._input_buffers: List[torch.Tensor] = [] self._output_buffers: List[torch.Tensor] = [] self.cudagraph: Optional[torch.cuda.CUDAGraph] = None + self.use_output_allocator_outputs = False self.shape_key: Optional[str] = None self._caller_stream: Optional[torch.cuda.Stream] = None self._engine_stream: 
Optional[torch.cuda.Stream] = None @@ -73,6 +74,9 @@ def __del__(self) -> None: if self.cudagraph: self.cudagraph.reset() + def set_use_output_allocator(self, enable: bool) -> None: + self.use_output_allocator_outputs = enable + def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: cudagraphs_enabled = torch_tensorrt.runtime.get_whole_cudagraphs_mode() if cudagraphs_enabled: diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 9086de657f..891d063ed3 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -23,6 +23,41 @@ logger = logging.getLogger(__name__) +class DynamicOutputAllocator(trt.IOutputAllocator): # type: ignore[misc] + def __init__(self, output_dtypes: Dict[str, torch.dtype]) -> None: + trt.IOutputAllocator.__init__(self) + self.buffers: Dict[str, torch.Tensor] = {} + self.shapes: Dict[str, Tuple[int, ...]] = {} + self.dtypes: Dict[str, torch.dtype] = output_dtypes + + def reallocate_output_async( + self, + tensor_name: str, + memory: int, + size: int, + alignment: int, + stream: torch.cuda.Stream, + ) -> Any: + shape = (size,) + if tensor_name not in self.buffers: + self.buffers[tensor_name] = torch.empty( + shape, + dtype=self.dtypes[tensor_name], + device=torch.cuda.current_device(), + ) + else: + if self.buffers[tensor_name].shape != shape: + self.buffers[tensor_name] = torch.empty( + shape, + dtype=self.dtypes[tensor_name], + device=torch.cuda.current_device(), + ) + return self.buffers[tensor_name].data_ptr() + + def notify_shape(self, tensor_name: str, shape: Tuple[int, ...]) -> None: + self.shapes[tensor_name] = tuple(shape) + + class TorchTRTRuntimeStates: def __init__(self, new_cudagraphs: bool): # Indicates whether CUDAGraphs were enabled in the previous execute_engine @@ -92,6 +127,7 @@ def __init__( name: str = "", settings: CompilationSettings = CompilationSettings(), weight_name_map: Optional[dict[Any, Any]] = None, + requires_output_allocator: bool = False, ): """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs a PyTorch ``torch.nn.Module`` around it. Uses TensorRT Python APIs to run the engine @@ -105,6 +141,7 @@ def __init__( name (str): Name for module settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed weight_name_map (dict): Mapping of engine weight name to state_dict weight name + requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. 
data dependent operators) Example: @@ -164,9 +201,15 @@ def __init__( self.runtime_states = TorchTRTRuntimeStates( torch_tensorrt.runtime.get_cudagraphs_mode() ) + + self.cudagraphs_enabled = False self.pre_allocated_outputs: List[torch.Tensor] = [] self.use_pre_allocated_outputs = False + self.requires_output_allocator = requires_output_allocator + self.output_allocator: Optional[DynamicOutputAllocator] = None + self.use_output_allocator_outputs = False + if self.serialized_engine is not None and not self.settings.lazy_engine_init: self.setup_engine() @@ -238,6 +281,9 @@ def setup_engine(self) -> None: for output_name in self.output_names ] + if self.requires_output_allocator: + self.create_output_allocator() + if torch_tensorrt.runtime.get_cudagraphs_mode(): self.cudagraph = torch.cuda.CUDAGraph() @@ -358,27 +404,26 @@ def create_output_tensors(self) -> List[torch.Tensor]: def set_pre_allocated_outputs(self, enable: bool) -> None: self.use_pre_allocated_outputs = enable + def set_use_output_allocator(self, enable: bool) -> None: + self.use_output_allocator_outputs = enable + + def create_output_allocator(self) -> None: + if self.output_allocator is None: + output_dtypes_dict = {} + for o, output_name in enumerate(self.output_names): + output_dtypes_dict[output_name] = self.output_dtypes[o] + self.output_allocator = DynamicOutputAllocator(output_dtypes_dict) + def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: - # Ensure inputs are available in all scopes and cast symbolic integers to Tensors - contiguous_inputs: List[torch.Tensor] = [ - (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) - for i in inputs - ] - with ( - torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward") - if self.profiling_enabled - else nullcontext() - ): - self._check_initialized() - cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() + def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]: shape_changed = self.validate_input_shapes(inputs) ( need_cudagraphs_record, can_use_pre_allocated_outputs, need_cudagraphs_reset, ) = self.runtime_states.set_runtime_states( - cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed + self.cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed ) if need_cudagraphs_reset and self.cudagraph: @@ -389,38 +434,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
self._input_buffers = [None] * len(self.input_names) self._output_buffers = [None] * len(self.output_names) - # If in safe mode, check at each iteration for whether a switch is required - if ( - torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE - ): - curr_device_id = torch.cuda.current_device() - curr_device_properties = torch.cuda.get_device_properties( - curr_device_id - ) - logger.debug(f"Current Device: cuda:{curr_device_id}") - - # If a switch is required, move all inputs to new device and set as active device - if _is_switch_required( - curr_device_id, - self.target_device_id, - curr_device_properties, - self.target_device_properties, - ): - device_id, _ = _select_rt_device( - curr_device_id, - self.target_device_id, - self.target_device_properties, - ) - - # Update current device - device = torch.device(device_id) - torch.cuda.set_device(device_id) - - contiguous_inputs = [ - tensor.to(device) for tensor in contiguous_inputs - ] - logger.warning(f"Moved all input Tensors to cuda:{device_id}") - with ( torch.autograd.profiler.record_function( "PythonTorchTensorRTModule:ProcessInputs" @@ -433,7 +446,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." self.setup_input_tensors( - contiguous_inputs, cudagraphs_enabled, need_cudagraphs_record + contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record ) if shape_changed: @@ -469,7 +482,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . if need_cudagraphs_record: self._output_buffers[o] = outputs[o].clone() - if cudagraphs_enabled: + if self.cudagraphs_enabled: self.context.set_tensor_address( output_name, self._output_buffers[o].data_ptr() ) @@ -495,7 +508,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self._engine_stream.wait_stream(self._caller_stream) with torch.cuda.stream(self._engine_stream): - if cudagraphs_enabled: + if self.cudagraphs_enabled: if need_cudagraphs_record: self.cudagraph = torch.cuda.CUDAGraph() @@ -527,7 +540,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . if self.use_pre_allocated_outputs: self.pre_allocated_outputs = self.create_output_tensors() - if cudagraphs_enabled: + if self.cudagraphs_enabled: for idx, o in enumerate(outputs): o.copy_(self._output_buffers[idx]) @@ -536,6 +549,158 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . return outputs + def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]: + assert ( + not torch_tensorrt.runtime.get_cudagraphs_mode() + ), "CUDA Graphs are not compatible with OutputAllocator." + with ( + torch.autograd.profiler.record_function( + "PythonTorchTensorRTModule:ProcessInputs" + ) + if self.profiling_enabled + else nullcontext() + ): + assert len(contiguous_inputs) == len( + self.input_names + ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." 
+ + self.setup_input_tensors(contiguous_inputs, False, False) + + with ( + torch.autograd.profiler.record_function( + "PythonTorchTensorRTModule:SetupOutputAllocator" + ) + if self.profiling_enabled + else nullcontext() + ): + self.create_output_allocator() + # need to set output allocator every run + for output_name in self.output_names: + if not self.context.set_output_allocator( + output_name, self.output_allocator + ): + raise RuntimeError( + f"Failed to set output allocator for {output_name}" + ) + + with ( + torch.autograd.profiler.record_function( + "PythonTorchTensorRTModule:TensorRTRuntime" + ) + if self.profiling_enabled + else nullcontext() + ): + self._caller_stream = torch.cuda.current_stream() + if ( + self._engine_stream == torch.cuda.default_stream() + or self._engine_stream is None + ): + self._engine_stream = torch.cuda.Stream() + + self._engine_stream.wait_stream(self._caller_stream) + + with torch.cuda.stream(self._engine_stream): + self.context.execute_async_v3( + self._engine_stream.cuda_stream + ) # The OutputAllocator is called by execute_async_v3() + + self._caller_stream.wait_stream(self._engine_stream) + + with ( + torch.autograd.profiler.record_function( + "PythonTorchTensorRTModule:ProcessOutputs" + ) + if self.profiling_enabled + else nullcontext() + ): + outputs = [] + assert self.output_allocator is not None + for o, output_name in enumerate(self.output_names): + shape = self.output_allocator.shapes.get(output_name, None) + dtype = self.output_dtypes[o] + output = ( + self.output_allocator.buffers.get(output_name, None) + .clone() + .detach() + ) + prod = int(torch.prod(torch.tensor(shape))) + # When using the OutputAllocator, the allocated buffer might be larger than the size of the output, + # so we need to reshape the buffer to the output shape + output = output.reshape(-1).view(dtype)[:prod].reshape(shape) + outputs.append(output) + + if len(outputs) == 1: + return outputs[0] + + return outputs + + self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() + + # Run forward function + contiguous_inputs: List[torch.Tensor] = [ + (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda()) + for i in inputs + ] + with ( + torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward") + if self.profiling_enabled + else nullcontext() + ): + self._check_initialized() + + # If in safe mode, check at each iteration for whether a switch is required + if ( + torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE + ): + curr_device_id = torch.cuda.current_device() + curr_device_properties = torch.cuda.get_device_properties( + curr_device_id + ) + logger.debug(f"Current Device: cuda:{curr_device_id}") + + # If a switch is required, move all inputs to new device and set as active device + if _is_switch_required( + curr_device_id, + self.target_device_id, + curr_device_properties, + self.target_device_properties, + ): + device_id, _ = _select_rt_device( + curr_device_id, + self.target_device_id, + self.target_device_properties, + ) + + # Update current device + device = torch.device(device_id) + torch.cuda.set_device(device_id) + + contiguous_inputs = [ + tensor.to(device) for tensor in contiguous_inputs + ] + logger.warning(f"Moved all input Tensors to cuda:{device_id}") + + if self.requires_output_allocator: # engine requires OA + if self.cudagraphs_enabled: + raise RuntimeError( + "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. 
Please disable CUDA Graphs." + ) + logger.debug("Using the dynamic allocator runtime mode.") + return run_output_allocator() + else: + if self.use_output_allocator_outputs: # users call OA context manager + if self.cudagraphs_enabled: + raise RuntimeError( + "Both CUDA Graphs and dynamic output allocation are enabled, which are incompatible runtime modes. Please disable one of the two." + ) + logger.debug("Using the dynamic allocator runtime mode.") + return run_output_allocator() + else: + logger.debug( + f"Using the standard execution runtime mode with cudagraphs={self.cudagraphs_enabled}." + ) + return run_standard_execution() + def enable_profiling(self, profiler: "trt.IProfiler" = None) -> None: """ Enable TensorRT profiling. After calling this function, TensorRT will report diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index b809e70ddf..e6b6a21421 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -34,6 +34,7 @@ HW_COMPATIBLE_IDX = -1 # Not implemented SERIALIZED_METADATA_IDX = -1 # Not implemented TARGET_PLATFORM_IDX = -1 # Not implemented +REQUIRES_OUTPUT_ALLOCATOR_IDX = -1 # Not implemented SERIALIZATION_LEN = -1 # Not implemented if ENABLED_FEATURES.torch_tensorrt_runtime: @@ -46,7 +47,10 @@ HW_COMPATIBLE_IDX = torch.ops.tensorrt.HW_COMPATIBLE_IDX() # 6 SERIALIZED_METADATA_IDX = torch.ops.tensorrt.SERIALIZED_METADATA_IDX() # 7 TARGET_PLATFORM_IDX = torch.ops.tensorrt.TARGET_PLATFORM_IDX() # 8 - SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN() # 9 + REQUIRES_OUTPUT_ALLOCATOR_IDX = ( + torch.ops.tensorrt.REQUIRES_OUTPUT_ALLOCATOR_IDX() + ) # 9 + SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN() # 10 @for_all_methods(needs_torch_tensorrt_runtime) @@ -79,6 +83,7 @@ def __init__( name: str = "", settings: CompilationSettings = CompilationSettings(), # Assumes engine was built with default compilation settings if object not passed weight_name_map: Optional[dict[Any, Any]] = None, + requires_output_allocator: bool = False, ): """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs a PyTorch ``torch.nn.Module`` around it. Uses the Torch-TensorRT runtime extension to run the engines @@ -97,6 +102,7 @@ def __init__( name (str): Name for module settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed weight_name_map (dict): Mapping of engine weight name to state_dict weight name + requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. 
data dependent operators) Example: @@ -132,6 +138,7 @@ def __init__( self.weight_name_map = weight_name_map self.serialized_engine = serialized_engine self.engine = None + self.requires_output_allocator = requires_output_allocator if ( serialized_engine @@ -146,7 +153,10 @@ def _pack_engine_info(self) -> List[str | bytes]: if self.settings.device is not None else Device._current_device() ) - metadata = {"settings": self.settings, "weight_name_map": self.weight_name_map} + metadata = { + "settings": self.settings, + "weight_name_map": self.weight_name_map, + } target_platform = ( Platform.current_platform() if not self.settings.enable_cross_compile_for_windows @@ -171,6 +181,9 @@ def _pack_engine_info(self) -> List[str | bytes]: engine_info[HW_COMPATIBLE_IDX] = str(int(self.hardware_compatible)) engine_info[SERIALIZED_METADATA_IDX] = self.encode_metadata(metadata) engine_info[TARGET_PLATFORM_IDX] = target_platform._to_serialized_rt_platform() + engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = str( + int(self.requires_output_allocator) + ) return engine_info @@ -256,7 +269,12 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: serialized_engine_info[ENGINE_IDX] ) self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) - self.hardware_compatible = bool(int(state[1][HW_COMPATIBLE_IDX])) + self.hardware_compatible = bool( + int(serialized_engine_info[HW_COMPATIBLE_IDX]) + ) + self.requires_output_allocator = bool( + int(serialized_engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]) + ) serialized_metadata = serialized_engine_info[SERIALIZED_METADATA_IDX] assert isinstance(serialized_metadata, bytes) @@ -275,6 +293,9 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: def set_pre_allocated_outputs(self, enable: bool) -> None: self.engine.use_pre_allocated_outputs = enable + def set_use_output_allocator(self, enable: bool) -> None: + self.engine.use_output_allocator_outputs = enable + def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: """Implementation of the forward pass for a TensorRT engine diff --git a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py index eee743c497..f481c5b2b8 100644 --- a/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py +++ b/py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py @@ -93,6 +93,9 @@ def __init__(self, engine_info: List[str]) -> None: self.serialized_metadata = engine_info[ torch.ops.tensorrt.SERIALIZED_METADATA_IDX() ] + self.requires_output_allocator = engine_info[ + torch.ops.tensorrt.REQUIRES_OUTPUT_ALLOCATOR_IDX() + ] self.target_platform = engine_info[torch.ops.tensorrt.TARGET_PLATFORM_IDX()] @classmethod diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index 470074a377..cfc9b322b5 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -9,5 +9,6 @@ set_cudagraphs_mode, ) from torch_tensorrt.runtime._multi_device_safe_mode import set_multi_device_safe_mode +from torch_tensorrt.runtime._output_allocator import enable_output_allocator from torch_tensorrt.runtime._pre_allocated_outputs import enable_pre_allocated_outputs from torch_tensorrt.runtime._weight_streaming import weight_streaming diff --git a/py/torch_tensorrt/runtime/_cudagraphs.py b/py/torch_tensorrt/runtime/_cudagraphs.py index d1564cb4dc..c771564826 100644 --- a/py/torch_tensorrt/runtime/_cudagraphs.py +++ 
b/py/torch_tensorrt/runtime/_cudagraphs.py @@ -74,7 +74,15 @@ def __enter__(self) -> torch.nn.Module: num_torch_module = 0 num_trt_module = 0 - for name, _ in self.compiled_module.named_children(): + for name, module in self.compiled_module.named_children(): + # need to disable cudagraphs if any model requires output allocator + if ( + hasattr(module, "requires_output_allocator") + and module.requires_output_allocator + ): + raise RuntimeError( + "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs." + ) if "_run_on_acc" in name: num_trt_module += 1 elif "_run_on_gpu" in name: diff --git a/py/torch_tensorrt/runtime/_output_allocator.py b/py/torch_tensorrt/runtime/_output_allocator.py new file mode 100644 index 0000000000..163fc26306 --- /dev/null +++ b/py/torch_tensorrt/runtime/_output_allocator.py @@ -0,0 +1,51 @@ +import logging +from typing import Any, Union + +import torch +from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule +from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( + CudaGraphsTorchTensorRTModule, +) + +logger = logging.getLogger(__name__) + + +class _OutputAllocatorContextManager(object): + """ + Helper class to set up output_allocator + """ + + def __init__( + self, module: Union[torch.fx.GraphModule, CudaGraphsTorchTensorRTModule] + ) -> None: + if isinstance(module, CudaGraphsTorchTensorRTModule): + rt_mods = [module] + else: + rt_mods = [] + + for name, rt_mod in module.named_children(): + if "_run_on_acc" in name and isinstance( + rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) + ): + rt_mods.append(rt_mod) + + self.rt_mods = rt_mods + + def set_output_allocator_output(self, enable: bool) -> None: + for mod in self.rt_mods: + mod.set_use_output_allocator(enable) + + def __enter__(self) -> "_OutputAllocatorContextManager": + # Enable output_allocator for TRT submodules + self.set_output_allocator_output(True) + return self + + def __exit__(self, *args: Any) -> None: + # Disable output_allocator + self.set_output_allocator_output(False) + + +def enable_output_allocator( + module: torch.fx.GraphModule, +) -> _OutputAllocatorContextManager: + return _OutputAllocatorContextManager(module) diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 9813548a10..6ff45507a0 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -207,6 +207,7 @@ def run_test( input_binding_names=list(interpreter_result.input_names), output_binding_names=list(interpreter_result.output_names), name="test_engine", + requires_output_allocator=interpreter_result.requires_output_allocator, ) mod = mod.cuda() if pyt_inputs is not None: @@ -289,6 +290,7 @@ def run_test_custom_compare_results( input_binding_names=list(interpreter_result.input_names), output_binding_names=list(interpreter_result.output_names), name="test_engine", + requires_output_allocator=interpreter_result.requires_output_allocator, ) res_trt = trt_mod(*cuda_inputs).cpu() res_cpu = mod(*cuda_inputs).cpu() diff --git a/tests/py/dynamo/conversion/test_nonzero_aten.py b/tests/py/dynamo/conversion/test_nonzero_aten.py new file mode 100644 index 0000000000..f2c5123575 --- /dev/null +++ b/tests/py/dynamo/conversion/test_nonzero_aten.py @@ -0,0 +1,144 @@ +import torch +import torch.nn as nn +from parameterized import parameterized +from torch.testing._internal.common_utils import run_tests +from 
torch_tensorrt import Input + +from .harness import DispatchTestCase + + +class TestNonZeroConverter(DispatchTestCase): + @parameterized.expand( + [ + ((10,), torch.int), + ((1, 20), torch.int32), + ((2, 3), torch.int64), + ((2, 3, 4), torch.float), + ((2, 3, 4, 5), torch.float), + ] + ) + def test_nonzero_dds(self, input_shape, dtype): + class NonZero(nn.Module): + # This is a DDS network + def forward(self, input): + out = torch.ops.aten.nonzero.default(input) + return out + + inputs = [torch.randint(low=0, high=3, size=input_shape, dtype=dtype)] + self.run_test( + NonZero(), + inputs, + ) + + @parameterized.expand( + [ + ((10,), torch.int), + ((1, 20), torch.int32), + ((2, 3), torch.int64), + ((2, 3, 4), torch.float), + ((2, 3, 4, 5), torch.float), + ] + ) + def test_nonzero_non_dds(self, input_shape, dtype): + class NonZero(nn.Module): + # This is a static network + def forward(self, input): + out = torch.ops.aten.nonzero.default(input) + out = torch.ops.aten.sum.dim_IntList(out, 0) + return out + + inputs = [torch.randint(low=0, high=3, size=input_shape, dtype=dtype)] + self.run_test( + NonZero(), + inputs, + ) + + @parameterized.expand( + [ + ( + "1d", + (1,), + (10,), + (100,), + torch.int32, + ), + ( + "2d", + (1, 2), + (5, 10), + (20, 40), + torch.float16, + ), + ( + "3d", + (1, 2, 3), + (5, 10, 20), + (30, 40, 50), + torch.float, + ), + ] + ) + def test_nonzero_dynamic_shape_dds(self, _, min_shape, opt_shape, max_shape, dtype): + class NonZero(nn.Module): + def forward(self, input): + return torch.ops.aten.nonzero.default(input) + + input_specs = [ + Input( + min_shape=min_shape, + opt_shape=opt_shape, + max_shape=max_shape, + dtype=dtype, + ), + ] + + self.run_test_with_dynamic_shape(NonZero(), input_specs) + + @parameterized.expand( + [ + ( + "1d", + (1,), + (10,), + (100,), + torch.int32, + ), + ( + "2d", + (1, 2), + (5, 10), + (20, 40), + torch.float16, + ), + ( + "3d", + (1, 2, 3), + (5, 10, 20), + (30, 40, 50), + torch.float, + ), + ] + ) + def test_nonzero_dynamic_shape_non_dds( + self, _, min_shape, opt_shape, max_shape, dtype + ): + class NonZero(nn.Module): + def forward(self, input): + out = torch.ops.aten.nonzero.default(input) + out = torch.ops.aten.sum.dim_IntList(out, 0) + return out + + input_specs = [ + Input( + min_shape=min_shape, + opt_shape=opt_shape, + max_shape=max_shape, + dtype=dtype, + ), + ] + + self.run_test_with_dynamic_shape(NonZero(), input_specs) + + +if __name__ == "__main__": + run_tests() diff --git a/tests/py/dynamo/runtime/test_output_allocator.py b/tests/py/dynamo/runtime/test_output_allocator.py new file mode 100644 index 0000000000..c915f42173 --- /dev/null +++ b/tests/py/dynamo/runtime/test_output_allocator.py @@ -0,0 +1,479 @@ +import pytest +import torch +import torch_tensorrt +from parameterized import parameterized +from torch.testing._internal.common_utils import TestCase, run_tests + +from ..testing_utilities import DECIMALS_OF_AGREEMENT + +INPUT_SIZE = (3, 16, 16) +TRIALS = 5 + + +class StaticModel(torch.nn.Module): + def forward(self, input): + return torch.ops.aten.abs.default(input) + + +class DDSModel(torch.nn.Module): + def forward(self, input): + return torch.ops.aten.nonzero.default(input) + + +class DDSOpWithReductionOpModel(torch.nn.Module): + """ + DDSOpWithReductionOpModel is a model that contains DDS op + reduction op. + Since nonzero requires output allocator, this model will use output allocator by default. 
+ """ + + def forward(self, inputs): + out = torch.ops.aten.nonzero.default(inputs) + out = torch.ops.aten.sum.dim_IntList(out, 0) + return out + + +class DDSModel2(torch.nn.Module): + def forward(self, input): + # combination of multiple non-zero and other ops + out = torch.ops.aten.nonzero.default(input) + out = torch.ops.aten.abs.default(out) + out = torch.ops.aten.nonzero.default(out) + return out + + +class TestOutputAllocatorStaticModel(TestCase): + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + model = StaticModel().eval().cuda() + inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + + ref_out = model(*inputs) + + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + cg_out = cudagraphs_module(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - cg_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="CUDA Graphs runtime outputs don't match with the original model.", + ) + + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + oa_out = compiled_model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_default(self, _, use_python_runtime): + """ + Static models use standard execution with cudagraphs=False by default. + """ + model = StaticModel().eval().cuda() + inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + standard_out = compiled_model(*inputs) + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - standard_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Default standard execution (cudagraphs=False) outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_combination_of_cg_and_oa(self, _, use_python_runtime): + model = StaticModel().eval().cuda() + inputs = [torch.randn((2, 3), dtype=torch.float).cuda()] + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + + with pytest.raises( + RuntimeError, + match="Both CUDA Graphs and dynamic output allocation are enabled, which are incompatible runtime modes. Please disable one of the two.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + with torch_tensorrt.runtime.enable_output_allocator(cudagraphs_module): + out = cudagraphs_module(*inputs) + + with pytest.raises( + RuntimeError, + match="Both CUDA Graphs and dynamic output allocation are enabled, which are incompatible runtime modes. 
Please disable one of the two.", + ): + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + out = cudagraphs_module(*inputs) + + +class TestOutputAllocatorDDSModel(TestCase): + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + model = DDSModel().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + cg_out = cudagraphs_module(*inputs) + + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + oa_out = compiled_model(*inputs) + + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_default(self, _, use_python_runtime): + """ + DDS models use OutputAllocator by default. + """ + model = DDSModel().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + oa_out = compiled_model(*inputs) + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Default Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_combination_of_cg_and_oa(self, _, use_python_runtime): + model = DDSModel().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + with torch_tensorrt.runtime.enable_output_allocator(cudagraphs_module): + out = cudagraphs_module(*inputs) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + out = cudagraphs_module(*inputs) + + +class TestOutputAllocatorDDSOpWithReductionOpModel(TestCase): + """ + The DDSOpWithReductionOpModel is a model that contains DDS op + reduction op. 
+ """ + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + model = DDSOpWithReductionOpModel().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + cg_out = cudagraphs_module(*inputs) + + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + oa_out = compiled_model(*inputs) + + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_default(self, _, use_python_runtime): + """ + The DDSOpWithReductionOpModel is a model that contains nonzero op + reduction op, in which nonzero op requires output allocator. + """ + model = DDSOpWithReductionOpModel().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + oa_out = compiled_model(*inputs) + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Default Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_combination_of_cg_and_oa(self, _, use_python_runtime): + model = DDSOpWithReductionOpModel().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + ) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + with torch_tensorrt.runtime.enable_output_allocator(cudagraphs_module): + out = cudagraphs_module(*inputs) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. 
Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + out = cudagraphs_module(*inputs) + + +class TestOutputAllocatorDDSModelWithGraphBreak(TestCase): + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_cudagraphs_and_output_allocator(self, _, use_python_runtime): + model = DDSModel2().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + torch_executed_ops={"torch.ops.aten.abs.default"}, + ) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + cg_out = cudagraphs_module(*inputs) + + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + oa_out = compiled_model(*inputs) + + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_default(self, _, use_python_runtime): + """ + Use Output Allocator by default. + """ + model = DDSModel2().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + torch_executed_ops={"torch.ops.aten.abs.default"}, + ) + oa_out = compiled_model(*inputs) + ref_out = model(*inputs) + + self.assertAlmostEqual( + float(torch.max(torch.abs(ref_out - oa_out))), + 0, + DECIMALS_OF_AGREEMENT, + msg="Default Output Allocator runtime outputs don't match with the original model.", + ) + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_combination_of_cg_and_oa(self, _, use_python_runtime): + model = DDSModel2().eval().cuda() + inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).to("cuda"),) + compiled_model = torch_tensorrt.compile( + model, + "dynamo", + inputs, + min_block_size=1, + use_python_runtime=use_python_runtime, + torch_executed_ops={"torch.ops.aten.abs.default"}, + ) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + with torch_tensorrt.runtime.enable_output_allocator(cudagraphs_module): + out = cudagraphs_module(*inputs) + + with pytest.raises( + RuntimeError, + match="The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs.", + ): + with torch_tensorrt.runtime.enable_output_allocator(compiled_model): + with torch_tensorrt.runtime.enable_cudagraphs( + compiled_model + ) as cudagraphs_module: + out = cudagraphs_module(*inputs) + + +if __name__ == "__main__": + run_tests()
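For reference, the runtime mode these tests exercise can be driven directly from user code. A minimal usage sketch, assuming only the entry points that appear in the tests above (the model and variable names are illustrative):

import torch
import torch_tensorrt


class NonZeroModel(torch.nn.Module):
    # nonzero is a data-dependent-shape (DDS) op: its output size depends on the input values
    def forward(self, x):
        return torch.ops.aten.nonzero.default(x)


model = NonZeroModel().eval().cuda()
inputs = (torch.randint(low=0, high=3, size=(10,), dtype=torch.int).cuda(),)
trt_model = torch_tensorrt.compile(
    model, "dynamo", inputs, min_block_size=1, use_python_runtime=True
)

# Engines flagged with requires_output_allocator fall back to the dynamic output
# allocator automatically; the context manager opts the remaining TRT submodules in
# explicitly and is mutually exclusive with enable_cudagraphs.
with torch_tensorrt.runtime.enable_output_allocator(trt_model):
    out = trt_model(*inputs)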
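The Python forward path shown earlier calls self.create_output_allocator() and then reads self.output_allocator.shapes and self.output_allocator.buffers, but the allocator class itself is defined outside this hunk. A minimal sketch of what such an allocator can look like, assuming the TensorRT Python bindings expose trt.IOutputAllocator with reallocate_output / notify_shape callbacks (the class name and exact callback signatures are assumptions here, not taken from this patch):

import tensorrt as trt
import torch


class _DynamicOutputAllocator(trt.IOutputAllocator):  # hypothetical name
    def __init__(self, output_dtypes):
        super().__init__()
        self.dtypes = output_dtypes  # tensor name -> torch dtype
        self.buffers = {}  # tensor name -> torch.Tensor backing the output
        self.shapes = {}  # tensor name -> final shape reported by TensorRT

    def reallocate_output(self, tensor_name, memory, size, alignment):
        # Allocate (or reuse) a CUDA buffer with at least `size` elements of the
        # output dtype; over-allocation is fine because the forward path slices
        # the buffer down to the shape reported via notify_shape.
        buf = self.buffers.get(tensor_name)
        if buf is None or buf.numel() < size:
            buf = torch.empty(
                (size,), dtype=self.dtypes[tensor_name], device="cuda"
            )
            self.buffers[tensor_name] = buf
        return buf.data_ptr()

    def notify_shape(self, tensor_name, shape):
        # TensorRT reports each output's final shape once it is known.
        self.shapes[tensor_name] = tuple(shape)

Such an allocator would then be registered per output via context.set_output_allocator(output_name, allocator) before execute_async_v3, as the forward hunk above does on every run.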