From e9cc026aac4776d402602d0b7ff86917281e8b3e Mon Sep 17 00:00:00 2001 From: lucylq Date: Tue, 26 Aug 2025 16:24:59 -0700 Subject: [PATCH 1/2] Use extension module for pybindings Deprecate the module class in pybindings, previously it was blocked by bundled program, and now it's resolved with https://github.com/pytorch/executorch/blob/2bb567f15a54fbc0ef621e5e85ff24e5da505b11/extension/module/bundled_module.h#L4 Differential Revision: [D70516347](https://our.internmc.facebook.com/intern/diff/D70516347/) [ghstack-poisoned] --- exir/backend/test/test_backends.py | 6 +- exir/backend/test/test_backends_lifted.py | 5 +- exir/backend/test/test_compatibility.py | 3 +- extension/pybindings/pybindings.cpp | 349 ++++-------------- .../extension/pybindings/pybindings.bzl | 4 + 5 files changed, 80 insertions(+), 287 deletions(-) diff --git a/exir/backend/test/test_backends.py b/exir/backend/test/test_backends.py index 544b97bb53c..68b1845e484 100644 --- a/exir/backend/test/test_backends.py +++ b/exir/backend/test/test_backends.py @@ -319,16 +319,16 @@ def forward(self, x): ) buff = exec_prog.buffer - + executorch_module = _load_for_executorch_from_buffer(buff) # This line should raise an exception like # RuntimeError: failed with error 0x12 - _load_for_executorch_from_buffer(buff) + executorch_module.run_method("forward") @vary_segments def test_backend_with_compiler_out_of_range(self, extract_delegate_segments: bool): with self.assertRaisesRegex( RuntimeError, - "loading method forward failed with error 0x12", + "Failed to execute method forward, error: 0x12", ): self.run_model_in_unsupported_backend( extract_delegate_segments=extract_delegate_segments diff --git a/exir/backend/test/test_backends_lifted.py b/exir/backend/test/test_backends_lifted.py index b6aea7f8bb3..53b2c6b6fb2 100644 --- a/exir/backend/test/test_backends_lifted.py +++ b/exir/backend/test/test_backends_lifted.py @@ -347,15 +347,16 @@ def forward(self, x): buff = exec_prog.buffer + executorch_module = 
_load_for_executorch_from_buffer(buff) # This line should raise an exception like # RuntimeError: failed with error 0x12 - _load_for_executorch_from_buffer(buff) + executorch_module.run_method("forward") @vary_segments def test_backend_with_compiler_out_of_range(self, extract_delegate_segments: bool): with self.assertRaisesRegex( RuntimeError, - "loading method forward failed with error 0x12", + "Failed to execute method forward, error: 0x12", ): self.run_model_in_unsupported_backend( extract_delegate_segments=extract_delegate_segments diff --git a/exir/backend/test/test_compatibility.py b/exir/backend/test/test_compatibility.py index bcda1d36516..f52fb357483 100644 --- a/exir/backend/test/test_compatibility.py +++ b/exir/backend/test/test_compatibility.py @@ -65,9 +65,10 @@ def forward(self, x): # Throw runtime error with error code 0x30, meaning delegate is incompatible. with self.assertRaisesRegex( RuntimeError, - "loading method forward failed with error 0x30", + "Failed to execute method forward, error: 0x30", ): executorch_module = _load_for_executorch_from_buffer(buff) + executorch_module.run_method("forward") def test_compatibility_in_runtime_edge_program_manager(self): class SinModule(torch.nn.Module): diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 7a9d8c1faf3..66169b7406a 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -156,239 +156,6 @@ void setup_output_storage( } } -class Module final { - public: - explicit Module( - std::unique_ptr loader, - std::unique_ptr tracer = nullptr, - size_t debug_buffer_size = 0, - Program::Verification program_verification = - Program::Verification::InternalConsistency) - : loader_(std::move(loader)), - event_tracer_(std::move(tracer)), - debug_buffer_size_(debug_buffer_size) { - ::executorch::runtime::runtime_init(); - Result program = - Program::load(loader_.get(), program_verification); - THROW_IF_ERROR( - program.error(), - 
"loading program failed with error: 0x%" PRIx32, - static_cast(program.error())); - program_ = std::make_unique(std::move(program.get())); - - // Figure out the size of each non_const layer we need to support every - // method in the program. Map will be easier to use than a list because we - // dont know how many non_const arenas there will be - std::map non_const_buffer_sizes; - for (size_t i = 0; i < program_->num_methods(); ++i) { - auto name = program_->get_method_name(i).get(); - auto method_meta = program_->method_meta(name).get(); - for (size_t j = 0; j < method_meta.num_non_const_buffers(); j++) { - int64_t buffer_size = method_meta.non_const_buffer_size(j).get(); - if (non_const_buffer_sizes.find(j) == non_const_buffer_sizes.end()) { - non_const_buffer_sizes.insert({j, buffer_size}); - } else { - non_const_buffer_sizes[j] = - std::max(non_const_buffer_sizes[j], buffer_size); - } - } - } - - // Allocate the arenas. Using vector because we need to remember the size as - // well, so vector is easier then unique_ptr. - std::vector> non_const_buffers_; - for (std::map::iterator i = non_const_buffer_sizes.begin(); - i != non_const_buffer_sizes.end(); - i++) { - non_const_buffers_.push_back(std::vector(i->second)); - } - - memory_ = std::make_unique(std::move(non_const_buffers_)); - if (event_tracer_ && debug_buffer_size > 0) { - // If a debug buffer was requested for the ETDump, allocate it and make - // sure its lifetime is as long as the event_tracer. - debug_buffer_ = std::make_unique(debug_buffer_size); - event_tracer_->set_debug_buffer(get_etdump_debug_buffer()); - event_tracer_->set_event_tracer_debug_level( - EventTracerDebugLogLevel::kIntermediateOutputs); - } - - // Load methods - for (size_t i = 0; i < program_->num_methods(); ++i) { - auto name = program_->get_method_name(i).get(); - // It's safe to use the same memory manager for all modules because - // we can guarantee that only one will be executing at a time. 
- // Everything in this module runs on a single thread. - Result method = program_->load_method( - name, memory_->mem_manager(), event_tracer_.get()); - THROW_IF_ERROR( - method.error(), - "loading method %s failed with error 0x%" PRIx32, - name, - static_cast(method.error())); - methods_.insert( - {std::string(name), - std::make_unique(std::move(method.get()))}); - } - } - - Module(const Module&) = delete; - Module& operator=(const Module&) = delete; - Module(Module&&) = default; - Module& operator=(Module&&) = default; - - /// Executes the specified method on the provided inputs and returns its - /// outputs. - std::vector run_method( - const std::string& method_name, - const std::vector& args, - const std::optional>>& output_storages = - std::nullopt) { - auto& method = get_method(method_name); - executorch::aten::ArrayRef input_evalue_list( - args.data(), args.size()); - - Error set_inputs_status = method.set_inputs(input_evalue_list); - THROW_IF_ERROR( - set_inputs_status, - "method->set_inputs() for method '%s' failed with error 0x%" PRIx32, - method_name.c_str(), - static_cast(set_inputs_status)); - -#ifdef USE_ATEN_LIB - // [TLS handling] This is to workaround an assertion failure - // (https://fburl.com/code/302jyn8d) running `gelu` in ATen mode in fbcode - // (such as bento). The problem is ExecuTorch ATen mode doesn't have - // Thread Local State, but `torch-cpp` is assuming tls init is done. There - // are two more checks: MKLDNN disabled and C10_MOBILE, if any of them is - // true we won't be hitting this assertion error. However in `torch-cpp` - // lib both checks are false. Production impact: this should not make any - // impact in production environment, given that in xplat we are depending - // on a library that enables C10_MOBILE (`torch_mobile_core`). 
- c10::impl::ExcludeDispatchKeyGuard no_autograd( - c10::autograd_dispatch_keyset); -#endif - if (output_storages) { - setup_output_storage(method, *output_storages); - } - Error execute_status = method.execute(); - THROW_IF_ERROR( - execute_status, - "method->execute() failed with error 0x%" PRIx32, - static_cast(execute_status)); - // process outputs - return get_outputs(method_name); - } - - std::vector get_outputs(const std::string& method_name) { - auto& method = methods_[method_name]; - std::vector result(method->outputs_size()); - - Error get_outputs_status = - method->get_outputs(result.data(), method->outputs_size()); - THROW_IF_ERROR( - get_outputs_status, - "method->get_outputs() for method '%s' failed with error 0x%" PRIx32, - method_name.c_str(), - static_cast(get_outputs_status)); - - return result; - } - - Method& get_method(const std::string& method_name) { - if (methods_.count(method_name) == 0) { - THROW_IF_ERROR( - Error::InvalidArgument, - "no such method in program: %s", - method_name.c_str()); - } - return *methods_[method_name].get(); - } - - /// Returns the names of all methods in the program. - std::vector method_names() const { - std::vector names; - for (const auto& method : methods_) { - names.push_back(method.first); - } - return names; - } - - bool has_etdump() { - return static_cast(event_tracer_); - } - - ETDumpGen& etdump() { - return *event_tracer_; - } - - bool has_etdump_debug_buffer() const { - return static_cast(debug_buffer_); - } - - Span get_etdump_debug_buffer() { - return Span(debug_buffer_.get(), debug_buffer_size_); - } - - private: - /// A wrapper/util class for executorch memory allocations/manager. 
- class Memory { - public: - explicit Memory(std::vector>&& non_const_buffers) - : runtime_allocator_(), - non_const_buffers_(std::move(non_const_buffers)), - non_const_spans_(create_non_const_spans()), - non_const_allocator_( - {non_const_spans_.data(), non_const_spans_.size()}), - mem_manager_( - &const_allocator_, - &non_const_allocator_, - &runtime_allocator_, - &temp_allocator_) {} - - /// Returns a pointer to the internal memory manager, the Memory instance - /// must outlive this pointer. - MemoryManager* mem_manager() { - return &mem_manager_; - } - - Memory(const Memory&) = delete; - Memory& operator=(const Memory&) = delete; - - private: - MemoryAllocator const_allocator_{MemoryAllocator(0, nullptr)}; - - MallocMemoryAllocator runtime_allocator_; - - MallocMemoryAllocator temp_allocator_{}; - - std::vector> non_const_buffers_; - - std::vector> non_const_spans_; - - HierarchicalAllocator non_const_allocator_; - - MemoryManager mem_manager_; - - std::vector> create_non_const_spans() { - std::vector> result; - for (size_t i = 0; i < non_const_buffers_.size(); i++) { - result.push_back( - {non_const_buffers_[i].data(), non_const_buffers_[i].size()}); - } - return result; - } - }; - - std::unique_ptr memory_; - std::unique_ptr loader_; // program_ points to this. - std::unique_ptr program_; // methods_ entries points to this. - std::unordered_map> methods_; - std::unique_ptr event_tracer_; - std::unique_ptr debug_buffer_; - size_t debug_buffer_size_; -}; - inline std::unique_ptr load_module_from_buffer( const void* ptr, size_t ptr_len, @@ -399,9 +166,10 @@ inline std::unique_ptr load_module_from_buffer( auto loader = std::make_unique(ptr, ptr_len); return std::make_unique( std::move(loader), + nullptr, // memory_allocator + nullptr, // temp_allocator enable_etdump ? 
std::make_unique() : nullptr, - debug_buffer_size, - program_verification); + nullptr); // data_map_loader } inline std::unique_ptr load_module_from_file( @@ -422,9 +190,10 @@ inline std::unique_ptr load_module_from_file( auto loader = std::make_unique(std::move(res.get())); return std::make_unique( std::move(loader), + nullptr, // memory_allocator + nullptr, // temp_allocator enable_etdump ? std::make_unique() : nullptr, - debug_buffer_size, - program_verification); + nullptr); // data_map_loader } inline py::list get_outputs_as_py_list( @@ -863,19 +632,17 @@ struct PyModule final { } } - const auto& method = module_->get_method(method_name); - const auto num_outputs = method.outputs_size(); - output_storages_ = make_output_storages(method); - std::vector> output_storage_spans(num_outputs); - for (int i = 0; i < output_storages_.size(); ++i) { - output_storage_spans[i] = - Span(output_storages_[i].data(), output_storages_[i].size()); - } - auto outputs = - module_->run_method(method_name, cpp_inputs, output_storage_spans); + // Set up output storage before execution. 
+ allocate_output_storages(method_name); + auto outputs = module_->execute(method_name, cpp_inputs); + THROW_IF_ERROR( + outputs.error(), + "Failed to execute method %s, error: 0x%" PRIx32, + method_name.c_str(), + static_cast(outputs.error())); // Retrieve outputs - return get_outputs_as_py_list(outputs, clone_outputs); + return get_outputs_as_py_list(outputs.get(), clone_outputs); } py::list forward(const py::sequence& inputs, bool clone_outputs = true) { @@ -891,7 +658,8 @@ struct PyModule final { } bool has_etdump() { - return module_->has_etdump(); + ETDumpGen* etdump = dynamic_cast(module_->event_tracer()); + return etdump != nullptr; } void write_etdump_result_to_file( @@ -900,19 +668,20 @@ struct PyModule final { if (!has_etdump()) { throw std::runtime_error("No etdump found"); } - auto& etdump = module_->etdump(); - etdump_result result = etdump.get_etdump_data(); + ETDumpGen* etdump = dynamic_cast(module_->event_tracer()); + etdump_result result = etdump->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { write_data_to_file(path, result.buf, result.size); free(result.buf); - if (module_->has_etdump_debug_buffer() && - py::isinstance(debug_buffer_path)) { + if (py::isinstance(debug_buffer_path)) { // Also write out the debug buffer to a separate file if requested. std::string debug_buffer_path_str = py::cast(debug_buffer_path); - const auto debug_buffer = module_->get_etdump_debug_buffer(); - write_data_to_file( - debug_buffer_path_str, debug_buffer.data(), debug_buffer.size()); + const auto debug_buffer = module_->debug_buffer(); + if (debug_buffer.size() > 0) { + write_data_to_file( + debug_buffer_path_str, debug_buffer.data(), debug_buffer.size()); + } } } else { ET_LOG( @@ -926,32 +695,33 @@ struct PyModule final { py::list plan_execute( const std::string method_name, bool clone_outputs = true) { - auto& method = module_->get_method(method_name); - // Need to pre-allocate space for outputs just like in run_method. 
- const auto num_outputs = method.outputs_size(); - output_storages_ = make_output_storages(method); - std::vector> output_storage_spans(num_outputs); - for (int i = 0; i < output_storages_.size(); ++i) { - output_storage_spans[i] = - Span(output_storages_[i].data(), output_storages_[i].size()); - } - setup_output_storage(method, output_storage_spans); - auto status = method.execute(); + auto status = module_->load_method(method_name); + + THROW_IF_ERROR( + status, + "executing execution plan for method 'load' failed with error: 0x%" PRIx32, + static_cast(status)); + auto output = module_->execute(method_name.c_str()); THROW_IF_ERROR( status, "executing execution plan for method 'forward' failed with error: 0x%" PRIx32, static_cast(status)); - const auto outputs = module_->get_outputs(method_name); - return get_outputs_as_py_list(outputs, clone_outputs); + return get_outputs_as_py_list(output.get(), clone_outputs); } std::unique_ptr method_meta(const std::string method_name) { - auto& method = module_->get_method(method_name); - return std::make_unique(module_, method.method_meta()); + auto method_data = module_->method_meta(method_name); + return std::make_unique(module_, method_data.get()); } std::vector method_names() { - return module_->method_names(); + auto result = module_->method_names(); + THROW_IF_ERROR( + result.error(), + "Failed to get method names, error: 0x%" PRIx32, + static_cast(result.error())); + const auto& method_set = result.get(); + return std::vector(method_set.begin(), method_set.end()); } private: @@ -960,38 +730,55 @@ struct PyModule final { // bundled programs. 
std::vector> output_storages_; - std::vector> make_output_storages(const Method& method) { - const auto num_outputs = method.outputs_size(); + void allocate_output_storages(const std::string& method_name) { + auto method_result = module_->method(method_name); + THROW_IF_ERROR( + method_result.error(), + "Failed to get method %s, error: 0x%" PRIx32, + method_name.c_str(), + static_cast(method_result.error())); + + auto* method = method_result.get(); + const auto num_outputs = method->outputs_size(); + // Skip if we already have the right number of storages. + if (output_storages_.size() == num_outputs) { + return; + } // Create a buffer for each output tensor. Memory planned outputs and non // tensor outputs get an empty buffer in this list which is ignored later. - std::vector> output_storages; output_storages_.reserve(num_outputs); - auto meta = method.method_meta(); + auto meta = method->method_meta(); for (size_t i = 0; i < num_outputs; ++i) { auto output_type = meta.output_tag(i); THROW_IF_ERROR( output_type.error(), "Failed to get output type for output %zu", i); if (output_type.get() != Tag::Tensor) { // Skip allocating storage for non-tensor outputs. - output_storages.emplace_back(); + output_storages_.emplace_back(); continue; } const auto& output_tensor_meta = - method.method_meta().output_tensor_meta(i); + method->method_meta().output_tensor_meta(i); THROW_IF_ERROR( output_tensor_meta.error(), "Failed to get output tensor meta for output %zu", i); if (output_tensor_meta.get().is_memory_planned()) { // Skip allocating storage for planned memory outputs. - output_storages.emplace_back(); + output_storages_.emplace_back(); continue; } // Allocate storage for the output tensor. 
const size_t output_size = output_tensor_meta.get().nbytes(); - output_storages.emplace_back(output_size); + output_storages_.emplace_back(output_size); + } + // Set up output storage for non-empty buffers + std::vector> output_storage_spans(num_outputs); + for (size_t i = 0; i < output_storages_.size(); ++i) { + output_storage_spans[i] = + Span(output_storages_[i].data(), output_storages_[i].size()); } - return output_storages; + setup_output_storage(*method, output_storage_spans); } }; diff --git a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl index 55a268d5d34..7e14ca8713a 100644 --- a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl @@ -17,6 +17,8 @@ PORTABLE_MODULE_DEPS = [ "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/module:bundled_module", + "//executorch/extension/module:module", + "//executorch/extension/tensor:tensor", "//executorch/runtime/executor/test:test_backend_compiler_lib", "//executorch/devtools/etdump:etdump_flatcc", ] + get_all_cpu_backend_targets() @@ -30,6 +32,8 @@ ATEN_MODULE_DEPS = [ "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/module:bundled_module_aten", + "//executorch/extension/module:module_aten", + "//executorch/extension/tensor:tensor_aten", "//executorch/devtools/bundled_program:runtime_aten", "//executorch/runtime/executor/test:test_backend_compiler_lib_aten", "//executorch/devtools/etdump:etdump_flatcc", From 579147a7cf707153e2e4324bc8f48eea3ad4e4cf Mon Sep 17 00:00:00 2001 From: lucylq Date: Tue, 26 Aug 2025 16:27:55 -0700 Subject: [PATCH 2/2] Update on "Use extension module for pybindings" Deprecate the module class in pybindings, previously it was blocked by bundled program, and 
now it's resolved with https://github.com/pytorch/executorch/blob/2bb567f15a54fbc0ef621e5e85ff24e5da505b11/extension/module/bundled_module.h#L4 Differential Revision: [D70516347](https://our.internmc.facebook.com/intern/diff/D70516347/) [ghstack-poisoned] --- extension/pybindings/pybindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 66169b7406a..01d968a8dca 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -739,7 +739,7 @@ struct PyModule final { static_cast(method_result.error())); auto* method = method_result.get(); - const auto num_outputs = method->outputs_size(); + const auto num_outputs = method->outputs_size(); // Skip if we already have the right number of storages. if (output_storages_.size() == num_outputs) { return;