@@ -115,7 +115,7 @@ void Bank::evaluate_and_allocate() {
115115void Bank::evaluate_cpu (Bank::DeviceBank& device_bank, const std::vector<LazyTensor>& to_process) {
116116 // Note: not locking here. This is a private function, so Bank should handle the locks around it
117117 // as we lock in evaluate_and_allocate() now.
118- ov::parallel_for (to_process.size (), [&](std::size_t idx) {
118+ ov::npuw::util::non_parallel_for (to_process.size (), [&](std::size_t idx) {
119119 const auto & lt = to_process[idx];
120120 auto iter_device_registered = device_bank.registered_tensors .find (lt);
121121 NPUW_ASSERT (iter_device_registered != device_bank.registered_tensors .end () &&
@@ -124,7 +124,8 @@ void Bank::evaluate_cpu(Bank::DeviceBank& device_bank, const std::vector<LazyTen
124124 auto t = lt.eval ();
125125 device_bank.storage .at (uid).tensor = ov::Tensor (t.get_element_type (), t.get_shape ());
126126 // Get ownership of the weights, might be a mmaped object during import
127- t.copy_to (device_bank.storage .at (uid).tensor );
127+ // t.copy_to(device_bank.storage.at(uid).tensor);
128+ ov::npuw::util::copy (t, device_bank.storage .at (uid).tensor );
128129 const_cast <LazyTensor&>(lt).detach ();
129130 });
130131}
@@ -172,7 +173,8 @@ void Bank::evaluate_and_allocate_on_device(Bank::DeviceBank& device_bank,
172173 auto & stored_tensor = device_bank.storage .at (allocated.uid );
173174
174175 auto transformed = stored_tensor.lt .eval ();
175- transformed.copy_to (allocated.allocated_tensor );
176+ // transformed.copy_to(allocated.allocated_tensor);
177+ ov::npuw::util::copy (transformed, allocated.allocated_tensor );
176178 stored_tensor.tensor = std::move (allocated.allocated_tensor );
177179
178180 // Detach the evaluated LazyTensor from its memory here - when it is 100%