Skip to content

Commit 804c548

Browse files
WIP vectorized copy
1 parent b258ed0 commit 804c548

File tree

6 files changed

+39
-4
lines changed

6 files changed

+39
-4
lines changed

src/plugins/intel_npu/src/plugin/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ cross_compiled_file(${TARGET_NAME}
5050
ARCH AVX2 ANY
5151
npuw/util_xarch.cpp
5252
API npuw/util_xarch.hpp
53-
NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16 copy_row_as_column transpose_i4 transpose_f16 transpose_f32
53+
NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16 copy_row_as_column transpose_i4 transpose_f16 transpose_f32 copy
5454
NAMESPACE ov::npuw::util::XARCH
5555
)
5656

src/plugins/intel_npu/src/plugin/npuw/util.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,10 @@ ov::Tensor ov::npuw::util::transpose(const ov::Tensor& t) {
654654
return tnew;
655655
}
656656

657+
/// Copies the contents of @p src into @p dst.
/// Thin dispatch wrapper: forwards to the architecture-specific
/// implementation (AVX2-vectorized where available, generic otherwise)
/// selected by the cross_compiled_file() machinery.
void ov::npuw::util::copy(const ov::Tensor& src, ov::Tensor& dst) {
    ov::npuw::util::XARCH::copy(src, dst);
}
660+
657661
ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
658662
ov::Shape shape = t.get_shape();
659663
NPUW_ASSERT(shape.size() == 3); // Yes, so far only transpose 3D tensors

src/plugins/intel_npu/src/plugin/npuw/util.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ namespace ov {
1818
namespace npuw {
1919
namespace util {
2020

21+
void copy(const ov::Tensor& src, ov::Tensor& dst);
22+
2123
bool is_set(const std::size_t sub_idx,
2224
const std::string& opt,
2325
const std::size_t real_idx = SIZE_MAX,

src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,31 @@ void ov::npuw::util::XARCH::copy_row_as_column(const ov::SoPtr<ov::ITensor>& fro
14921492
#endif
14931493
}
14941494

1495+
/// Copies the raw contents of @p from into @p to.
///
/// @param from  Source tensor; its full byte payload is copied verbatim.
/// @param to    Destination tensor; assumed to be at least as large as
///              @p from in bytes and non-overlapping — TODO(review):
///              confirm callers guarantee this (no size check here, same
///              as the original).
///
/// On AVX2 builds the bulk of the data is moved with unaligned 32-byte
/// vector loads/stores; a scalar memcpy handles the sub-32-byte tail.
/// Elsewhere this falls back to ov::Tensor::copy_to().
void ov::npuw::util::XARCH::copy(const ov::Tensor& from, ov::Tensor& to) {
#if defined(HAVE_AVX2)
    // get_byte_size() returns the exact payload size for every element
    // type, including sub-byte ones (u4/i4/nf4/f4e2m1). The previous
    // word-based math (get_size() * elem_size / 4) truncated to whole
    // 32-bit words, silently dropping up to 3 tail bytes for tensors
    // whose byte size is not a multiple of 4, and needed a hand-rolled
    // special case per 4-bit type.
    const size_t total_bytes = from.get_byte_size();

    const auto* src = static_cast<const uint8_t*>(from.data());
    auto* dst = static_cast<uint8_t*>(to.data());

    constexpr size_t block_size = sizeof(__m256i);  // 32 bytes per vector copy

    size_t i = 0;
    // '<=' so a payload that is an exact multiple of 32 bytes is fully
    // handled by the vector loop (the original '<' left one full block
    // to the scalar tail).
    for (; i + block_size <= total_bytes; i += block_size) {
        const __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
    }
    // Scalar tail: the remaining (< 32) bytes, now counted in bytes so
    // nothing is lost.
    if (i < total_bytes) {
        std::memcpy(dst + i, src + i, total_bytes - i);
    }
#else
    from.copy_to(to);
#endif
}
1519+
14951520
void ov::npuw::util::XARCH::transpose_i4(const uint8_t* src, uint8_t* dst, size_t rows, size_t cols) {
14961521
#if defined(HAVE_AVX2)
14971522
size_t c_step = 8;

src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ void transpose_i4(const uint8_t* src, uint8_t* dst, size_t rows, size_t cols);
8888
void transpose_f16(const uint16_t* src, uint16_t* dst, size_t rows, size_t cols);
8989
void transpose_f32(const float* src, float* dst, size_t rows, size_t cols);
9090

91+
void copy(const ov::Tensor& src, ov::Tensor& dst);
92+
9193
} // namespace XARCH
9294
} // namespace util
9395
} // namespace npuw

src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void Bank::evaluate_and_allocate() {
115115
void Bank::evaluate_cpu(Bank::DeviceBank& device_bank, const std::vector<LazyTensor>& to_process) {
116116
// Note: not locking here. This is a private function, so Bank should handle the locks around it
117117
// as we lock in evaluate_and_allocate() now.
118-
ov::parallel_for(to_process.size(), [&](std::size_t idx) {
118+
ov::npuw::util::non_parallel_for(to_process.size(), [&](std::size_t idx) {
119119
const auto& lt = to_process[idx];
120120
auto iter_device_registered = device_bank.registered_tensors.find(lt);
121121
NPUW_ASSERT(iter_device_registered != device_bank.registered_tensors.end() &&
@@ -124,7 +124,8 @@ void Bank::evaluate_cpu(Bank::DeviceBank& device_bank, const std::vector<LazyTen
124124
auto t = lt.eval();
125125
device_bank.storage.at(uid).tensor = ov::Tensor(t.get_element_type(), t.get_shape());
126126
// Get ownership of the weights, might be a mmaped object during import
127-
t.copy_to(device_bank.storage.at(uid).tensor);
127+
// t.copy_to(device_bank.storage.at(uid).tensor);
128+
ov::npuw::util::copy(t, device_bank.storage.at(uid).tensor);
128129
const_cast<LazyTensor&>(lt).detach();
129130
});
130131
}
@@ -172,7 +173,8 @@ void Bank::evaluate_and_allocate_on_device(Bank::DeviceBank& device_bank,
172173
auto& stored_tensor = device_bank.storage.at(allocated.uid);
173174

174175
auto transformed = stored_tensor.lt.eval();
175-
transformed.copy_to(allocated.allocated_tensor);
176+
// transformed.copy_to(allocated.allocated_tensor);
177+
ov::npuw::util::copy(transformed, allocated.allocated_tensor);
176178
stored_tensor.tensor = std::move(allocated.allocated_tensor);
177179

178180
// Detach the evaluated LazyTensor from its memory here - when it is 100%

0 commit comments

Comments
 (0)