feat(C++): The use of SIMD accelerated to implement and optimize utf1…

…6 utf8 (#1732)  ## What does this PR do? Use SIMD acceleration to implement and optimize utf16 to utf8. Adapter x86, arm and risc-v Add 8 tests verify the function <img width="261" alt="fury_cpp_simd_utf_1" src="https://github.com/user-attachments/assets/029fe6ea-b4be-4e26-85d2-3c5e02e64899"> And efficiency has improved <img width="401" alt="fury_cpp_simd_utf_2" src="https://github.com/user-attachments/assets/6e86c125-f5a2-46df-b3bd-3d12496e9238"> done. ## Related issues Closes #1546  ## Does this PR introduce any user-facing change?  - [x] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change?
apache · Jul 15, 2024 · caf0e48 · caf0e48
1 parent b32f3f9
commit caf0e48
Show file tree

Hide file tree

Showing 3 changed files with 503 additions and 3 deletions.
diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc
@@ -27,8 +27,37 @@
 #include <riscv_vector.h>
 #endif
 
+#include <chrono>
+#include <string>
+
 namespace fury {
 
+// Swap bytes to convert from big endian to little endian
+inline uint16_t swapBytes(uint16_t value) {
+ return (value >> 8) | (value << 8);
+}
+
+inline void utf16ToUtf8(uint16_t code_unit, char *&output) {
+ if (code_unit < 0x80) {
+ *output++ = static_cast<char>(code_unit);
+ } else if (code_unit < 0x800) {
+ *output++ = static_cast<char>(0xC0 | (code_unit >> 6));
+ *output++ = static_cast<char>(0x80 | (code_unit & 0x3F));
+ } else {
+ *output++ = static_cast<char>(0xE0 | (code_unit >> 12));
+ *output++ = static_cast<char>(0x80 | ((code_unit >> 6) & 0x3F));
+ *output++ = static_cast<char>(0x80 | (code_unit & 0x3F));
+ }
+}
+
+inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) {
+ uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00);
+ *utf8++ = static_cast<char>((code_point >> 18) | 0xF0);
+ *utf8++ = static_cast<char>(((code_point >> 12) & 0x3F) | 0x80);
+ *utf8++ = static_cast<char>(((code_point >> 6) & 0x3F) | 0x80);
+ *utf8++ = static_cast<char>((code_point & 0x3F) | 0x80);
+}
+
 #if defined(__x86_64__) || defined(_M_X64)
 
 bool isLatin(const std::string &str) {
@@ -55,6 +84,90 @@ bool isLatin(const std::string &str) {
  return true;
 }
 
+std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
+ std::string utf8;
+ utf8.reserve(utf16.size() *
+ 3); // Reserve enough space to avoid frequent reallocations
+
+ const __m256i limit1 = _mm256_set1_epi16(0x80);
+ const __m256i limit2 = _mm256_set1_epi16(0x800);
+ const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800);
+ const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF);
+ const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00);
+ const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF);
+
+ char buffer[64]; // Buffer to hold temporary UTF-8 bytes
+ char *output = buffer;
+
+ size_t i = 0;
+ size_t n = utf16.size();
+
+ while (i + 16 <= n) {
+ __m256i in =
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(utf16.data() + i));
+
+ if (!is_little_endian) {
+ in = _mm256_or_si256(
+ _mm256_slli_epi16(in, 8),
+ _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian
+ }
+
+ __m256i mask1 = _mm256_cmpgt_epi16(in, limit1);
+ __m256i mask2 = _mm256_cmpgt_epi16(in, limit2);
+ __m256i high_surrogate_mask =
+ _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start),
+ _mm256_cmpgt_epi16(in, surrogate_high_end));
+ __m256i low_surrogate_mask =
+ _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start),
+ _mm256_cmpgt_epi16(in, surrogate_low_end));
+
+ if (_mm256_testz_si256(mask1, mask1)) {
+ // All values < 0x80, 1 byte per character
+ for (int j = 0; j < 16; ++j) {
+ *output++ = static_cast<char>(utf16[i + j]);
+ }
+ } else if (_mm256_testz_si256(mask2, mask2)) {
+ // All values < 0x800, 2 bytes per character
+ for (int j = 0; j < 16; ++j) {
+ utf16ToUtf8(utf16[i + j], output);
+ }
+ } else {
+ // Mix of 1, 2, and 3 byte characters
+ for (int j = 0; j < 16; ++j) {
+ if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) &&
+ j + 1 < 16 &&
+ !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) {
+ // Surrogate pair
+ utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output);
+ ++j;
+ } else {
+ utf16ToUtf8(utf16[i + j], output);
+ }
+ }
+ }
+
+ utf8.append(buffer, output - buffer);
+ output = buffer; // Reset output buffer pointer
+ i += 16;
+ }
+
+ // Handle remaining characters
+ while (i < n) {
+ if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF &&
+ utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
+ // Surrogate pair
+ utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output);
+ ++i;
+ } else {
+ utf16ToUtf8(utf16[i], output);
+ }
+ ++i;
+ }
+ utf8.append(buffer, output - buffer);
+
+ return utf8;
+}
+
 #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
 
 bool isLatin(const std::string &str) {
@@ -80,18 +193,89 @@ bool isLatin(const std::string &str) {
  return true;
 }
 
+std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
+ std::string utf8;
+ utf8.reserve(utf16.size() * 3);
+
+ uint16x8_t limit1 = vdupq_n_u16(0x80);
+ uint16x8_t limit2 = vdupq_n_u16(0x800);
+ uint16x8_t surrogate_high_start = vdupq_n_u16(0xD800);
+ uint16x8_t surrogate_high_end = vdupq_n_u16(0xDBFF);
+ uint16x8_t surrogate_low_start = vdupq_n_u16(0xDC00);
+ uint16x8_t surrogate_low_end = vdupq_n_u16(0xDFFF);
+
+ char buffer[64];
+ char *output = buffer;
+ size_t i = 0;
+ size_t n = utf16.size();
+
+ while (i + 8 <= n) {
+ uint16x8_t in =
+ vld1q_u16(reinterpret_cast<const uint16_t *>(utf16.data() + i));
+ if (!is_little_endian) {
+ in = vorrq_u16(vshlq_n_u16(in, 8),
+ vshrq_n_u16(in, 8)); // Swap bytes for big-endian
+ }
+
+ uint16x8_t mask1 = vcgtq_u16(in, limit1);
+ uint16x8_t mask2 = vcgtq_u16(in, limit2);
+ uint16x8_t high_surrogate_mask = vandq_u16(
+ vcgtq_u16(in, surrogate_high_start), vcltq_u16(in, surrogate_high_end));
+ uint16x8_t low_surrogate_mask = vandq_u16(
+ vcgtq_u16(in, surrogate_low_start), vcltq_u16(in, surrogate_low_end));
+
+ if (vmaxvq_u16(mask1) == 0) {
+ for (int j = 0; j < 8; ++j) {
+ *output++ = static_cast<char>(utf16[i + j]);
+ }
+ } else if (vmaxvq_u16(mask2) == 0) {
+ for (int j = 0; j < 8; ++j) {
+ utf16ToUtf8(utf16[i + j], output);
+ }
+ } else {
+ for (int j = 0; j < 8; ++j) {
+ if (vmaxvq_u16(high_surrogate_mask) == 0 && j + 1 < 8 &&
+ vmaxvq_u16(low_surrogate_mask) != 0) {
+ utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output);
+ ++j;
+ } else {
+ utf16ToUtf8(utf16[i + j], output);
+ }
+ }
+ }
+
+ utf8.append(buffer, output - buffer);
+ output = buffer;
+ i += 8;
+ }
+
+ while (i < n) {
+ if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF &&
+ utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
+ utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output);
+ ++i;
+ } else {
+ utf16ToUtf8(utf16[i], output);
+ }
+ ++i;
+ }
+ utf8.append(buffer, output - buffer);
+
+ return utf8;
+}
+
 #elif defined(__riscv) && __riscv_vector
 
 bool isLatin(const std::string &str) {
  const char *data = str.data();
  size_t len = str.size();
 
  size_t i = 0;
+ auto latin_mask = vmv_v_x_u8m1(0x80, 16);
  for (; i + 16 <= len; i += 16) {
  auto chars = vle8_v_u8m1(reinterpret_cast<const uint8_t *>(data + i), 16);
- auto mask = vmv_v_x_u8m1(0x80, 16);
- auto result = vand_vv_u8m1(chars, mask, 16);
- if (vmax_v_u8m1(result, 16) != 0) {
+ auto result = vand_vv_u8m1(chars, latin_mask, 16);
+ if (vfirst_m_b8(vmsne_vx_u8m1_b8(result, 0, 16))) {
  return false;
  }
  }
@@ -105,6 +289,82 @@ bool isLatin(const std::string &str) {
  return true;
 }
 
+std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
+ std::string utf8;
+ utf8.reserve(utf16.size() * 3);
+
+ auto limit1 = vmv_v_x_u16m1(0x80, 8);
+ auto limit2 = vmv_v_x_u16m1(0x800, 8);
+ auto surrogate_high_start = vmv_v_x_u16m1(0xD800, 8);
+ auto surrogate_high_end = vmv_v_x_u16m1(0xDBFF, 8);
+ auto surrogate_low_start = vmv_v_x_u16m1(0xDC00, 8);
+ auto surrogate_low_end = vmv_v_x_u16m1(0xDFFF, 8);
+
+ char buffer[48];
+ char *output = buffer;
+ size_t i = 0;
+ size_t n = utf16.size();
+
+ while (i + 8 <= n) {
+ auto in =
+ vle16_v_u16m1(reinterpret_cast<const uint16_t *>(utf16.data() + i), 8);
+ if (!is_little_endian) {
+ in = vor_vv_u16m1(vsrl_vx_u16m1(in, 8, 8), vsll_vx_u16m1(in, 8, 8), 8);
+ }
+
+ auto mask1 = vmsgt_vx_u16m1(in, 0x80, 8);
+ auto mask2 = vmsgt_vx_u16m1(in, 0x800, 8);
+ auto high_surrogate_mask = vmand_vv_u16m1(vmsgt_vx_u16m1(in, 0xD800, 8),
+ vmslt_vx_u16m1(in, 0xDBFF, 8), 8);
+ auto low_surrogate_mask = vmand_vv_u16m1(vmsgt_vx_u16m1(in, 0xDC00, 8),
+ vmslt_vx_u16m1(in, 0xDFFF, 8), 8);
+
+ if (vmslt_vx_u16m1(mask1, 0, 8)) {
+ for (int j = 0; j < 8; ++j) {
+ *output++ = static_cast<char>(vget_vx_u16m1(in, j));
+ }
+ } else if (vmslt_vx_u16m1(mask2, 0, 8)) {
+ for (int j = 0; j < 8; ++j) {
+ utf16ToUtf8(vget_vx_u16m1(in, j), output);
+ }
+ } else {
+ for (int j = 0; j < 8; ++j) {
+ if (vfirst_m_b8(
+ vmand_vv_b8(high_surrogate_mask,
+ vmsne_vx_u8m1_b8(vmv_v_x_u8m1(0, 8), 0, 8))) &&
+ j + 1 < 8 &&
+ vfirst_m_b8(
+ vmand_vv_b8(low_surrogate_mask,
+ vmsne_vx_u8m1_b8(vmv_v_x_u8m1(0, 8), 0, 8)))) {
+ utf16SurrogatePairToUtf8(vget_vx_u16m1(in, j),
+ vget_vx_u16m1(in, j + 1), output);
+ ++j;
+ } else {
+ utf16ToUtf8(vget_vx_u16m1(in, j), output);
+ }
+ }
+ }
+
+ utf8.append(buffer, output - buffer);
+ output = buffer;
+ i += 8;
+ }
+
+ while (i < n) {
+ if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF &&
+ utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
+ utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output);
+ ++i;
+ } else {
+ utf16ToUtf8(utf16[i], output);
+ }
+ ++i;
+ }
+ utf8.append(buffer, output - buffer);
+
+ return utf8;
+}
+
 #else
 
 bool isLatin(const std::string &str) {
@@ -116,6 +376,44 @@ bool isLatin(const std::string &str) {
  return true;
 }
 
+// Fallback implementation without SIMD acceleration
+std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
+ std::string utf8;
+ utf8.reserve(utf16.size() *
+ 3); // Reserve enough space to avoid frequent reallocations
+
+ size_t i = 0;
+ size_t n = utf16.size();
+ char buffer[4]; // Buffer to hold temporary UTF-8 bytes
+ char *output = buffer;
+
+ while (i < n) {
+ uint16_t code_unit = utf16[i];
+ if (!is_little_endian) {
+ code_unit = swapBytes(code_unit);
+ }
+ if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF &&
+ utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
+ // Surrogate pair
+ uint16_t high = code_unit;
+ uint16_t low = utf16[i + 1];
+ if (!is_little_endian) {
+ low = swapBytes(low);
+ }
+ utf16SurrogatePairToUtf8(high, low, output);
+ utf8.append(buffer, output - buffer);
+ output = buffer;
+ ++i;
+ } else {
+ utf16ToUtf8(code_unit, output);
+ utf8.append(buffer, output - buffer);
+ output = buffer;
+ }
+ ++i;
+ }
+ return utf8;
+}
+
 #endif
 
 } // namespace fury
diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h
@@ -25,4 +25,6 @@ namespace fury {
 
 bool isLatin(const std::string &str);
 
+std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian);
+
 } // namespace fury