From fcb620c8f3b062c63786777339442f4510c38abd Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Mon, 13 Jan 2025 00:33:16 +0800
Subject: [PATCH 01/23] add get uint16_t array max value util

---
 cpp/fury/util/array_util.h       | 83 ++++++++++++++++++++++++++++++++
 cpp/fury/util/array_util_test.cc | 33 +++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 cpp/fury/util/array_util.h
 create mode 100644 cpp/fury/util/array_util_test.cc

diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
new file mode 100644
index 0000000000..2dee73e50a
--- /dev/null
+++ b/cpp/fury/util/array_util.h
@@ -0,0 +1,83 @@
+#include "fury/util/platform.h"
+
+namespace fury {
+#if defined(FURY_HAS_NEON)
+inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
+  if (length == 0) {
+    return 0;  // Return 0 for empty arrays
+  }
+  uint16x8_t max_val = vdupq_n_u16(0);  // Initialize max vector to zero
+
+  size_t i = 0;
+  for (; i + 8 <= length; i += 8) {
+    uint16x8_t current_val = vld1q_u16(&arr[i]);
+    max_val = vmaxq_u16(max_val, current_val);  // Max operation
+  }
+
+  // Find the max value in the resulting vector
+  uint16_t temp[8];
+  vst1q_u16(temp, max_val);
+  uint16_t max_neon = temp[0];
+  for (int j = 1; j < 8; j++) {
+    if (temp[j] > max_neon) {
+      max_neon = temp[j];
+    }
+  }
+
+  // Handle remaining elements
+  for (; i < length; i++) {
+    if (arr[i] > max_neon) {
+      max_neon = arr[i];
+    }
+  }
+  return max_neon;
+}
+
+#elif defined(FURY_HAS_SSE2)
+
+inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
+  if (length == 0) {
+    return 0;  // Return 0 for empty arrays
+  }
+
+  __m128i max_val = _mm_setzero_si128();  // Initialize max vector with zeros
+
+  size_t i = 0;
+  for (; i + 8 <= length; i += 8) {
+    __m128i current_val = _mm_loadu_si128((__m128i*)&arr[i]);
+    max_val = _mm_max_epu16(max_val, current_val);  // Max operation
+  }
+
+  // Find the max value in the resulting vector
+  uint16_t temp[8];
+  _mm_storeu_si128((__m128i*)temp, max_val);
+  uint16_t max_sse = temp[0];
+  for (int j = 1; j < 8; j++) {
+    if (temp[j] > max_sse) {
+      max_sse = temp[j];
+    }
+  }
+
+  // Handle remaining elements
+  for (; i < length; i++) {
+    if (arr[i] > max_sse) {
+      max_sse = arr[i];
+    }
+  }
+  return max_sse;
+}
+#else
+inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
+  if (length == 0) {
+    return 0;  // Return 0 for empty arrays
+  }
+  uint16_t max_val = arr[0];
+  for (size_t i = 1; i < length; i++) {
+    if (arr[i] > max_val) {
+      max_val = arr[i];
+    }
+  }
+  return max_val;
+}
+#endif
+}  // namespace fury
diff --git a/cpp/fury/util/array_util_test.cc b/cpp/fury/util/array_util_test.cc
new file mode 100644
index 0000000000..399018730a
--- /dev/null
+++ b/cpp/fury/util/array_util_test.cc
@@ -0,0 +1,33 @@
+#include "fury/util/array_util.h"
+#include "gtest/gtest.h"
+
+namespace fury {
+TEST(GetMaxValueTest, HandlesEmptyArray) {
+  uint16_t arr[] = {};
+  EXPECT_EQ(getMaxValue(arr, 0), 0);
+}
+
+TEST(GetMaxValueTest, HandlesSingleElementArray) {
+  uint16_t arr[] = {42};
+  EXPECT_EQ(getMaxValue(arr, 1), 42);
+}
+
+TEST(GetMaxValueTest, HandlesSmallArray) {
+  uint16_t arr[] = {10, 20, 30, 40, 5};
+  EXPECT_EQ(getMaxValue(arr, 5), 40);
+}
+
+TEST(GetMaxValueTest, HandlesLargeArray) {
+  const size_t length = 1024;
+  uint16_t arr[length];
+  for (size_t i = 0; i < length; ++i) {
+    arr[i] = static_cast<uint16_t>(i);
+  }
+  EXPECT_EQ(getMaxValue(arr, length), 1023);
+}
+}  // namespace fury
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

From f68dce4ac8f0ba4d32293897108bfdbeee96b85e Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Tue, 14 Jan 2025 13:44:10 +0800
Subject: [PATCH 02/23] add SIMD copy uint16 array to uint8 array

---
 cpp/fury/util/array_util.h | 115 +++++++++++++++++++++++++++++++++++--
 1 file changed, 110 insertions(+), 5 deletions(-)

diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index 2dee73e50a..5dcdd31c2f 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -1,8 +1,78 @@
 #include "fury/util/platform.h"
 
 namespace fury {
-#if defined(FURY_HAS_NEON)
-inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
+#if defined(FURY_HAS_IMMINTRIN)
+inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
+  if (length == 0) {
+    return 0;  // Return 0 for empty arrays
+  }
+
+  __m256i max_val = _mm256_setzero_si256();  // Initialize max vector with zeros
+
+  size_t i = 0;
+  for (; i + 16 <= length; i += 16) {
+    __m256i current_val = _mm256_loadu_si256((__m256i*)&arr[i]);
+    max_val = _mm256_max_epu16(max_val, current_val);  // Max operation
+  }
+
+  // Find the max value in the resulting vector
+  uint16_t temp[16];
+  _mm256_storeu_si256((__m256i*)temp, max_val);
+  uint16_t max_avx = temp[0];
+  for (int j = 1; j < 16; j++) {
+    if (temp[j] > max_avx) {
+      max_avx = temp[j];
+    }
+  }
+
+  // Handle remaining elements
+  for (; i < length; i++) {
+    if (arr[i] > max_avx) {
+      max_avx = arr[i];
+    }
+  }
+  return max_avx;
+}
+
+inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+  size_t i = 0;
+  // Process chunks of 32 bytes (16 uint16_t elements at a time)
+  for (; i + 31 < length; i += 32) {
+    // Load two 256-bit blocks (32 uint16_t elements total)
+    __m256i src1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&from[i]));
+    __m256i src2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&from[i + 16]));
+
+    // Narrow the 16-bit integers to 8-bit integers
+    __m256i packed = _mm256_packus_epi16(src1, src2);
+
+    // Shuffle the packed result to interleave lower and upper parts
+    packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
+
+    // Store the result
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&to[i]), packed);
+  }
+  // Check if at least 16 elements are left to process
+  if (i + 15 < length) {
+    // Process the next 16 elements
+    __m256i src1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&from[i]));
+    // Narrow the 16-bit integers to 8-bit integers by zeroing the upper halves
+    __m128i packed1 = _mm256_castsi256_si128(src1);       // Lower 128 bits
+    __m128i packed2 = _mm256_extracti128_si256(src1, 1);  // Upper 128 bits
+    // Pack two 128-wide vectors into 8-bit integers, ignore saturating with itself.
+    __m128i packed = _mm_packus_epi16(packed1, packed2);
+
+    // Store the result; using only the first 128 bits
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&to[i]), packed);
+
+    i += 16;
+  }
+  // Process remaining elements one at a time
+  for (; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
+#elif defined(FURY_HAS_NEON)
+inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   if (length == 0) {
     return 0;  // Return 0 for empty arrays
   }
@@ -33,9 +103,21 @@ inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
   return max_neon;
 }
 
-#elif defined(FURY_HAS_SSE2)
+inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+  size_t i = 0;
+  for (; i + 7 < length; i += 8) {
+    uint16x8_t src = vld1q_u16(&from[i]);
+    uint8x8_t result = vmovn_u16(src);
+    vst1_u8(&to[i], result);
+  }
 
-inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
+  // Fallback for the remainder
+  for (; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
+#elif defined(FURY_HAS_SSE2)
+inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   if (length == 0) {
     return 0;  // Return 0 for empty arrays
   }
@@ -66,8 +148,24 @@ inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
   }
   return max_sse;
 }
+
+inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+  size_t i = 0;
+  __m128i mask = _mm_set1_epi16(0xFF);  // Mask to zero out the high byte
+  for (; i + 7 < length; i += 8) {
+    __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&from[i]));
+    __m128i result = _mm_and_si128(src, mask);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(&to[i]),
+                     _mm_packus_epi16(result, result));
+  }
+
+  // Fallback for the remainder
+  for (; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
 #else
-inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
+inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   if (length == 0) {
     return 0;  // Return 0 for empty arrays
   }
@@ -79,5 +177,12 @@ inline uint16_t getMaxValue(uint16_t* arr, size_t length) {
   }
   return max_val;
 }
+
+inline void copyValue(const uint16_t* from, const uint8_t* to, size_t length) {
+  // Fallback for systems without SSE2/NEON
+  for (size_t i = 0; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
 #endif
 }  // namespace fury

From eb7f7b8cf6475aa062ec4fef88802875b8aed423 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 01:01:23 +0800
Subject: [PATCH 03/23] skip avx for python wheel

---
 cpp/fury/util/BUILD        | 10 ++++++++++
 cpp/fury/util/array_util.h | 12 +++++++-----
 cpp/fury/util/platform.h   |  3 ---
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD
index 8f605dc75e..2a2a6e5396 100644
--- a/cpp/fury/util/BUILD
+++ b/cpp/fury/util/BUILD
@@ -62,4 +62,14 @@ cc_test(
         ":fury_util",
         "@com_google_googletest//:gtest",
     ],
+)
+
+
+cc_test(
+    name = "array_util_test",
+    srcs = ["array_util_test.cc"],
+    deps = [
+        ":fury_util",
+        "@com_google_googletest//:gtest",
+    ],
 )
\ No newline at end of file
diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index 5dcdd31c2f..5decaa3524 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -2,13 +2,14 @@
 
 namespace fury {
 #if defined(FURY_HAS_IMMINTRIN)
-inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
+// TODO: runtime dispatch for avx
+// We can not distribute a seperate wheel for avx, so we need to
+// check instcuctions set at runtime
+inline uint16_t getMaxValueAVX(const uint16_t* arr, size_t length) {
   if (length == 0) {
     return 0;  // Return 0 for empty arrays
   }
-
   __m256i max_val = _mm256_setzero_si256();  // Initialize max vector with zeros
-
   size_t i = 0;
   for (; i + 16 <= length; i += 16) {
     __m256i current_val = _mm256_loadu_si256((__m256i*)&arr[i]);
@@ -34,7 +35,7 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_avx;
 }
 
-inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyValueAVX(const uint16_t* from, uint8_t* to, size_t length) {
   size_t i = 0;
   // Process chunks of 32 bytes (16 uint16_t elements at a time)
   for (; i + 31 < length; i += 32) {
@@ -71,7 +72,8 @@ inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
     to[i] = static_cast<uint8_t>(from[i]);
   }
 }
-#elif defined(FURY_HAS_NEON)
+#endif
+#if defined(FURY_HAS_NEON)
 inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   if (length == 0) {
     return 0;  // Return 0 for empty arrays
diff --git a/cpp/fury/util/platform.h b/cpp/fury/util/platform.h
index 70a699bc20..bc5b13bbb9 100644
--- a/cpp/fury/util/platform.h
+++ b/cpp/fury/util/platform.h
@@ -23,9 +23,6 @@
 #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
 #include <arm_neon.h>
 #define FURY_HAS_NEON
-#elif defined(__SSE2__)
-#include <emmintrin.h>
-#define FURY_HAS_SSE2
 #elif defined(__riscv) && __riscv_vector
 #include <riscv_vector.h>
 #define FURY_HAS_RISCV_VECTOR

From 84e0b0b25d845bcd51ec77e75b5cd24f682451cf Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 01:03:47 +0800
Subject: [PATCH 04/23] enable avx for cpp test

---
 cpp/fury/util/BUILD        | 4 +++-
 cpp/fury/util/array_util.h | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD
index 2a2a6e5396..124825b9b3 100644
--- a/cpp/fury/util/BUILD
+++ b/cpp/fury/util/BUILD
@@ -72,4 +72,6 @@ cc_test(
         ":fury_util",
         "@com_google_googletest//:gtest",
     ],
-)
\ No newline at end of file
+    copts = ["-mavx2"],  # Enable AVX2 support
+    linkopts = ["-mavx2"],  # Ensure linker also knows about AVX2
+)
diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index 5decaa3524..0058c675b6 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -180,7 +180,7 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_val;
 }
 
-inline void copyValue(const uint16_t* from, const uint8_t* to, size_t length) {
+inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
   // Fallback for systems without SSE2/NEON
   for (size_t i = 0; i < length; ++i) {
     to[i] = static_cast<uint8_t>(from[i]);

From 9fd56f06691a8bb000d5a2e7b4fc4fbcd891b0a7 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 13:21:46 +0800
Subject: [PATCH 05/23] implement pyunicode library

---
 BUILD                              |  3 +
 cpp/fury/python/BUILD              | 33 ++++++++++
 cpp/fury/python/pyunicode.cc       | 24 ++++++++
 cpp/fury/python/pyunicode.h        | 97 ++++++++++++++++++++++++++++++
 python/pyfury/includes/libutil.pxd |  6 ++
 5 files changed, 163 insertions(+)
 create mode 100644 cpp/fury/python/BUILD
 create mode 100644 cpp/fury/python/pyunicode.cc
 create mode 100644 cpp/fury/python/pyunicode.h

diff --git a/BUILD b/BUILD
index d5f1063377..c3c58b4eef 100644
--- a/BUILD
+++ b/BUILD
@@ -31,6 +31,7 @@ pyx_library(
     ),
     deps = [
         "//cpp/fury/util:fury_util",
+        "//cpp/fury/python:pyunicode",
     ],
 )
 
@@ -63,6 +64,7 @@ pyx_library(
     deps = [
         "//cpp/fury/util:fury_util",
         "//cpp/fury/type:fury_type",
+        "//cpp/fury/python:pyunicode",
         "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
@@ -83,6 +85,7 @@ pyx_library(
     ),
     deps = [
         "//cpp/fury:fury",
+        "//cpp/fury/python:pyunicode",
         "@local_config_pyarrow//:python_numpy_headers",
         "@local_config_pyarrow//:arrow_python_shared_library"
     ],
diff --git a/cpp/fury/python/BUILD b/cpp/fury/python/BUILD
new file mode 100644
index 0000000000..6b218d0a60
--- /dev/null
+++ b/cpp/fury/python/BUILD
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+load("@com_github_grpc_grpc//bazel:cython_library.bzl", "pyx_library")
+
+cc_library(
+    name = "pyunicode",
+    srcs = ["pyunicode.cc"],
+    hdrs = ["pyunicode.h"],
+    alwayslink=True,
+    linkstatic=True,
+    strip_include_prefix = "/cpp",
+    deps = [
+        "//cpp/fury/util:fury_util",
+        "@local_config_python//:python_headers",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
new file mode 100644
index 0000000000..c25ce80945
--- /dev/null
+++ b/cpp/fury/python/pyunicode.cc
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "pyunicode.h"
+
+namespace fury {
+
+}  // namespace fury
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
new file mode 100644
index 0000000000..1c0ea57861
--- /dev/null
+++ b/cpp/fury/python/pyunicode.h
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <string>
+#include <cstring>
+#include "fury/util/array_util.h"
+#include "fury/util/buffer.h"
+#include "fury/util/logging.h"
+#include "fury/util/string_util.h"
+#include "pyport.h"
+#include "object.h"
+#include "unicodeobject.h"
+
+namespace fury {
+
+static PyObject* unicode_latin1[256] = {nullptr};
+
+static PyObject* get_latin1_char(unsigned char ch) {
+  PyObject* unicode = unicode_latin1[ch];
+  if (!unicode) {
+    unicode = PyUnicode_New(1, ch);
+    if (!unicode) return NULL;
+    PyUnicode_1BYTE_DATA(unicode)[0] = ch;
+    // assert(_PyUnicode_CheckConsistency(unicode, 1));
+    unicode_latin1[ch] = unicode;
+  }
+  Py_INCREF(unicode);
+  return unicode;
+}
+
+// unicodeobject.c
+inline PyObject* Fury_PyUnicode_FromUCS1(const uint8_t* u, Py_ssize_t size) {
+  PyObject* res;
+  unsigned char max_char;
+  FURY_CHECK(size > 0);
+  if (size == 1) return get_latin1_char(u[0]);
+  max_char = isAscii(reinterpret_cast<const char*>(u), size) ? 127 : 255;
+  res = PyUnicode_New(size, max_char);
+  if (!res) return NULL;
+  std::memcpy(PyUnicode_1BYTE_DATA(res), u, size);
+  // assert(_PyUnicode_CheckConsistency(res, 1));
+  return res;
+}
+
+inline PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size) {
+  PyObject* res;
+  Py_UCS2 max_char;
+  FURY_CHECK(size > 0);
+  if (size == 1) {
+    max_char = u[0];
+    if (max_char < 256) {
+      return get_latin1_char(max_char);
+    } else {
+      res = PyUnicode_New(1, max_char);
+      if (res == NULL) {
+        return NULL;
+      }
+      if (PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND) {
+        PyUnicode_2BYTE_DATA(res)[0] = (Py_UCS2)max_char;
+      } else {
+        FURY_CHECK(PyUnicode_KIND(res) == PyUnicode_4BYTE_KIND);
+        PyUnicode_4BYTE_DATA(res)[0] = max_char;
+      }
+      return res;
+    }
+  }
+  max_char = getMaxValue(u, size);
+  res = PyUnicode_New(size, max_char);
+  if (!res) {
+    return NULL;
+  }
+  if (max_char >= 256) {
+    std::memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2) * size);
+  } else {
+     copyValue(u, PyUnicode_1BYTE_DATA(res), size);
+  }
+  // assert(_PyUnicode_CheckConsistency(res, 1));
+  return res;
+}
+
+}  // namespace fury
diff --git a/python/pyfury/includes/libutil.pxd b/python/pyfury/includes/libutil.pxd
index 72a640033d..5618417c23 100644
--- a/python/pyfury/includes/libutil.pxd
+++ b/python/pyfury/includes/libutil.pxd
@@ -19,6 +19,7 @@ from libc.stdint cimport *
 from libcpp cimport bool as c_bool
 from libcpp.memory cimport shared_ptr
 from libcpp.string cimport string as c_string
+from cpython cimport PyObject
 
 cdef extern from "fury/util/buffer.h" namespace "fury" nogil:
     cdef cppclass CStatus" fury::Status":
@@ -111,3 +112,8 @@ cdef extern from "fury/util/bit_util.h" namespace "fury::util" nogil:
 
 cdef extern from "fury/util/string_util.h" namespace "fury" nogil:
     c_bool utf16HasSurrogatePairs(uint16_t* data, size_t size)
+
+
+cdef extern from "fury/python/pyunicode.h" namespace "fury" nogil:
+    PyObject* Fury_PyUnicode_FromUCS1(const uint8_t* u, Py_ssize_t size)
+    PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size)

From 77fbec9351bc45c45be1e175082c89a5e78cdfc9 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:10:36 +0800
Subject: [PATCH 06/23] use pyunicode for python ucs1/2 string decoding

---
 cpp/fury/python/pyunicode.cc       | 63 +++++++++++++++++++++++++++++
 cpp/fury/python/pyunicode.h        | 64 +-----------------------------
 cpp/fury/util/array_util_test.cc   |  4 +-
 cpp/fury/util/platform.h           |  4 ++
 cpp/fury/util/string_util_test.cc  | 21 ++++++++--
 python/pyfury/_util.pyx            | 11 +++--
 python/pyfury/includes/libutil.pxd |  2 +-
 7 files changed, 97 insertions(+), 72 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index c25ce80945..e875c357e3 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -21,4 +21,67 @@
 
 namespace fury {
 
+static PyObject* unicode_latin1[256] = {nullptr};
+
+static PyObject* get_latin1_char(unsigned char ch) {
+  PyObject* unicode = unicode_latin1[ch];
+  if (!unicode) {
+    unicode = PyUnicode_New(1, ch);
+    if (!unicode) return NULL;
+    PyUnicode_1BYTE_DATA(unicode)[0] = ch;
+    // assert(_PyUnicode_CheckConsistency(unicode, 1));
+    unicode_latin1[ch] = unicode;
+  }
+  Py_INCREF(unicode);
+  return unicode;
+}
+
+PyObject* Fury_PyUnicode_FromUCS1(const char* u, Py_ssize_t size) {
+  PyObject* res;
+  unsigned char max_char;
+  FURY_CHECK(size > 0);
+  if (size == 1) return get_latin1_char(u[0]);
+  max_char = isAscii(reinterpret_cast<const char*>(u), size) ? 127 : 255;
+  res = PyUnicode_New(size, max_char);
+  if (!res) return NULL;
+  memcpy(PyUnicode_1BYTE_DATA(res), u, size);
+  // assert(_PyUnicode_CheckConsistency(res, 1));
+  return res;
+}
+
+PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size) {
+  PyObject* res;
+  Py_UCS2 max_char;
+  FURY_CHECK(size > 0);
+  if (size == 1) {
+    max_char = u[0];
+    if (max_char < 256) {
+      return get_latin1_char(max_char);
+    } else {
+      res = PyUnicode_New(1, max_char);
+      if (res == NULL) {
+        return NULL;
+      }
+      if (PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND) {
+        PyUnicode_2BYTE_DATA(res)[0] = (Py_UCS2)max_char;
+      } else {
+        FURY_CHECK(PyUnicode_KIND(res) == PyUnicode_4BYTE_KIND);
+        PyUnicode_4BYTE_DATA(res)[0] = max_char;
+      }
+      return res;
+    }
+  }
+  max_char = getMaxValue(u, size);
+  res = PyUnicode_New(size, max_char);
+  if (!res) {
+    return NULL;
+  }
+  if (max_char >= 256) {
+    memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2) * size);
+  } else {
+     copyValue(u, PyUnicode_1BYTE_DATA(res), size);
+  }
+  // assert(_PyUnicode_CheckConsistency(res, 1));
+  return res;
+}
 }  // namespace fury
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 1c0ea57861..2512fbb45b 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -29,69 +29,9 @@
 
 namespace fury {
 
-static PyObject* unicode_latin1[256] = {nullptr};
-
-static PyObject* get_latin1_char(unsigned char ch) {
-  PyObject* unicode = unicode_latin1[ch];
-  if (!unicode) {
-    unicode = PyUnicode_New(1, ch);
-    if (!unicode) return NULL;
-    PyUnicode_1BYTE_DATA(unicode)[0] = ch;
-    // assert(_PyUnicode_CheckConsistency(unicode, 1));
-    unicode_latin1[ch] = unicode;
-  }
-  Py_INCREF(unicode);
-  return unicode;
-}
-
 // unicodeobject.c
-inline PyObject* Fury_PyUnicode_FromUCS1(const uint8_t* u, Py_ssize_t size) {
-  PyObject* res;
-  unsigned char max_char;
-  FURY_CHECK(size > 0);
-  if (size == 1) return get_latin1_char(u[0]);
-  max_char = isAscii(reinterpret_cast<const char*>(u), size) ? 127 : 255;
-  res = PyUnicode_New(size, max_char);
-  if (!res) return NULL;
-  std::memcpy(PyUnicode_1BYTE_DATA(res), u, size);
-  // assert(_PyUnicode_CheckConsistency(res, 1));
-  return res;
-}
+PyObject* Fury_PyUnicode_FromUCS1(const char* u, Py_ssize_t size);
 
-inline PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size) {
-  PyObject* res;
-  Py_UCS2 max_char;
-  FURY_CHECK(size > 0);
-  if (size == 1) {
-    max_char = u[0];
-    if (max_char < 256) {
-      return get_latin1_char(max_char);
-    } else {
-      res = PyUnicode_New(1, max_char);
-      if (res == NULL) {
-        return NULL;
-      }
-      if (PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND) {
-        PyUnicode_2BYTE_DATA(res)[0] = (Py_UCS2)max_char;
-      } else {
-        FURY_CHECK(PyUnicode_KIND(res) == PyUnicode_4BYTE_KIND);
-        PyUnicode_4BYTE_DATA(res)[0] = max_char;
-      }
-      return res;
-    }
-  }
-  max_char = getMaxValue(u, size);
-  res = PyUnicode_New(size, max_char);
-  if (!res) {
-    return NULL;
-  }
-  if (max_char >= 256) {
-    std::memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2) * size);
-  } else {
-     copyValue(u, PyUnicode_1BYTE_DATA(res), size);
-  }
-  // assert(_PyUnicode_CheckConsistency(res, 1));
-  return res;
-}
+PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size);
 
 }  // namespace fury
diff --git a/cpp/fury/util/array_util_test.cc b/cpp/fury/util/array_util_test.cc
index 399018730a..50bfc3c92d 100644
--- a/cpp/fury/util/array_util_test.cc
+++ b/cpp/fury/util/array_util_test.cc
@@ -25,9 +25,9 @@ TEST(GetMaxValueTest, HandlesLargeArray) {
   }
   EXPECT_EQ(getMaxValue(arr, length), 1023);
 }
-}  // namespace fury
+} // namespace fury
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/cpp/fury/util/platform.h b/cpp/fury/util/platform.h
index bc5b13bbb9..0c725478d7 100644
--- a/cpp/fury/util/platform.h
+++ b/cpp/fury/util/platform.h
@@ -27,3 +27,7 @@
 #include <riscv_vector.h>
 #define FURY_HAS_RISCV_VECTOR
 #endif
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#define FURY_HAS_SSE2
+#endif
diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc
index f57f75186f..080fd603ed 100644
--- a/cpp/fury/util/string_util_test.cc
+++ b/cpp/fury/util/string_util_test.cc
@@ -58,21 +58,34 @@ bool isAscii_BaseLine(const std::string &str) {
 TEST(StringUtilTest, TestisAsciiFunctions) {
   std::string testStr = generateRandomString(100000);
   auto start_time = std::chrono::high_resolution_clock::now();
-  bool result = isAscii_BaseLine(testStr);
+  bool result;
+  int c = 0;
+  for (size_t i = 0; i < 10000; i++) {
+    result = isAscii_BaseLine(testStr);
+    if (result) {
+      c++;
+    }
+  }
+
   auto end_time = std::chrono::high_resolution_clock::now();
   auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
                       end_time - start_time)
                       .count();
   FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns.";
-
+  FURY_LOG(DEBUG) << "Avoid compiler optimized loop " << c;
   start_time = std::chrono::high_resolution_clock::now();
-  result = isAscii(testStr);
+  for (size_t i = 0; i < 10000; i++) {
+    result = isAscii(testStr);
+    if (result) {
+      c++;
+    }
+  }
   end_time = std::chrono::high_resolution_clock::now();
   duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time -
                                                                   start_time)
                  .count();
   FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns.";
-
+  FURY_LOG(DEBUG) << "Avoid compiler optimized loop " << c;
   EXPECT_TRUE(result);
 }
 
diff --git a/python/pyfury/_util.pyx b/python/pyfury/_util.pyx
index ca87d81e0c..0db77a81ad 100644
--- a/python/pyfury/_util.pyx
+++ b/python/pyfury/_util.pyx
@@ -27,7 +27,8 @@ from libcpp.memory cimport shared_ptr, make_shared
 from libc.stdint cimport *
 from libcpp cimport bool as c_bool
 from pyfury.includes.libutil cimport(
-    CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, StatusCode, utf16HasSurrogatePairs
+    CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, StatusCode, utf16HasSurrogatePairs,
+    Fury_PyUnicode_FromUCS1, Fury_PyUnicode_FromUCS2
 )
 
 cdef int32_t max_buffer_size = 2 ** 31 - 1
@@ -572,12 +573,15 @@ cdef class Buffer:
         cdef uint64_t header = self.read_varuint64()
         cdef uint32_t size = header >> 2
         self.check_bound(self.reader_index, size)
+        if size == 0:
+            return ""
         cdef const char * buf = <const char *>(self.c_buffer.get().data() + self.reader_index)
         self.reader_index += size
         cdef uint32_t encoding = header & <uint32_t>0b11
         if encoding == 0:
             # PyUnicode_FromASCII
-            return PyUnicode_DecodeLatin1(buf, size, "strict")
+            return <unicode>Fury_PyUnicode_FromUCS1(buf, size)
+            # return PyUnicode_DecodeLatin1(buf, size, "strict")
         elif encoding == 1:
             if utf16HasSurrogatePairs(<const uint16_t *>buf, size >> 1):
                 return PyUnicode_DecodeUTF16(
@@ -587,7 +591,8 @@ cdef class Buffer:
                     &UTF16_LE,  # fury use little-endian
                 )
             else:
-                return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, buf, size >> 1)
+                # return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, buf, size >> 1)
+                return <unicode>Fury_PyUnicode_FromUCS2(<const uint16_t *>buf, size >> 1)
         else:
             return PyUnicode_DecodeUTF8(buf, size, "strict")
 
diff --git a/python/pyfury/includes/libutil.pxd b/python/pyfury/includes/libutil.pxd
index 5618417c23..b79287a659 100644
--- a/python/pyfury/includes/libutil.pxd
+++ b/python/pyfury/includes/libutil.pxd
@@ -115,5 +115,5 @@ cdef extern from "fury/util/string_util.h" namespace "fury" nogil:
 
 
 cdef extern from "fury/python/pyunicode.h" namespace "fury" nogil:
-    PyObject* Fury_PyUnicode_FromUCS1(const uint8_t* u, Py_ssize_t size)
+    PyObject* Fury_PyUnicode_FromUCS1(const char* u, Py_ssize_t size)
     PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size)

From ec2c4d4474fd6b84b440e1332ece505b2b056c6f Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:13:48 +0800
Subject: [PATCH 07/23] remove avx getMaxValue and copyValue

---
 cpp/fury/util/array_util.h | 72 --------------------------------------
 1 file changed, 72 deletions(-)

diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index 0058c675b6..b4c88866a0 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -1,78 +1,6 @@
 #include "fury/util/platform.h"
 
 namespace fury {
-#if defined(FURY_HAS_IMMINTRIN)
-// TODO: runtime dispatch for avx
-// We can not distribute a seperate wheel for avx, so we need to
-// check instcuctions set at runtime
-inline uint16_t getMaxValueAVX(const uint16_t* arr, size_t length) {
-  if (length == 0) {
-    return 0;  // Return 0 for empty arrays
-  }
-  __m256i max_val = _mm256_setzero_si256();  // Initialize max vector with zeros
-  size_t i = 0;
-  for (; i + 16 <= length; i += 16) {
-    __m256i current_val = _mm256_loadu_si256((__m256i*)&arr[i]);
-    max_val = _mm256_max_epu16(max_val, current_val);  // Max operation
-  }
-
-  // Find the max value in the resulting vector
-  uint16_t temp[16];
-  _mm256_storeu_si256((__m256i*)temp, max_val);
-  uint16_t max_avx = temp[0];
-  for (int j = 1; j < 16; j++) {
-    if (temp[j] > max_avx) {
-      max_avx = temp[j];
-    }
-  }
-
-  // Handle remaining elements
-  for (; i < length; i++) {
-    if (arr[i] > max_avx) {
-      max_avx = arr[i];
-    }
-  }
-  return max_avx;
-}
-
-inline void copyValueAVX(const uint16_t* from, uint8_t* to, size_t length) {
-  size_t i = 0;
-  // Process chunks of 32 bytes (16 uint16_t elements at a time)
-  for (; i + 31 < length; i += 32) {
-    // Load two 256-bit blocks (32 uint16_t elements total)
-    __m256i src1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&from[i]));
-    __m256i src2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&from[i + 16]));
-
-    // Narrow the 16-bit integers to 8-bit integers
-    __m256i packed = _mm256_packus_epi16(src1, src2);
-
-    // Shuffle the packed result to interleave lower and upper parts
-    packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-
-    // Store the result
-    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&to[i]), packed);
-  }
-  // Check if at least 16 elements are left to process
-  if (i + 15 < length) {
-    // Process the next 16 elements
-    __m256i src1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&from[i]));
-    // Narrow the 16-bit integers to 8-bit integers by zeroing the upper halves
-    __m128i packed1 = _mm256_castsi256_si128(src1);       // Lower 128 bits
-    __m128i packed2 = _mm256_extracti128_si256(src1, 1);  // Upper 128 bits
-    // Pack two 128-wide vectors into 8-bit integers, ignore saturating with itself.
-    __m128i packed = _mm_packus_epi16(packed1, packed2);
-
-    // Store the result; using only the first 128 bits
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(&to[i]), packed);
-
-    i += 16;
-  }
-  // Process remaining elements one at a time
-  for (; i < length; ++i) {
-    to[i] = static_cast<uint8_t>(from[i]);
-  }
-}
-#endif
 #if defined(FURY_HAS_NEON)
 inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   if (length == 0) {

From a0d74f14220b146b9562238651fce26c60db90a9 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:14:18 +0800
Subject: [PATCH 08/23] rename copyValue to copyArray

---
 cpp/fury/python/pyunicode.cc | 2 +-
 cpp/fury/util/array_util.h   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index e875c357e3..57654ffead 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -79,7 +79,7 @@ PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size) {
   if (max_char >= 256) {
     memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2) * size);
   } else {
-     copyValue(u, PyUnicode_1BYTE_DATA(res), size);
+     copyArray(u, PyUnicode_1BYTE_DATA(res), size);
   }
   // assert(_PyUnicode_CheckConsistency(res, 1));
   return res;
diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index b4c88866a0..b826b08572 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -33,7 +33,7 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_neon;
 }
 
-inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
   size_t i = 0;
   for (; i + 7 < length; i += 8) {
     uint16x8_t src = vld1q_u16(&from[i]);
@@ -79,7 +79,7 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_sse;
 }
 
-inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
   size_t i = 0;
   __m128i mask = _mm_set1_epi16(0xFF);  // Mask to zero out the high byte
   for (; i + 7 < length; i += 8) {
@@ -108,7 +108,7 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_val;
 }
 
-inline void copyValue(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
   // Fallback for systems without SSE2/NEON
   for (size_t i = 0; i < length; ++i) {
     to[i] = static_cast<uint8_t>(from[i]);

From 8e2a4b261f9264ee764a3ce47d7be2eda2919828 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:20:58 +0800
Subject: [PATCH 09/23] add header and #pragma once

---
 cpp/fury/util/array_util.h       | 20 ++++++++++++++++++++
 cpp/fury/util/array_util_test.cc | 19 +++++++++++++++++++
 cpp/fury/util/platform.h         |  2 ++
 3 files changed, 41 insertions(+)

diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index b826b08572..e4df6fd148 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -1,3 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
 #include "fury/util/platform.h"
 
 namespace fury {
diff --git a/cpp/fury/util/array_util_test.cc b/cpp/fury/util/array_util_test.cc
index 50bfc3c92d..669bb96223 100644
--- a/cpp/fury/util/array_util_test.cc
+++ b/cpp/fury/util/array_util_test.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #include "fury/util/array_util.h"
 #include "gtest/gtest.h"
 
diff --git a/cpp/fury/util/platform.h b/cpp/fury/util/platform.h
index 0c725478d7..9aa562a3db 100644
--- a/cpp/fury/util/platform.h
+++ b/cpp/fury/util/platform.h
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#pragma once
+
 #if defined(__x86_64__) || defined(_M_X64)
 #include <immintrin.h>
 #define FURY_HAS_IMMINTRIN

From d1d02e71ef2e6a4a3065fef473ffae247c0659f0 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:24:46 +0800
Subject: [PATCH 10/23] add cstdint include

---
 cpp/fury/util/array_util.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index e4df6fd148..ef7328ab70 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -18,6 +18,7 @@
  */
 
 #pragma once
+#include <cstdint>
 #include "fury/util/platform.h"
 
 namespace fury {

From 221a6f10962ecc855d39c989dafc85892f07239a Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:25:46 +0800
Subject: [PATCH 11/23] lint code

---
 cpp/fury/python/pyunicode.cc | 29 ++++++++++++++------------
 cpp/fury/python/pyunicode.h  | 12 +++++------
 cpp/fury/util/array_util.h   | 40 ++++++++++++++++++------------------
 3 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index 57654ffead..e8cfee7cae 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -21,13 +21,14 @@
 
 namespace fury {
 
-static PyObject* unicode_latin1[256] = {nullptr};
+static PyObject *unicode_latin1[256] = {nullptr};
 
-static PyObject* get_latin1_char(unsigned char ch) {
-  PyObject* unicode = unicode_latin1[ch];
+static PyObject *get_latin1_char(unsigned char ch) {
+  PyObject *unicode = unicode_latin1[ch];
   if (!unicode) {
     unicode = PyUnicode_New(1, ch);
-    if (!unicode) return NULL;
+    if (!unicode)
+      return NULL;
     PyUnicode_1BYTE_DATA(unicode)[0] = ch;
     // assert(_PyUnicode_CheckConsistency(unicode, 1));
     unicode_latin1[ch] = unicode;
@@ -36,21 +37,23 @@ static PyObject* get_latin1_char(unsigned char ch) {
   return unicode;
 }
 
-PyObject* Fury_PyUnicode_FromUCS1(const char* u, Py_ssize_t size) {
-  PyObject* res;
+PyObject *Fury_PyUnicode_FromUCS1(const char *u, Py_ssize_t size) {
+  PyObject *res;
   unsigned char max_char;
   FURY_CHECK(size > 0);
-  if (size == 1) return get_latin1_char(u[0]);
-  max_char = isAscii(reinterpret_cast<const char*>(u), size) ? 127 : 255;
+  if (size == 1)
+    return get_latin1_char(u[0]);
+  max_char = isAscii(reinterpret_cast<const char *>(u), size) ? 127 : 255;
   res = PyUnicode_New(size, max_char);
-  if (!res) return NULL;
+  if (!res)
+    return NULL;
   memcpy(PyUnicode_1BYTE_DATA(res), u, size);
   // assert(_PyUnicode_CheckConsistency(res, 1));
   return res;
 }
 
-PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size) {
-  PyObject* res;
+PyObject *Fury_PyUnicode_FromUCS2(const uint16_t *u, Py_ssize_t size) {
+  PyObject *res;
   Py_UCS2 max_char;
   FURY_CHECK(size > 0);
   if (size == 1) {
@@ -79,9 +82,9 @@ PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size) {
   if (max_char >= 256) {
     memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2) * size);
   } else {
-     copyArray(u, PyUnicode_1BYTE_DATA(res), size);
+    copyArray(u, PyUnicode_1BYTE_DATA(res), size);
   }
   // assert(_PyUnicode_CheckConsistency(res, 1));
   return res;
 }
-}  // namespace fury
+} // namespace fury
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 2512fbb45b..76b96751ba 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -17,21 +17,21 @@
  * under the License.
  */
 
-#include <string>
-#include <cstring>
 #include "fury/util/array_util.h"
 #include "fury/util/buffer.h"
 #include "fury/util/logging.h"
 #include "fury/util/string_util.h"
-#include "pyport.h"
 #include "object.h"
+#include "pyport.h"
 #include "unicodeobject.h"
+#include <cstring>
+#include <string>
 
 namespace fury {
 
 // unicodeobject.c
-PyObject* Fury_PyUnicode_FromUCS1(const char* u, Py_ssize_t size);
+PyObject *Fury_PyUnicode_FromUCS1(const char *u, Py_ssize_t size);
 
-PyObject* Fury_PyUnicode_FromUCS2(const uint16_t* u, Py_ssize_t size);
+PyObject *Fury_PyUnicode_FromUCS2(const uint16_t *u, Py_ssize_t size);
 
-}  // namespace fury
+} // namespace fury
diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index ef7328ab70..9bc7053dc9 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -18,21 +18,21 @@
  */
 
 #pragma once
-#include <cstdint>
 #include "fury/util/platform.h"
+#include <cstdint>
 
 namespace fury {
 #if defined(FURY_HAS_NEON)
-inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
+inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
   if (length == 0) {
-    return 0;  // Return 0 for empty arrays
+    return 0; // Return 0 for empty arrays
   }
-  uint16x8_t max_val = vdupq_n_u16(0);  // Initialize max vector to zero
+  uint16x8_t max_val = vdupq_n_u16(0); // Initialize max vector to zero
 
   size_t i = 0;
   for (; i + 8 <= length; i += 8) {
     uint16x8_t current_val = vld1q_u16(&arr[i]);
-    max_val = vmaxq_u16(max_val, current_val);  // Max operation
+    max_val = vmaxq_u16(max_val, current_val); // Max operation
   }
 
   // Find the max value in the resulting vector
@@ -54,7 +54,7 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_neon;
 }
 
-inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
   size_t i = 0;
   for (; i + 7 < length; i += 8) {
     uint16x8_t src = vld1q_u16(&from[i]);
@@ -68,22 +68,22 @@ inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
   }
 }
 #elif defined(FURY_HAS_SSE2)
-inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
+inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
   if (length == 0) {
-    return 0;  // Return 0 for empty arrays
+    return 0; // Return 0 for empty arrays
   }
 
-  __m128i max_val = _mm_setzero_si128();  // Initialize max vector with zeros
+  __m128i max_val = _mm_setzero_si128(); // Initialize max vector with zeros
 
   size_t i = 0;
   for (; i + 8 <= length; i += 8) {
-    __m128i current_val = _mm_loadu_si128((__m128i*)&arr[i]);
-    max_val = _mm_max_epu16(max_val, current_val);  // Max operation
+    __m128i current_val = _mm_loadu_si128((__m128i *)&arr[i]);
+    max_val = _mm_max_epu16(max_val, current_val); // Max operation
   }
 
   // Find the max value in the resulting vector
   uint16_t temp[8];
-  _mm_storeu_si128((__m128i*)temp, max_val);
+  _mm_storeu_si128((__m128i *)temp, max_val);
   uint16_t max_sse = temp[0];
   for (int j = 1; j < 8; j++) {
     if (temp[j] > max_sse) {
@@ -100,13 +100,13 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_sse;
 }
 
-inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
   size_t i = 0;
-  __m128i mask = _mm_set1_epi16(0xFF);  // Mask to zero out the high byte
+  __m128i mask = _mm_set1_epi16(0xFF); // Mask to zero out the high byte
   for (; i + 7 < length; i += 8) {
-    __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&from[i]));
+    __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&from[i]));
     __m128i result = _mm_and_si128(src, mask);
-    _mm_storel_epi64(reinterpret_cast<__m128i*>(&to[i]),
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&to[i]),
                      _mm_packus_epi16(result, result));
   }
 
@@ -116,9 +116,9 @@ inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
   }
 }
 #else
-inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
+inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
   if (length == 0) {
-    return 0;  // Return 0 for empty arrays
+    return 0; // Return 0 for empty arrays
   }
   uint16_t max_val = arr[0];
   for (size_t i = 1; i < length; i++) {
@@ -129,11 +129,11 @@ inline uint16_t getMaxValue(const uint16_t* arr, size_t length) {
   return max_val;
 }
 
-inline void copyArray(const uint16_t* from, uint8_t* to, size_t length) {
+inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
   // Fallback for systems without SSE2/NEON
   for (size_t i = 0; i < length; ++i) {
     to[i] = static_cast<uint8_t>(from[i]);
   }
 }
 #endif
-}  // namespace fury
+} // namespace fury

From 4793946c468753ad0e3639c7305acc8655a95a6e Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:28:12 +0800
Subject: [PATCH 12/23] add #include <cassert>

---
 cpp/fury/python/pyunicode.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index e8cfee7cae..a3454f02a9 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -18,6 +18,7 @@
  */
 
 #include "pyunicode.h"
+#include <cassert>
 
 namespace fury {
 

From 6f0a64b0eb5ff5286081f81bbe10dcba62d9cb7e Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:32:29 +0800
Subject: [PATCH 13/23] remove array util inline

---
 cpp/fury/util/array_util.cc | 139 ++++++++++++++++++++++++++++++++++++
 cpp/fury/util/array_util.h  | 115 +----------------------------
 2 files changed, 141 insertions(+), 113 deletions(-)
 create mode 100644 cpp/fury/util/array_util.cc

diff --git a/cpp/fury/util/array_util.cc b/cpp/fury/util/array_util.cc
new file mode 100644
index 0000000000..65a4cd862f
--- /dev/null
+++ b/cpp/fury/util/array_util.cc
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#include "fury/util/platform.h"
+#include <cstdint>
+
+namespace fury {
+#if defined(FURY_HAS_NEON)
+uint16_t getMaxValue(const uint16_t *arr, size_t length) {
+  if (length == 0) {
+    return 0; // Return 0 for empty arrays
+  }
+  uint16x8_t max_val = vdupq_n_u16(0); // Initialize max vector to zero
+
+  size_t i = 0;
+  for (; i + 8 <= length; i += 8) {
+    uint16x8_t current_val = vld1q_u16(&arr[i]);
+    max_val = vmaxq_u16(max_val, current_val); // Max operation
+  }
+
+  // Find the max value in the resulting vector
+  uint16_t temp[8];
+  vst1q_u16(temp, max_val);
+  uint16_t max_neon = temp[0];
+  for (int j = 1; j < 8; j++) {
+    if (temp[j] > max_neon) {
+      max_neon = temp[j];
+    }
+  }
+
+  // Handle remaining elements
+  for (; i < length; i++) {
+    if (arr[i] > max_neon) {
+      max_neon = arr[i];
+    }
+  }
+  return max_neon;
+}
+
+void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
+  size_t i = 0;
+  for (; i + 7 < length; i += 8) {
+    uint16x8_t src = vld1q_u16(&from[i]);
+    uint8x8_t result = vmovn_u16(src);
+    vst1_u8(&to[i], result);
+  }
+
+  // Fallback for the remainder
+  for (; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
+#elif defined(FURY_HAS_SSE2)
+uint16_t getMaxValue(const uint16_t *arr, size_t length) {
+  if (length == 0) {
+    return 0; // Return 0 for empty arrays
+  }
+
+  __m128i max_val = _mm_setzero_si128(); // Initialize max vector with zeros
+
+  size_t i = 0;
+  for (; i + 8 <= length; i += 8) {
+    __m128i current_val = _mm_loadu_si128((__m128i *)&arr[i]);
+    max_val = _mm_max_epu16(max_val, current_val); // Max operation
+  }
+
+  // Find the max value in the resulting vector
+  uint16_t temp[8];
+  _mm_storeu_si128((__m128i *)temp, max_val);
+  uint16_t max_sse = temp[0];
+  for (int j = 1; j < 8; j++) {
+    if (temp[j] > max_sse) {
+      max_sse = temp[j];
+    }
+  }
+
+  // Handle remaining elements
+  for (; i < length; i++) {
+    if (arr[i] > max_sse) {
+      max_sse = arr[i];
+    }
+  }
+  return max_sse;
+}
+
+void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
+  size_t i = 0;
+  __m128i mask = _mm_set1_epi16(0xFF); // Mask to zero out the high byte
+  for (; i + 7 < length; i += 8) {
+    __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&from[i]));
+    __m128i result = _mm_and_si128(src, mask);
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&to[i]),
+                     _mm_packus_epi16(result, result));
+  }
+
+  // Fallback for the remainder
+  for (; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
+#else
+uint16_t getMaxValue(const uint16_t *arr, size_t length) {
+  if (length == 0) {
+    return 0; // Return 0 for empty arrays
+  }
+  uint16_t max_val = arr[0];
+  for (size_t i = 1; i < length; i++) {
+    if (arr[i] > max_val) {
+      max_val = arr[i];
+    }
+  }
+  return max_val;
+}
+
+void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
+  // Fallback for systems without SSE2/NEON
+  for (size_t i = 0; i < length; ++i) {
+    to[i] = static_cast<uint8_t>(from[i]);
+  }
+}
+#endif
+} // namespace fury
diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index 9bc7053dc9..e7d89f275d 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -22,118 +22,7 @@
 #include <cstdint>
 
 namespace fury {
-#if defined(FURY_HAS_NEON)
-inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
-  if (length == 0) {
-    return 0; // Return 0 for empty arrays
-  }
-  uint16x8_t max_val = vdupq_n_u16(0); // Initialize max vector to zero
+uint16_t getMaxValue(const uint16_t *arr, size_t length);
 
-  size_t i = 0;
-  for (; i + 8 <= length; i += 8) {
-    uint16x8_t current_val = vld1q_u16(&arr[i]);
-    max_val = vmaxq_u16(max_val, current_val); // Max operation
-  }
-
-  // Find the max value in the resulting vector
-  uint16_t temp[8];
-  vst1q_u16(temp, max_val);
-  uint16_t max_neon = temp[0];
-  for (int j = 1; j < 8; j++) {
-    if (temp[j] > max_neon) {
-      max_neon = temp[j];
-    }
-  }
-
-  // Handle remaining elements
-  for (; i < length; i++) {
-    if (arr[i] > max_neon) {
-      max_neon = arr[i];
-    }
-  }
-  return max_neon;
-}
-
-inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
-  size_t i = 0;
-  for (; i + 7 < length; i += 8) {
-    uint16x8_t src = vld1q_u16(&from[i]);
-    uint8x8_t result = vmovn_u16(src);
-    vst1_u8(&to[i], result);
-  }
-
-  // Fallback for the remainder
-  for (; i < length; ++i) {
-    to[i] = static_cast<uint8_t>(from[i]);
-  }
-}
-#elif defined(FURY_HAS_SSE2)
-inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
-  if (length == 0) {
-    return 0; // Return 0 for empty arrays
-  }
-
-  __m128i max_val = _mm_setzero_si128(); // Initialize max vector with zeros
-
-  size_t i = 0;
-  for (; i + 8 <= length; i += 8) {
-    __m128i current_val = _mm_loadu_si128((__m128i *)&arr[i]);
-    max_val = _mm_max_epu16(max_val, current_val); // Max operation
-  }
-
-  // Find the max value in the resulting vector
-  uint16_t temp[8];
-  _mm_storeu_si128((__m128i *)temp, max_val);
-  uint16_t max_sse = temp[0];
-  for (int j = 1; j < 8; j++) {
-    if (temp[j] > max_sse) {
-      max_sse = temp[j];
-    }
-  }
-
-  // Handle remaining elements
-  for (; i < length; i++) {
-    if (arr[i] > max_sse) {
-      max_sse = arr[i];
-    }
-  }
-  return max_sse;
-}
-
-inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
-  size_t i = 0;
-  __m128i mask = _mm_set1_epi16(0xFF); // Mask to zero out the high byte
-  for (; i + 7 < length; i += 8) {
-    __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&from[i]));
-    __m128i result = _mm_and_si128(src, mask);
-    _mm_storel_epi64(reinterpret_cast<__m128i *>(&to[i]),
-                     _mm_packus_epi16(result, result));
-  }
-
-  // Fallback for the remainder
-  for (; i < length; ++i) {
-    to[i] = static_cast<uint8_t>(from[i]);
-  }
-}
-#else
-inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
-  if (length == 0) {
-    return 0; // Return 0 for empty arrays
-  }
-  uint16_t max_val = arr[0];
-  for (size_t i = 1; i < length; i++) {
-    if (arr[i] > max_val) {
-      max_val = arr[i];
-    }
-  }
-  return max_val;
-}
-
-inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
-  // Fallback for systems without SSE2/NEON
-  for (size_t i = 0; i < length; ++i) {
-    to[i] = static_cast<uint8_t>(from[i]);
-  }
-}
-#endif
+void copyArray(const uint16_t *from, uint8_t *to, size_t length);
 } // namespace fury

From 2ebfbc88c4d8d9b76c2a88528670d9232dcf934d Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:36:30 +0800
Subject: [PATCH 14/23] include <stdlib.h>

---
 cpp/fury/util/array_util.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/fury/util/array_util.h b/cpp/fury/util/array_util.h
index e7d89f275d..45eb0d33c7 100644
--- a/cpp/fury/util/array_util.h
+++ b/cpp/fury/util/array_util.h
@@ -20,6 +20,7 @@
 #pragma once
 #include "fury/util/platform.h"
 #include <cstdint>
+#include <stdlib.h>
 
 namespace fury {
 uint16_t getMaxValue(const uint16_t *arr, size_t length);

From ad2f28acc0d633e1c44f7a6277f0db9dad2601cc Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:39:28 +0800
Subject: [PATCH 15/23] fix include

---
 cpp/fury/util/array_util.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/fury/util/array_util.cc b/cpp/fury/util/array_util.cc
index 65a4cd862f..1182e5c19a 100644
--- a/cpp/fury/util/array_util.cc
+++ b/cpp/fury/util/array_util.cc
@@ -17,9 +17,7 @@
  * under the License.
  */
 
-#pragma once
-#include "fury/util/platform.h"
-#include <cstdint>
+#include "fury/util/array_util.h"
 
 namespace fury {
 #if defined(FURY_HAS_NEON)

From ea206d99876335ffa629601afd51401d1d995cfd Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:44:49 +0800
Subject: [PATCH 16/23] add #pragma once

---
 cpp/fury/python/pyunicode.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 76b96751ba..84880bad24 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#pragma once
+
 #include "fury/util/array_util.h"
 #include "fury/util/buffer.h"
 #include "fury/util/logging.h"

From d2627fb5f1f2b01a2e0d09549f5fd61e8491b995 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:46:22 +0800
Subject: [PATCH 17/23] fix include

---
 cpp/fury/python/pyunicode.cc | 5 +++++
 cpp/fury/python/pyunicode.h  | 6 ------
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index a3454f02a9..2d9c2bceda 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -18,6 +18,11 @@
  */
 
 #include "pyunicode.h"
+#include "fury/util/array_util.h"
+#include "fury/util/buffer.h"
+#include "fury/util/logging.h"
+#include "fury/util/string_util.h"
+#include "unicodeobject.h"
 #include <cassert>
 
 namespace fury {
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 84880bad24..d96239a942 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -19,19 +19,13 @@
 
 #pragma once
 
-#include "fury/util/array_util.h"
-#include "fury/util/buffer.h"
-#include "fury/util/logging.h"
-#include "fury/util/string_util.h"
 #include "object.h"
 #include "pyport.h"
-#include "unicodeobject.h"
 #include <cstring>
 #include <string>
 
 namespace fury {
 
-// unicodeobject.c
 PyObject *Fury_PyUnicode_FromUCS1(const char *u, Py_ssize_t size);
 
 PyObject *Fury_PyUnicode_FromUCS2(const uint16_t *u, Py_ssize_t size);

From 28aaf2cecd4ff786a2912cb6732d1fe16c545278 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:48:20 +0800
Subject: [PATCH 18/23] fix include

---
 cpp/fury/python/pyunicode.cc | 2 ++
 cpp/fury/python/pyunicode.h  | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index 2d9c2bceda..df5ef73198 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -24,6 +24,8 @@
 #include "fury/util/string_util.h"
 #include "unicodeobject.h"
 #include <cassert>
+#include <cstring>
+#include <string>
 
 namespace fury {
 
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index d96239a942..25504b7f5a 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -21,8 +21,7 @@
 
 #include "object.h"
 #include "pyport.h"
-#include <cstring>
-#include <string>
+#include <cstdint>
 
 namespace fury {
 

From e326271dc0a0add5e1197c4047ffa38dff416dff Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 22:59:43 +0800
Subject: [PATCH 19/23] fix include

---
 cpp/fury/python/pyunicode.cc | 4 ----
 cpp/fury/python/pyunicode.h  | 8 +++++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index df5ef73198..5de937d1bb 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -19,13 +19,9 @@
 
 #include "pyunicode.h"
 #include "fury/util/array_util.h"
-#include "fury/util/buffer.h"
 #include "fury/util/logging.h"
 #include "fury/util/string_util.h"
-#include "unicodeobject.h"
 #include <cassert>
-#include <cstring>
-#include <string>
 
 namespace fury {
 
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 25504b7f5a..d474915d81 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -18,10 +18,12 @@
  */
 
 #pragma once
-
-#include "object.h"
-#include "pyport.h"
+#include <string>
+#include <cstring>
 #include <cstdint>
+#include "pyport.h"
+#include "object.h"
+#include "unicodeobject.h"
 
 namespace fury {
 

From 1ef388cfc68048a8b338e3f00e23620f447ee885 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 23:14:04 +0800
Subject: [PATCH 20/23] add Python.h include

---
 cpp/fury/python/pyunicode.h      | 1 +
 cpp/fury/util/array_util_test.cc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index d474915d81..2237ee0b36 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <cstring>
 #include <cstdint>
+#include "Python.h"
 #include "pyport.h"
 #include "object.h"
 #include "unicodeobject.h"
diff --git a/cpp/fury/util/array_util_test.cc b/cpp/fury/util/array_util_test.cc
index 669bb96223..cc27770362 100644
--- a/cpp/fury/util/array_util_test.cc
+++ b/cpp/fury/util/array_util_test.cc
@@ -22,7 +22,7 @@
 
 namespace fury {
 TEST(GetMaxValueTest, HandlesEmptyArray) {
-  uint16_t arr[] = {};
+  uint16_t* arr = nullptr;
   EXPECT_EQ(getMaxValue(arr, 0), 0);
 }
 

From d4837ffbd03d9b0f55be51be4cf51dc2e8d0ed13 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 23:15:00 +0800
Subject: [PATCH 21/23] lint code

---
 cpp/fury/python/pyunicode.h      | 8 ++++----
 cpp/fury/util/array_util_test.cc | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 2237ee0b36..3308dce00b 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -18,13 +18,13 @@
  */
 
 #pragma once
-#include <string>
-#include <cstring>
-#include <cstdint>
 #include "Python.h"
-#include "pyport.h"
 #include "object.h"
+#include "pyport.h"
 #include "unicodeobject.h"
+#include <cstdint>
+#include <cstring>
+#include <string>
 
 namespace fury {
 
diff --git a/cpp/fury/util/array_util_test.cc b/cpp/fury/util/array_util_test.cc
index cc27770362..eb9eebd7f5 100644
--- a/cpp/fury/util/array_util_test.cc
+++ b/cpp/fury/util/array_util_test.cc
@@ -22,7 +22,7 @@
 
 namespace fury {
 TEST(GetMaxValueTest, HandlesEmptyArray) {
-  uint16_t* arr = nullptr;
+  uint16_t *arr = nullptr;
   EXPECT_EQ(getMaxValue(arr, 0), 0);
 }
 

From 8fe4de7895873bb2712173669579db4d0635f61e Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 23:19:01 +0800
Subject: [PATCH 22/23] move unicodeobject.h include from pyunicode.h to pyunicode.cc

---
 cpp/fury/python/pyunicode.cc | 1 +
 cpp/fury/python/pyunicode.h  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index 5de937d1bb..bbb5b35dc1 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -21,6 +21,7 @@
 #include "fury/util/array_util.h"
 #include "fury/util/logging.h"
 #include "fury/util/string_util.h"
+#include "unicodeobject.h"
 #include <cassert>
 
 namespace fury {
diff --git a/cpp/fury/python/pyunicode.h b/cpp/fury/python/pyunicode.h
index 3308dce00b..0f4ddeb793 100644
--- a/cpp/fury/python/pyunicode.h
+++ b/cpp/fury/python/pyunicode.h
@@ -21,7 +21,6 @@
 #include "Python.h"
 #include "object.h"
 #include "pyport.h"
-#include "unicodeobject.h"
 #include <cstdint>
 #include <cstring>
 #include <string>

From a940ba3574c866a493d719f9926b08c989a91596 Mon Sep 17 00:00:00 2001
From: chaokunyang <shawn.ck.yang@gmail.com>
Date: Wed, 15 Jan 2025 23:22:48 +0800
Subject: [PATCH 23/23] remove commented-out _PyUnicode_CheckConsistency asserts

---
 cpp/fury/python/pyunicode.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/fury/python/pyunicode.cc b/cpp/fury/python/pyunicode.cc
index bbb5b35dc1..3c50e00b98 100644
--- a/cpp/fury/python/pyunicode.cc
+++ b/cpp/fury/python/pyunicode.cc
@@ -35,7 +35,6 @@ static PyObject *get_latin1_char(unsigned char ch) {
     if (!unicode)
       return NULL;
     PyUnicode_1BYTE_DATA(unicode)[0] = ch;
-    // assert(_PyUnicode_CheckConsistency(unicode, 1));
     unicode_latin1[ch] = unicode;
   }
   Py_INCREF(unicode);
@@ -53,7 +52,6 @@ PyObject *Fury_PyUnicode_FromUCS1(const char *u, Py_ssize_t size) {
   if (!res)
     return NULL;
   memcpy(PyUnicode_1BYTE_DATA(res), u, size);
-  // assert(_PyUnicode_CheckConsistency(res, 1));
   return res;
 }
 
@@ -89,7 +87,6 @@ PyObject *Fury_PyUnicode_FromUCS2(const uint16_t *u, Py_ssize_t size) {
   } else {
     copyArray(u, PyUnicode_1BYTE_DATA(res), size);
   }
-  // assert(_PyUnicode_CheckConsistency(res, 1));
   return res;
 }
 } // namespace fury