From e2ea9f0b17d0f820a656abefc4db1bbe6d78f49f Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 1 Apr 2025 22:58:10 +0200 Subject: [PATCH 1/6] ENH, SIMD: Initial implementation of Highway wrapper A thin wrapper over Google's Highway SIMD library to simplify its interface. This commit provides the implementation of that wrapper, consisting of: - simd.hpp: Main header defining the SIMD namespaces and configuration - simd.inc.hpp: Template header included multiple times with different namespaces The wrapper eliminates Highway's class tags by: - Using lane types directly which can be deduced from arguments - Leveraging namespaces (np::simd and np::simd128) for different register widths A README is included to guide usage and document design decisions. --- numpy/_core/src/common/simd/README.md | 258 +++++++++++++++++++++++ numpy/_core/src/common/simd/simd.hpp | 80 +++++++ numpy/_core/src/common/simd/simd.inc.hpp | 102 +++++++++ 3 files changed, 440 insertions(+) create mode 100644 numpy/_core/src/common/simd/README.md create mode 100644 numpy/_core/src/common/simd/simd.hpp create mode 100644 numpy/_core/src/common/simd/simd.inc.hpp diff --git a/numpy/_core/src/common/simd/README.md b/numpy/_core/src/common/simd/README.md new file mode 100644 index 000000000000..ac58c4a6bd94 --- /dev/null +++ b/numpy/_core/src/common/simd/README.md @@ -0,0 +1,258 @@ +# NumPy SIMD Wrapper for Highway + +This directory contains a lightweight C++ wrapper over Google's [Highway](https://github.com/google/highway) SIMD library, designed specifically for NumPy's needs. + +> **Note**: This directory also contains the C interface of universal intrinsics (under `simd.h`) which is no longer supported. The Highway wrapper described in this document should be used instead for all new SIMD code. + +## Overview + +The wrapper simplifies Highway's SIMD interface by eliminating class tags and using lane types directly, which can be deduced from arguments in most cases. This design makes the SIMD code more intuitive and easier to maintain while still leveraging Highway generic intrinsics. + +## Architecture + +The wrapper consists of two main headers: + +1. `simd.hpp`: The main header that defines namespaces and includes configuration macros +2. `simd.inc.hpp`: Implementation details included by `simd.hpp` multiple times for different namespaces + +Additionally, this directory contains legacy C interface files for universal intrinsics (`simd.h` and related files) which are deprecated and should not be used for new code. All new SIMD code should use the Highway wrapper. + + +## Usage + +### Basic Usage + +```cpp +#include "simd/simd.hpp" + +// Use np::simd for maximum width SIMD operations +using namespace np::simd; +float *data = /* ... 
*/; +Vec v = LoadU(data); +v = Add(v, v); +StoreU(v, data); + +// Use np::simd128 for fixed 128-bit SIMD operations +using namespace np::simd128; +Vec v128 = LoadU(data); +v128 = Add(v128, v128); +StoreU(v128, data); +``` + +### Checking for SIMD Support + +```cpp +#include "simd/simd.hpp" + +// Check if SIMD is enabled +#if NPY_SIMDX + // SIMD code +#else + // Scalar fallback code +#endif + +// Check for float64 support +#if NPY_SIMDX_F64 + // Use float64 SIMD operations +#endif + +// Check for FMA support +#if NPY_SIMDX_FMA + // Use FMA operations +#endif +``` + +## Type Support and Constraints + +The wrapper provides type constraints to help with SFINAE (Substitution Failure Is Not An Error) and compile-time type checking: + +- `kSupportLane`: Determines whether the specified lane type is supported by the SIMD extension. + ```cpp + // Base template - always defined, even when SIMD is not enabled (for SFINAE) + template + constexpr bool kSupportLane = NPY_SIMDX != 0; + template <> + constexpr bool kSupportLane = NPY_SIMDX_F64 != 0; + ``` + + +```cpp +#include "simd/simd.hpp" + +// Check if float64 operations are supported +if constexpr (np::simd::kSupportLane) { + // Use float64 operations +} +``` + +These constraints allow for compile-time checking of which lane types are supported, which can be used in SFINAE contexts to enable or disable functions based on type support. + +## Available Operations + +The wrapper provides the following common operations that are used in NumPy: + +- Vector creation operations: + - `Zero`: Returns a vector with all lanes set to zero + - `Set`: Returns a vector with all lanes set to the given value + - `Undefined`: Returns an uninitialized vector + +- Memory operations: + - `LoadU`: Unaligned load of a vector from memory + - `StoreU`: Unaligned store of a vector to memory + +- Vector information: + - `Lanes`: Returns the number of vector lanes based on the lane type + +- Type conversion: + - `BitCast`: Reinterprets a vector to a different type without modifying the underlying data + - `VecFromMask`: Converts a mask to a vector + +- Comparison operations: + - `Eq`: Element-wise equality comparison + - `Le`: Element-wise less than or equal comparison + - `Lt`: Element-wise less than comparison + - `Gt`: Element-wise greater than comparison + - `Ge`: Element-wise greater than or equal comparison + +- Arithmetic operations: + - `Add`: Element-wise addition + - `Sub`: Element-wise subtraction + - `Mul`: Element-wise multiplication + - `Div`: Element-wise division + - `Min`: Element-wise minimum + - `Max`: Element-wise maximum + - `Abs`: Element-wise absolute value + - `Sqrt`: Element-wise square root + +- Logical operations: + - `And`: Bitwise AND + - `Or`: Bitwise OR + - `Xor`: Bitwise XOR + - `AndNot`: Bitwise AND NOT (a & ~b) + +Additional Highway operations can be accessed via the `hn` namespace alias inside the `simd` or `simd128` namespaces. + +## Extending + +To add more operations from Highway: + +1. Import them in the `simd.inc.hpp` file using the `using` directive if they don't require a tag: + ```cpp + // For operations that don't require a tag + using hn::FunctionName; + ``` + +2. Define wrapper functions for intrinsics that require a class tag: + ```cpp + // For operations that require a tag + template + HWY_API ReturnType FunctionName(Args... args) { + return hn::FunctionName(_Tag(), args...); + } + ``` + +3. 
Add appropriate documentation and SFINAE constraints if needed + + +## Build Configuration + +The SIMD wrapper automatically disables SIMD operations when optimizations are disabled: + +- When `NPY_DISABLE_OPTIMIZATION` is defined, SIMD operations are disabled +- SIMD is enabled only when the Highway target is not scalar (`HWY_TARGET != HWY_SCALAR`) + +## Design Notes + +1. **Why avoid Highway scalar operations?** + - NumPy already provides kernels for scalar operations + - Compilers can better optimize standard library implementations + - Not all Highway intrinsics are fully supported in scalar mode + +2. **Legacy Universal Intrinsics** + - The older universal intrinsics C interface (in `simd.h` and accessible via `NPY_SIMD` macros) is deprecated + - All new SIMD code should use this Highway-based wrapper (accessible via `NPY_SIMDX` macros) + - The legacy code is maintained for compatibility but will eventually be removed + +3. **Feature Detection Constants vs. Highway Constants** + - NumPy-specific constants (`NPY_SIMDX_F16`, `NPY_SIMDX_F64`, `NPY_SIMDX_FMA`) provide additional safety beyond raw Highway constants + - Highway constants (e.g., `HWY_HAVE_FLOAT16`) only check hardware capabilities but don't consider NumPy's build configuration + - Our constants combine both checks: + ```cpp + #define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) + ``` + - This ensures SIMD features won't be used when: + - Hardware supports it but NumPy optimization is disabled via meson option: + ``` + option('disable-optimization', type: 'boolean', value: false, + description: 'Disable CPU optimized code (dispatch,simd,unroll...)') + ``` + - Highway target is scalar (`HWY_TARGET == HWY_SCALAR`) + - Using these constants ensures consistent behavior across different compilation settings + - Without this additional layer, code might incorrectly try to use SIMD paths in scalar mode + +4. **Namespace Design** + - `np::simd`: Maximum width SIMD operations (scalable) + - `np::simd128`: Fixed 128-bit SIMD operations + - `hn`: Highway namespace alias (available within the SIMD namespaces) + +5. **Why Namespaces and Why Not Just Use Highway Directly?** + - Highway's design uses class tag types as template parameters (e.g., `Vec>`) when defining vector types + - Many Highway functions require explicitly passing a tag instance as the first parameter + - This class tag-based approach increases verbosity and complexity in user code + - Our wrapper eliminates this by internally managing tags through namespaces, letting users directly use types e.g. `Vec` + - Simple example with raw Highway: + ```cpp + // Highway's approach + float *data = /* ... */; + + namespace hn = hwy::HWY_NAMESPACE; + using namespace hn; + + // Full-width operations + ScalableTag df; // Create a tag instance + Vec v = LoadU(df, data); // LoadU requires a tag instance + StoreU(v, df, data); // StoreU requires a tag instance + + // 128-bit operations + Full128 df128; // Create a 128-bit tag instance + Vec v128 = LoadU(df128, data); // LoadU requires a tag instance + StoreU(v128, df128, data); // StoreU requires a tag instance + ``` + + - Simple example with our wrapper: + ```cpp + // Our wrapper approach + float *data = /* ... 
*/; + + // Full-width operations + using namespace np::simd; + Vec v = LoadU(data); // Full-width vector load + StoreU(v, data); + + // 128-bit operations + using namespace np::simd128; + Vec v128 = LoadU(data); // 128-bit vector load + StoreU(v128, data); + ``` + + - The namespaced approach simplifies code, reduces errors, and provides a more intuitive interface + - It preserves all Highway operations benefits while reducing cognitive overhead + +5. **Why Namespaces Are Essential for This Design?** + - Namespaces allow us to define different internal tag types (`hn::ScalableTag` in `np::simd` vs `hn::Full128` in `np::simd128`) + - This provides a consistent type-based interface (`Vec`) without requiring users to manually create tags + - Enables using the same function names (like `LoadU`) with different implementations based on SIMD width + - Without namespaces, we'd have to either reintroduce tags (defeating the purpose of the wrapper) or create different function names for each variant (e.g., `LoadU` vs `LoadU128`) + +6. **Template Type Parameters** + - `TLane`: The scalar type for each vector lane (e.g., uint8_t, float, double) + + +## Requirements + +- C++17 or later +- Google Highway library + +## License + +Same as NumPy's license diff --git a/numpy/_core/src/common/simd/simd.hpp b/numpy/_core/src/common/simd/simd.hpp new file mode 100644 index 000000000000..698da4adf865 --- /dev/null +++ b/numpy/_core/src/common/simd/simd.hpp @@ -0,0 +1,80 @@ +#ifndef NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ +#define NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ + +/** + * This header provides a thin wrapper over Google's Highway SIMD library. + * + * The wrapper aims to simplify the SIMD interface of Google's Highway by + * get ride of its class tags and use lane types directly which can be deduced + * from the args in most cases. + */ +/** + * Since `NPY_SIMD` is only limited to NumPy C universal intrinsics, + * `NPY_SIMDX` is defined to indicate the SIMD availability for Google's Highway + * C++ code. + * + * Highway SIMD is only available when optimization is enabled. + * When NPY_DISABLE_OPTIMIZATION is defined, SIMD operations are disabled + * and the code falls back to scalar implementations. + */ +#ifndef NPY_DISABLE_OPTIMIZATION +#include + +/** + * We avoid using Highway scalar operations for the following reasons: + * 1. We already provide kernels for scalar operations, so falling back to + * the NumPy implementation is more appropriate. Compilers can often + * optimize these better since they rely on standard libraries. + * 2. Not all Highway intrinsics are fully supported in scalar mode. + * + * Therefore, we only enable SIMD when the Highway target is not scalar. + */ +#define NPY_SIMDX (HWY_TARGET != HWY_SCALAR) + +// Indicates if the SIMD operations are available for float16. +#define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) +// Note: Highway requires SIMD extentions with native float32 support, so we don't need +// to check for it. + +// Indicates if the SIMD operations are available for float64. +#define NPY_SIMDX_F64 (NPY_SIMDX && HWY_HAVE_FLOAT64) + +// Indicates if the SIMD floating operations are natively supports fma. +#define NPY_SIMDX_FMA (NPY_SIMDX && HWY_NATIVE_FMA) + +#else +#define NPY_SIMDX 0 +#define NPY_SIMDX_F16 0 +#define NPY_SIMDX_F64 0 +#define NPY_SIMDX_FMA 0 +#endif + +namespace np { + +/// Represents the max SIMD width supported by the platform. +namespace simd { +#if NPY_SIMDX +/// The highway namespace alias. 
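+/// Extra Highway operations that are not wrapped by this header can still be reached through it.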
+/// We can not import all the symbols from the HWY_NAMESPACE because it will +/// conflict with the existing symbols in the numpy namespace. +namespace hn = hwy::HWY_NAMESPACE; +// internaly used by the template header +template +using _Tag = hn::ScalableTag; +#endif +#include "simd.inc.hpp" +} // namespace simd + +/// Represents the 128-bit SIMD width. +namespace simd128 { +#if NPY_SIMDX +namespace hn = hwy::HWY_NAMESPACE; +template +using _Tag = hn::Full128; +#endif +#include "simd.inc.hpp" +} // namespace simd128 + +} // namespace np + +#endif // NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ diff --git a/numpy/_core/src/common/simd/simd.inc.hpp b/numpy/_core/src/common/simd/simd.inc.hpp new file mode 100644 index 000000000000..d052d829720a --- /dev/null +++ b/numpy/_core/src/common/simd/simd.inc.hpp @@ -0,0 +1,102 @@ +#ifndef NPY_SIMDX +#error "This is not a standalone header. Include simd.hpp instead." +#endif + +// NOTE: This file is included by simd.hpp multiple times with different namespaces +// so avoid including any headers here +// #define NPY_SIMDX 1 // uncomment to enable Highlighting + +/** + * Determines whether the specified lane type is supported by the SIMD extension. + * Always defined as false when SIMD is not enabled, so it can be used in SFINAE. + * + * @tparam TLane The lane type to check for support. + */ +template +constexpr bool kSupportLane = NPY_SIMDX != 0; + +#if NPY_SIMDX +// Define lane type support based on Highway capabilities +template <> +constexpr bool kSupportLane = HWY_HAVE_FLOAT16 != 0; +template <> +constexpr bool kSupportLane = HWY_HAVE_FLOAT64 != 0; + +/// Represents an N-lane vector based on the specified lane type. +/// @tparam TLane The scalar type for each vector lane +template +using Vec = hn::Vec<_Tag>; + +/// Represents a mask vector with boolean values or as a bitmask. +/// @tparam TLane The scalar type the mask corresponds to +template +using Mask = hn::Mask<_Tag>; + +/// Unaligned load of a vector from memory. +template +HWY_API Vec LoadU(const TLane* ptr) { + return hn::LoadU(_Tag(), ptr); +} + +/// Unaligned store of a vector to memory. +template +HWY_API void StoreU(const Vec& a, TLane* ptr) { + hn::StoreU(a, _Tag(), ptr); +} + +/// Returns the number of vector lanes based on the lane type. +template +HWY_API constexpr size_t Lanes(TLane tag = 0) { + return hn::Lanes(_Tag()); +} + +/// Returns an uninitialized N-lane vector. +template +HWY_API Vec Undefined(TLane tag = 0) { + return hn::Undefined(_Tag()); +} + +/// Returns N-lane vector with all lanes equal to zero. +template +HWY_API Vec Zero(TLane tag = 0) { + return hn::Zero(_Tag()); +} + +/// Returns N-lane vector with all lanes equal to the given value of type `TLane`. +template +HWY_API Vec Set(TLane val) { + return hn::Set(_Tag(), val); +} + +/// Converts a mask to a vector based on the specified lane type. +template +HWY_API Vec VecFromMask(const TMask &m) { + return hn::VecFromMask(_Tag(), m); +} + +/// Convert (Reinterpret) an N-lane vector to a different type without modifying the underlying data. 
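+/// For example, BitCast<uint32_t>(v) reinterprets a Vec<float> as a Vec<uint32_t> over the same bits.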
+template +HWY_API Vec BitCast(const TVec &v) { + return hn::BitCast(_Tag(), v); +} + +// Import common Highway intrinsics +using hn::Eq; +using hn::Le; +using hn::Lt; +using hn::Gt; +using hn::Ge; +using hn::And; +using hn::Or; +using hn::Xor; +using hn::AndNot; +using hn::Sub; +using hn::Add; +using hn::Mul; +using hn::Div; +using hn::Min; +using hn::Max; +using hn::Abs; +using hn::Sqrt; + +#endif // NPY_SIMDX From 4a41c6ec02dd60d646d5e1710f12ead183db5c09 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Sat, 12 Apr 2025 19:25:27 +0200 Subject: [PATCH 2/6] SIMD: Update wrapper with improved docs and type support - Fix hardware/platform terminology in documentation for clarity - Add support for long double in template specializations - Add kMaxLanes constant to expose maximum vector width information - Follows clang formatting style for consistency with NumPy codebase. --- numpy/_core/src/common/simd/README.md | 9 ++- numpy/_core/src/common/simd/simd.inc.hpp | 70 ++++++++++++++++-------- 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/numpy/_core/src/common/simd/README.md b/numpy/_core/src/common/simd/README.md index ac58c4a6bd94..9a68d1aa1bfc 100644 --- a/numpy/_core/src/common/simd/README.md +++ b/numpy/_core/src/common/simd/README.md @@ -75,6 +75,11 @@ The wrapper provides type constraints to help with SFINAE (Substitution Failure constexpr bool kSupportLane = NPY_SIMDX_F64 != 0; ``` +- `kMaxLanes`: Maximum number of lanes supported by the SIMD extension for the specified lane type. + ```cpp + template + constexpr size_t kMaxLanes = HWY_MAX_LANES_D(_Tag); + ``` ```cpp #include "simd/simd.hpp" @@ -175,13 +180,13 @@ The SIMD wrapper automatically disables SIMD operations when optimizations are d 3. **Feature Detection Constants vs. Highway Constants** - NumPy-specific constants (`NPY_SIMDX_F16`, `NPY_SIMDX_F64`, `NPY_SIMDX_FMA`) provide additional safety beyond raw Highway constants - - Highway constants (e.g., `HWY_HAVE_FLOAT16`) only check hardware capabilities but don't consider NumPy's build configuration + - Highway constants (e.g., `HWY_HAVE_FLOAT16`) only check platform capabilities but don't consider NumPy's build configuration - Our constants combine both checks: ```cpp #define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) ``` - This ensures SIMD features won't be used when: - - Hardware supports it but NumPy optimization is disabled via meson option: + - Platform supports it but NumPy optimization is disabled via meson option: ``` option('disable-optimization', type: 'boolean', value: false, description: 'Disable CPU optimized code (dispatch,simd,unroll...)') diff --git a/numpy/_core/src/common/simd/simd.inc.hpp b/numpy/_core/src/common/simd/simd.inc.hpp index d052d829720a..6d12c9a3caeb 100644 --- a/numpy/_core/src/common/simd/simd.inc.hpp +++ b/numpy/_core/src/common/simd/simd.inc.hpp @@ -1,10 +1,10 @@ #ifndef NPY_SIMDX #error "This is not a standalone header. Include simd.hpp instead." +#define NPY_SIMDX 1 // Prevent editors from graying out the happy branch #endif // NOTE: This file is included by simd.hpp multiple times with different namespaces // so avoid including any headers here -// #define NPY_SIMDX 1 // uncomment to enable Highlighting /** * Determines whether the specified lane type is supported by the SIMD extension. 
@@ -21,6 +21,13 @@ template <> constexpr bool kSupportLane = HWY_HAVE_FLOAT16 != 0; template <> constexpr bool kSupportLane = HWY_HAVE_FLOAT64 != 0; +template <> +constexpr bool kSupportLane = + HWY_HAVE_FLOAT64 != 0 && sizeof(long double) == sizeof(double); + +/// Maximum number of lanes supported by the SIMD extension for the specified lane type. +template +constexpr size_t kMaxLanes = HWY_MAX_LANES_D(_Tag); /// Represents an N-lane vector based on the specified lane type. /// @tparam TLane The scalar type for each vector lane @@ -34,69 +41,86 @@ using Mask = hn::Mask<_Tag>; /// Unaligned load of a vector from memory. template -HWY_API Vec LoadU(const TLane* ptr) { +HWY_API Vec +LoadU(const TLane *ptr) +{ return hn::LoadU(_Tag(), ptr); } /// Unaligned store of a vector to memory. template -HWY_API void StoreU(const Vec& a, TLane* ptr) { +HWY_API void +StoreU(const Vec &a, TLane *ptr) +{ hn::StoreU(a, _Tag(), ptr); } /// Returns the number of vector lanes based on the lane type. template -HWY_API constexpr size_t Lanes(TLane tag = 0) { +HWY_API constexpr size_t +Lanes(TLane tag = 0) +{ return hn::Lanes(_Tag()); } /// Returns an uninitialized N-lane vector. template -HWY_API Vec Undefined(TLane tag = 0) { +HWY_API Vec +Undefined(TLane tag = 0) +{ return hn::Undefined(_Tag()); } /// Returns N-lane vector with all lanes equal to zero. template -HWY_API Vec Zero(TLane tag = 0) { +HWY_API Vec +Zero(TLane tag = 0) +{ return hn::Zero(_Tag()); } /// Returns N-lane vector with all lanes equal to the given value of type `TLane`. template -HWY_API Vec Set(TLane val) { +HWY_API Vec +Set(TLane val) +{ return hn::Set(_Tag(), val); } /// Converts a mask to a vector based on the specified lane type. template -HWY_API Vec VecFromMask(const TMask &m) { +HWY_API Vec +VecFromMask(const TMask &m) +{ return hn::VecFromMask(_Tag(), m); } -/// Convert (Reinterpret) an N-lane vector to a different type without modifying the underlying data. +/// Convert (Reinterpret) an N-lane vector to a different type without modifying the +/// underlying data. 
template -HWY_API Vec BitCast(const TVec &v) { +HWY_API Vec +BitCast(const TVec &v) +{ return hn::BitCast(_Tag(), v); } // Import common Highway intrinsics -using hn::Eq; -using hn::Le; -using hn::Lt; -using hn::Gt; -using hn::Ge; +using hn::Abs; +using hn::Add; using hn::And; -using hn::Or; -using hn::Xor; using hn::AndNot; -using hn::Sub; -using hn::Add; -using hn::Mul; using hn::Div; -using hn::Min; +using hn::Eq; +using hn::Ge; +using hn::Gt; +using hn::Le; +using hn::Lt; using hn::Max; -using hn::Abs; +using hn::Min; +using hn::Mul; +using hn::Or; using hn::Sqrt; +using hn::Sub; +using hn::Xor; -#endif // NPY_SIMDX +#endif // NPY_SIMDX From 588de397fe4724096b4106ac64a9ec6f6eea2a95 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Thu, 17 Apr 2025 04:37:59 +0200 Subject: [PATCH 3/6] SIMD: Improve isolation and constexpr handling in wrapper - Add anonymous namespace around implementation to ensure each translation unit gets its own constants based on local flags - Use HWY_LANES_CONSTEXPR for Lanes function to ensure proper constexpr evaluation across platforms --- numpy/_core/src/common/simd/simd.inc.hpp | 8 +++++++- numpy/_core/src/umath/loops_trigonometric.dispatch.cpp | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/numpy/_core/src/common/simd/simd.inc.hpp b/numpy/_core/src/common/simd/simd.inc.hpp index 6d12c9a3caeb..64d28bc47118 100644 --- a/numpy/_core/src/common/simd/simd.inc.hpp +++ b/numpy/_core/src/common/simd/simd.inc.hpp @@ -3,6 +3,10 @@ #define NPY_SIMDX 1 // Prevent editors from graying out the happy branch #endif +// Using anonymous namespace instead of inline to ensure each translation unit +// gets its own copy of constants based on local compilation flags +namespace { + // NOTE: This file is included by simd.hpp multiple times with different namespaces // so avoid including any headers here @@ -57,7 +61,7 @@ StoreU(const Vec &a, TLane *ptr) /// Returns the number of vector lanes based on the lane type. template -HWY_API constexpr size_t +HWY_API HWY_LANES_CONSTEXPR size_t Lanes(TLane tag = 0) { return hn::Lanes(_Tag()); @@ -124,3 +128,5 @@ using hn::Sub; using hn::Xor; #endif // NPY_SIMDX + +} // namespace anonymous diff --git a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp index ae696db4cd4a..9ce2571a1528 100644 --- a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp +++ b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp @@ -3,7 +3,9 @@ #include "loops_utils.h" #include "simd/simd.h" +#include "simd/simd.hpp" #include + namespace hn = hwy::HWY_NAMESPACE; /* From d9e60ccf1029f9d50e3a9195d06a64a6bd919edb Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Thu, 17 Apr 2025 16:13:20 +0200 Subject: [PATCH 4/6] Update Highway submodule to latest master --- numpy/_core/src/highway | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/highway b/numpy/_core/src/highway index 0b696633f9ad..12b325bc1793 160000 --- a/numpy/_core/src/highway +++ b/numpy/_core/src/highway @@ -1 +1 @@ -Subproject commit 0b696633f9ad89497dd5532b55eaa01625ad71ca +Subproject commit 12b325bc1793dee68ab2157995a690db859fe9e0 From 7127a0c1bdbe792101c62ddc8f7e0e4baa83c488 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Thu, 17 Apr 2025 16:27:30 +0200 Subject: [PATCH 5/6] SIMD: Fix compile error by using MaxLanes instead of Lanes for array size Replace hn::Lanes(f64) with hn::MaxLanes(f64) when defining the index array size to fix error C2131: "expression did not evaluate to a constant". 
This error occurs because Lanes() isn't always constexpr compatible, especially with scalable vector extensions. MaxLanes() provides a compile-time constant value suitable for static array allocation and should be used with non-scalable SIMD extensions when defining fixed-size arrays. --- numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src | 2 +- numpy/_core/src/umath/loops_trigonometric.dispatch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src index 8c66229942ee..93d288fbdb2e 100755 --- a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src +++ b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src @@ -385,7 +385,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ vec_f64 b, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16; if constexpr(hn::MaxLanes(f64) == 2){ vec_f64 e0e1_0, e0e1_1; - uint64_t index[hn::Lanes(f64)]; + uint64_t index[hn::MaxLanes(f64)]; hn::StoreU(idx, u64, index); /**begin repeat diff --git a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp index 9ce2571a1528..d298a8596cc4 100644 --- a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp +++ b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp @@ -186,7 +186,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, "larger than 256 bits."); simd_maski = ((uint8_t *)&simd_maski)[0]; #endif - float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[hn::Lanes(f32)]; + float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[hn::MaxLanes(f32)]; hn::Store(x_in, f32, ip_fback); // process elements using libc for large elements From 7be22c7da63173c7be71b2107dd3e0c78f22adb2 Mon Sep 17 00:00:00 2001 From: amane-ame Date: Mon, 12 May 2025 19:15:55 +0800 Subject: [PATCH 6/6] Convert unary_fp_le to highway. --- numpy/_core/meson.build | 3 +- .../umath/loops_unary_fp_le.dispatch.c.src | 555 ------------------ .../src/umath/loops_unary_fp_le.dispatch.cpp | 292 +++++++++ 3 files changed, 294 insertions(+), 556 deletions(-) delete mode 100644 numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src create mode 100644 numpy/_core/src/umath/loops_unary_fp_le.dispatch.cpp diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 3bffed752474..f5f2e5e76a2f 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -1021,12 +1021,13 @@ foreach gen_mtargets : [ ], [ 'loops_unary_fp_le.dispatch.h', - src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'), + 'src/umath/loops_unary_fp_le.dispatch.cpp', [ SSE41, SSE2, VSX2, ASIMD, NEON, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src deleted file mode 100644 index ca02bc85608e..000000000000 --- a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src +++ /dev/null @@ -1,555 +0,0 @@ -/** - * Force use SSE only on x86, even if AVX2 or AVX512F are enabled - * through the baseline, since scatter(AVX512F) and gather very costly - * to handle non-contiguous memory access comparing with SSE for - * such small operations that this file covers. 
- */ -#define NPY_SIMD_FORCE_128 -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include -#include "numpy/npy_math.h" -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -/** - * This code should really be merged into loops_unary_fp.dispatch.c.src - * However there is an issue with enabling the code here for VX and VXE - * as the shifts don't behave as expected. - * See the code below that references NPY__CPU_TARGET_VX and - * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue. - * - * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src - * building for VX and VXE so we don't regress performance while adding this - * code for other platforms. - */ -// TODO(@seiko2plus): add support for big-endian -#if NPY_SIMD_BIGENDIAN - #undef NPY_SIMD - #undef NPY_SIMD_F32 - #undef NPY_SIMD_F64 - #define NPY_SIMD 0 - #define NPY_SIMD_F32 0 - #define NPY_SIMD_F64 0 -#endif - -/******************************************************************************* - ** extra SIMD intrinsics - ******************************************************************************/ - -#if NPY_SIMD - -/** - * We define intrinsics for isnan, isinf, isfinite, and signbit below. There's - * a few flavors of each. We'll use f32 as an example although f64 versions - * are also defined. - * - * npyv_u32 npyv_KIND_f32(npyv_f32 v) - * These are mainly used for the single vector loops. As such, result should - * be bool true / false, ready to write back. - * - * npyv_b32 _npyv_KIND_f32(npyv_f32 v) - * These are used by the geneal intrinsics above as well as the multi-vector - * packing intrinsics. The multi-vector packing intrinsics are the ones - * utilized in the unrolled vector loops. Results should be vector masks - * of 0x00/0xff. - * - * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) - * These are the multi-vector packing intrinsics utilized by unrolled vector - * loops. They perform the operation on all input vectors and pack the - * results to a single npyv_u8. Assuming NPY_SIMD == 128, that means we - * can pack results from 4x npyv_f32 or 8x npyv_64 in a single npyv_u8. - * Result should be bool true / false, ready to write back. 
- */ - -#if NPY_SIMD_F32 -NPY_FINLINE npyv_u32 -npyv_isnan_f32(npyv_f32 v) -{ - const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); - npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v))); - return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan)); -} -NPY_FINLINE npyv_u8 -npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) -{ - const npyv_u8 truemask = npyv_setall_u8(1==1); - npyv_b32 b0 = npyv_notnan_f32(v0); - npyv_b32 b1 = npyv_notnan_f32(v1); - npyv_b32 b2 = npyv_notnan_f32(v2); - npyv_b32 b3 = npyv_notnan_f32(v3); - npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3); - return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); -} -#endif -#if NPY_SIMD_F64 -NPY_FINLINE npyv_u64 -npyv_isnan_f64(npyv_f64 v) -{ - const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); - npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v))); - return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan)); -} -NPY_FINLINE npyv_u8 -npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) -{ - const npyv_u8 truemask = npyv_setall_u8(1==1); - npyv_b64 b0 = npyv_notnan_f64(v0); - npyv_b64 b1 = npyv_notnan_f64(v1); - npyv_b64 b2 = npyv_notnan_f64(v2); - npyv_b64 b3 = npyv_notnan_f64(v3); - npyv_b64 b4 = npyv_notnan_f64(v4); - npyv_b64 b5 = npyv_notnan_f64(v5); - npyv_b64 b6 = npyv_notnan_f64(v6); - npyv_b64 b7 = npyv_notnan_f64(v7); - npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); - return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); -} -#endif - -#if NPY_SIMD_F32 -NPY_FINLINE npyv_b32 -_npyv_isinf_f32(npyv_f32 v) -{ -#if defined(NPY_HAVE_NEON) - // abs(v) > FLT_MAX - const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX); - return vcagtq_f32(v, fltmax); -#else - // cast out the sign and check if all exponent bits are set. - const npyv_u32 exp_mask = npyv_setall_u32(0xff000000); - npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1); - return npyv_cmpeq_u32(bits, exp_mask); -#endif -} -NPY_FINLINE npyv_u32 -npyv_isinf_f32(npyv_f32 v) -{ - const npyv_u32 truemask = npyv_setall_u32(1==1); - return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v))); -} -NPY_FINLINE npyv_u8 -npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) -{ - const npyv_u8 truemask = npyv_setall_u8(1==1); - npyv_b32 b0 = _npyv_isinf_f32(v0); - npyv_b32 b1 = _npyv_isinf_f32(v1); - npyv_b32 b2 = _npyv_isinf_f32(v2); - npyv_b32 b3 = _npyv_isinf_f32(v3); - npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3); - return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); -} -#endif // NPY_SIMD_F32 -#if NPY_SIMD_F64 -NPY_FINLINE npyv_b64 -_npyv_isinf_f64(npyv_f64 v) -{ -#if defined(NPY_HAVE_NEON) - // abs(v) > DBL_MAX - const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX); - return vcagtq_f64(v, fltmax); -#else - // cast out the sign and check if all exponent bits are set. 
- const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000); - npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1); - return npyv_cmpeq_u64(bits, exp_mask); -#endif -} -NPY_FINLINE npyv_u64 -npyv_isinf_f64(npyv_f64 v) -{ - const npyv_u64 truemask = npyv_setall_u64(1==1); - return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v))); -} -NPY_FINLINE npyv_u8 -npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) -{ - const npyv_u8 truemask = npyv_setall_u8(1==1); - npyv_b64 b0 = _npyv_isinf_f64(v0); - npyv_b64 b1 = _npyv_isinf_f64(v1); - npyv_b64 b2 = _npyv_isinf_f64(v2); - npyv_b64 b3 = _npyv_isinf_f64(v3); - npyv_b64 b4 = _npyv_isinf_f64(v4); - npyv_b64 b5 = _npyv_isinf_f64(v5); - npyv_b64 b6 = _npyv_isinf_f64(v6); - npyv_b64 b7 = _npyv_isinf_f64(v7); - npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); - return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); -} -#endif // NPY_SIMD_F64 - - -#if NPY_SIMD_F32 -NPY_FINLINE npyv_b32 -npyv_notfinite_f32(npyv_f32 v) -{ - // cast out the sign and check if all exponent bits are set - // no matter the mentissa is. - const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000); - npyv_u32 bits = npyv_reinterpret_u32_f32(v); - bits = npyv_and_u32(bits, exp_mask); - return npyv_cmpeq_u32(bits, exp_mask); -} -NPY_FINLINE npyv_u32 -npyv_isfinite_f32(npyv_f32 v) -{ - const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); - npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v))); - return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite)); -} -NPY_FINLINE npyv_u8 -npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) -{ -#if defined(NPY_HAVE_NEON) && defined(__aarch64__) - // F32 exponent is 8-bits, which means we can pack multiple into - // a single vector. We shift out sign bit so that we're left - // with only exponent in high byte. If not all bits are set, - // then we've got a finite number. - uint8x16x4_t tbl; - tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1)); - tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1)); - tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1)); - tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1)); - - const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; - npyv_u8 r = vqtbl4q_u8(tbl, permute); - - const npyv_u8 expmask = npyv_setall_u8(0xff); - r = npyv_cmpneq_u8(r, expmask); - r = vshrq_n_u8(r, 7); - return r; -#else - const npyv_u8 truemask = npyv_setall_u8(1==1); - npyv_b32 b0 = npyv_notfinite_f32(v0); - npyv_b32 b1 = npyv_notfinite_f32(v1); - npyv_b32 b2 = npyv_notfinite_f32(v2); - npyv_b32 b3 = npyv_notfinite_f32(v3); - npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3); - return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); -#endif -} -#endif // NPY_SIMD_F32 -#if NPY_SIMD_F64 -NPY_FINLINE npyv_b64 -npyv_notfinite_f64(npyv_f64 v) -{ - // cast out the sign and check if all exponent bits are set - // no matter the mantissa is. 
- const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000); - npyv_u64 bits = npyv_reinterpret_u64_f64(v); - bits = npyv_and_u64(bits, exp_mask); - return npyv_cmpeq_u64(bits, exp_mask); -} -NPY_FINLINE npyv_u64 -npyv_isfinite_f64(npyv_f64 v) -{ - const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); - npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v))); - return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite)); -} -NPY_FINLINE npyv_u8 -npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) -{ -#if defined(NPY_HAVE_NEON) && defined(__aarch64__) - // F64 exponent is 11-bits, which means we can pack multiple into - // a single vector. We'll need to use u16 to fit all exponent - // bits. If not all bits are set, then we've got a finite number. - uint8x16x4_t t0123, t4567; - t0123.val[0] = npyv_reinterpret_u8_f64(v0); - t0123.val[1] = npyv_reinterpret_u8_f64(v1); - t0123.val[2] = npyv_reinterpret_u8_f64(v2); - t0123.val[3] = npyv_reinterpret_u8_f64(v3); - t4567.val[0] = npyv_reinterpret_u8_f64(v4); - t4567.val[1] = npyv_reinterpret_u8_f64(v5); - t4567.val[2] = npyv_reinterpret_u8_f64(v6); - t4567.val[3] = npyv_reinterpret_u8_f64(v7); - - const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; - npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); - npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); - - const npyv_u16 expmask = npyv_setall_u16(0x7ff0); - r0 = npyv_and_u16(r0, expmask); - r0 = npyv_cmpneq_u16(r0, expmask); - r0 = npyv_shri_u16(r0, 15); - r1 = npyv_and_u16(r1, expmask); - r1 = npyv_cmpneq_u16(r1, expmask); - r1 = npyv_shri_u16(r1, 15); - - npyv_u8 r = npyv_pack_b8_b16(r0, r1); - return r; -#else - const npyv_u8 truemask = npyv_setall_u8(1==1); - npyv_b64 b0 = npyv_notfinite_f64(v0); - npyv_b64 b1 = npyv_notfinite_f64(v1); - npyv_b64 b2 = npyv_notfinite_f64(v2); - npyv_b64 b3 = npyv_notfinite_f64(v3); - npyv_b64 b4 = npyv_notfinite_f64(v4); - npyv_b64 b5 = npyv_notfinite_f64(v5); - npyv_b64 b6 = npyv_notfinite_f64(v6); - npyv_b64 b7 = npyv_notfinite_f64(v7); - npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); - return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); -#endif -} -#endif // NPY_SIMD_F64 - -#if NPY_SIMD_F32 -NPY_FINLINE npyv_u32 -npyv_signbit_f32(npyv_f32 v) -{ - return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); -} -NPY_FINLINE npyv_u8 -npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) -{ -#if defined(NPY_HAVE_NEON) && defined(__aarch64__) - // We only need high byte for signbit, which means we can pack - // multiple inputs into a single vector. 
- uint8x16x4_t tbl; - tbl.val[0] = npyv_reinterpret_u8_f32(v0); - tbl.val[1] = npyv_reinterpret_u8_f32(v1); - tbl.val[2] = npyv_reinterpret_u8_f32(v2); - tbl.val[3] = npyv_reinterpret_u8_f32(v3); - - const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; - npyv_u8 r = vqtbl4q_u8(tbl, permute); - r = vshrq_n_u8(r, 7); - return r; -#else - npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0)); - npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1)); - npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2)); - npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3)); - npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3); - return npyv_cvt_u8_b8(signbit); -#endif -} -#endif // NPY_SIMD_F32 -#if NPY_SIMD_F64 -NPY_FINLINE npyv_u64 -npyv_signbit_f64(npyv_f64 v) -{ - return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1); -} -NPY_FINLINE npyv_u8 -npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) -{ -#if defined(NPY_HAVE_NEON) && defined(__aarch64__) - // We only need high byte for signbit, which means we can pack - // multiple inputs into a single vector. - - // vuzp2 faster than vtbl for f64 - npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); - npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3)); - npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5)); - npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7)); - - npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); - npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67)); - - npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); - r = vshrq_n_u8(r, 7); - return r; -#else - npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0)); - npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1)); - npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2)); - npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3)); - npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4)); - npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5)); - npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6)); - npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7)); - npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); - return npyv_cvt_u8_b8(signbit); -#endif -} -#endif // NPY_SIMD_F64 - -#endif // NPY_SIMD - -/******************************************************************************** - ** Defining the SIMD kernels - ********************************************************************************/ -/** Notes: - * - avoid the use of libmath to unify fp/domain errors - * for both scalars and vectors among all compilers/architectures. - * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_ - * to fill the remind lanes with 1.0 to avoid divide by zero fp - * exception in reciprocal. 
- */ -#define CONTIG 0 -#define NCONTIG 1 - -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# - * #ssfx = 32, 64# - */ -#if @VCHK@ -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - */ -/**begin repeat2 - * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# - * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG# - */ -static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ -(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len) -{ - const npyv_lanetype_@sfx@ *ip = src; - npy_bool *op = dst; - - // How many vectors can be packed into a u8 / bool vector? - #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@) - assert(PACK_FACTOR == 4 || PACK_FACTOR == 8); - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep * PACK_FACTOR; - - // unrolled iterations - for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - // Load vectors - #if @STYPE@ == CONTIG - // contiguous input - npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0); - npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1); - npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2); - npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3); - #if PACK_FACTOR == 8 - npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4); - npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5); - npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6); - npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7); - #endif - #else - // non-contiguous input - npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride); - npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride); - npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride); - npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride); - #if PACK_FACTOR == 8 - npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride); - npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride); - npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride); - npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride); - #endif - #endif - - #if PACK_FACTOR == 4 - npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3); - #elif PACK_FACTOR == 8 - npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7); - #endif - - #if @DTYPE@ == CONTIG - npyv_store_u8(op, r); - #else // @DTYPE@ == CONTIG - // Results are packed, so we can just loop over them - npy_uint8 lane[npyv_nlanes_u8]; - npyv_store_u8(lane, r); - for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ - op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)]; - } - #endif // @DTYPE@ == CONTIG - } - - // vector-sized iterations - for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { - #if @STYPE@ == CONTIG - npyv_@sfx@ v = npyv_load_@sfx@(ip); - #else - npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); - #endif - - npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v); - - npy_uint8 lane[npyv_nlanes_u8]; - npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r)); - - op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; - op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; - #if npyv_nlanes_@sfx@ == 4 - op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; - op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; - #endif - } - - #undef PACK_FACTOR - - // Scalar loop to finish off - for (; len > 0; --len, ip += istride, op += ostride) { - *op = (npy_@kind@(*ip) != 0); - } - - npyv_cleanup(); -} -/**end repeat2**/ -/**end repeat1**/ - -#endif // @VCHK@ -/**end repeat**/ - 
-/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# - */ - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - **/ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if @VCHK@ - const char *ip = args[0]; - char *op = args[1]; - const npy_intp istep = steps[0]; - const npy_intp ostep = steps[1]; - npy_intp len = dimensions[0]; - - if (!is_mem_overlap(ip, istep, op, ostep, len) && - npyv_loadable_stride_@sfx@(istep) && - npyv_storable_stride_@sfx@(ostep)) - { - const npy_intp istride = istep / sizeof(npyv_lanetype_@sfx@); - const npy_intp ostride = ostep / sizeof(npy_bool); - - if (istride == 1 && ostride == 1) { - simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len); - } - else if (ostride == 1) { - simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len); - } - else if (istride == 1) { - simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len); - } else { - simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len); - } - } else -#endif // @VCHK@ - { - UNARY_LOOP { - const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1; - *((npy_bool *)op1) = (npy_@kind@(in) != 0); - } - } - - npy_clear_floatstatus_barrier((char*)dimensions); -} -/**end repeat1**/ -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.cpp b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.cpp new file mode 100644 index 000000000000..a279f6b71ff7 --- /dev/null +++ b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.cpp @@ -0,0 +1,292 @@ +#include "loops_utils.h" +#include "loops.h" + +#include +#include "simd/simd.hpp" +#include "common.hpp" + +/** + * Force use SSE only on x86, even if AVX2 or AVX512F are enabled + * through the baseline, since scatter(AVX512F) and gather very costly + * to handle non-contiguous memory access comparing with SSE for + * such small operations that this file covers. 
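+ * Hence the kernels below operate on fixed 128-bit vectors via np::simd128.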
+*/ + +namespace { +using namespace np::simd128; + +template struct OpIsNaN { +#if NPY_SIMDX + template >> + HWY_INLINE HWY_ATTR auto operator()(const V& v) const { + return hn::IsNaN(v); + } +#endif + + NPY_INLINE T operator()(T a) const { + return npy_isnan(a) != 0; + } +}; + +template struct OpIsInf { +#if NPY_SIMDX + template >> + HWY_INLINE HWY_ATTR auto operator()(const V& v) const { + return hn::IsInf(v); + } +#endif + + NPY_INLINE T operator()(T a) const { + return npy_isinf(a) != 0; + } +}; + +template struct OpIsFinite { +#if NPY_SIMDX + template >> + HWY_INLINE HWY_ATTR auto operator()(const V& v) const { + return hn::IsFinite(v); + } +#endif + + NPY_INLINE T operator()(T a) const { + return npy_isfinite(a) != 0; + } +}; + +template struct OpSignbit { +#if NPY_SIMDX + template >> + HWY_INLINE HWY_ATTR auto operator()(const V& v) const { + return hn::IsNegative(v); + } +#endif + + NPY_INLINE T operator()(T a) const { + return npy_signbit(a) != 0; + } +}; + +#if NPY_SIMDX +template +HWY_INLINE HWY_ATTR auto LoadWithStride(const T* src, npy_intp ssrc, size_t n = Lanes(), T val = 0) { + HWY_LANES_CONSTEXPR size_t lanes = Lanes(); + std::vector temp(lanes, val); + for (size_t ii = 0; ii < lanes && ii < n; ++ii) { + temp[ii] = src[ii * ssrc]; + } + return LoadU(temp.data()); +} + +template +HWY_INLINE HWY_ATTR void StoreWithStride(hn::Vec vec, hn::TFromD* dst, npy_intp sdst, size_t n = hn::Lanes(D())) { + HWY_LANES_CONSTEXPR size_t lanes = hn::Lanes(D()); + std::vector> temp(lanes); + hn::StoreU(vec, D(), temp.data()); + for (size_t ii = 0; ii < lanes && ii < n; ++ii) { + dst[ii * sdst] = temp[ii]; + } +} + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/** Notes: + * - avoid the use of libmath to unify fp/domain errors + * for both scalars and vectors among all compilers/architectures. + * - use intrinsic LoadNOr instead of LoadN + * to fill the remind lanes with 1.0 to avoid divide by zero fp + * exception in reciprocal. + */ +template +HWY_INLINE HWY_ATTR void +simd_unary_fp_le(const T *src, npy_intp ssrc, npy_bool *dst, npy_intp sdst, npy_intp len) { + const OP op_func; + + // How many vectors can be packed into a u8 / bool vector? 
+ constexpr int PACK_FACTOR = sizeof(T); + const int vstep = Lanes(); + const int wstep = vstep * PACK_FACTOR; + + // unrolled iterations + for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { + Vec v0, v1, v2, v3; + if constexpr (STYPE == 0) { + v0 = LoadU(src); + v1 = LoadU(src + vstep); + v2 = LoadU(src + vstep*2); + v3 = LoadU(src + vstep*3); + } + else { + v0 = LoadWithStride(src, ssrc); + v1 = LoadWithStride(src + ssrc*vstep, ssrc); + v2 = LoadWithStride(src + ssrc*vstep*2, ssrc); + v3 = LoadWithStride(src + ssrc*vstep*3, ssrc); + } + + Vec r; + if constexpr (PACK_FACTOR == 8) { + Vec v4, v5, v6, v7; + if constexpr (STYPE == 0) { + v4 = LoadU(src + vstep*4); + v5 = LoadU(src + vstep*5); + v6 = LoadU(src + vstep*6); + v7 = LoadU(src + vstep*7); + } + else { + v4 = LoadWithStride(src + ssrc*vstep*4, ssrc); + v5 = LoadWithStride(src + ssrc*vstep*5, ssrc); + v6 = LoadWithStride(src + ssrc*vstep*6, ssrc); + v7 = LoadWithStride(src + ssrc*vstep*7, ssrc); + } + + auto m0 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), op_func(v0), op_func(v1)); + auto m1 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), op_func(v2), op_func(v3)); + auto m2 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), op_func(v4), op_func(v5)); + auto m3 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), op_func(v6), op_func(v7)); + auto m00 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), m0, m1); + auto m01 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), m2, m3); + r = hn::VecFromMask(_Tag(), hn::OrderedDemote2MasksTo(_Tag(), _Tag(), m00, m01)); + } + else { + auto m0 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), op_func(v0), op_func(v1)); + auto m1 = hn::OrderedDemote2MasksTo(_Tag(), _Tag(), op_func(v2), op_func(v3)); + r = hn::VecFromMask(_Tag(), hn::OrderedDemote2MasksTo(_Tag(), _Tag(), m0, m1)); + } + + if constexpr (DTYPE == 0) { + StoreU(r, dst); + } + else { + StoreWithStride<_Tag>(r, dst, sdst); + } + } + + // vector-sized iterations + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + Vec v_src; + if constexpr (STYPE == 0) { + v_src = LoadU(src); + } + else { + v_src = LoadWithStride(src, ssrc); + } + + if constexpr (PACK_FACTOR == 4) { + auto v_dst = hn::VecFromMask(hn::Half>>(), hn::DemoteMaskTo(hn::Half>>(), _Tag(), op_func(v_src))); + if constexpr (DTYPE == 0) { + hn::StoreU(v_dst, hn::Half>>(), dst); + } + else { + StoreWithStride>>>(v_dst, dst, sdst); + } + } + else { + auto v_dst = hn::VecFromMask(hn::Half>>>(), hn::DemoteMaskTo(hn::Half>>>(), _Tag(), op_func(v_src))); + if constexpr (DTYPE == 0) { + hn::StoreU(v_dst, hn::Half>>>(), dst); + } + else { + StoreWithStride>>>>(v_dst, dst, sdst); + } + } + } + + // last partial iteration, if needed + if (len > 0) { + Vec v_src; + if constexpr (STYPE == 0) { + v_src = hn::LoadN(_Tag(), src, len); + } + else { + v_src = LoadWithStride(src, ssrc, len); + } + + if constexpr (PACK_FACTOR == 4) { + auto v_dst = hn::VecFromMask(hn::Half>>(), hn::DemoteMaskTo(hn::Half>>(), _Tag(), op_func(v_src))); + if constexpr (DTYPE == 0) { + hn::StoreN(v_dst, hn::Half>>(), dst, len); + } + else { + StoreWithStride>>>(v_dst, dst, sdst, len); + } + } + else { + auto v_dst = hn::VecFromMask(hn::Half>>>(), hn::DemoteMaskTo(hn::Half>>>(), _Tag(), op_func(v_src))); + if constexpr (DTYPE == 0) { + hn::StoreN(v_dst, hn::Half>>>(), dst, len); + } + else { + StoreWithStride>>>>(v_dst, dst, sdst, len); + } + } + } +} +#endif // NPY_SIMDX + +template +HWY_INLINE HWY_ATTR void +unary_fp_le(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + const OP op_func; 
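+    // Unpack the ufunc arguments, then take the SIMD path when the strides are whole elements and the buffers do not overlap; otherwise fall back to the scalar loop below.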
+ const char *src = args[0]; char *dst = args[1]; + const npy_intp src_step = steps[0]; + const npy_intp dst_step = steps[1]; + npy_intp len = dimensions[0]; + + bool unrolled = false; +#if NPY_SIMDX + if constexpr (kSupportLane) { + if (!is_mem_overlap(src, src_step, dst, dst_step, len) && alignof(T) == sizeof(T) && + src_step % sizeof(T) == 0) { + const npy_intp ssrc = src_step / sizeof(T); + const npy_intp sdst = dst_step / sizeof(npy_bool); + if (ssrc == 1 && sdst == 1) { + simd_unary_fp_le(reinterpret_cast(src), 1, reinterpret_cast(dst), 1, len); + } + else if (sdst == 1) { + simd_unary_fp_le(reinterpret_cast(src), ssrc, reinterpret_cast(dst), 1, len); + } + else if (ssrc == 1) { + simd_unary_fp_le(reinterpret_cast(src), 1, reinterpret_cast(dst), sdst, len); + } + else { + simd_unary_fp_le(reinterpret_cast(src), ssrc, reinterpret_cast(dst), sdst, len); + } + unrolled = true; + } + } +#endif + + // fallback to scalar implementation + if (!unrolled) { + for (; len > 0; --len, src += src_step, dst += dst_step) { + const T src0 = *reinterpret_cast(src); + *reinterpret_cast(dst) = op_func(src0); + } + } + + npy_clear_floatstatus_barrier((char*)dimensions); +} + +} // anonymous namespace + +/******************************************************************************* + ** Defining ufunc inner functions + *******************************************************************************/ +#define DEFINE_UNARY_FP_LE_FUNCTION(TYPE, KIND, INTR, T) \ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) \ +{ \ + unary_fp_le>(args, dimensions, steps); \ +} + +DEFINE_UNARY_FP_LE_FUNCTION(FLOAT, isnan , IsNaN , npy_float) +DEFINE_UNARY_FP_LE_FUNCTION(FLOAT, isinf , IsInf , npy_float) +DEFINE_UNARY_FP_LE_FUNCTION(FLOAT, isfinite, IsFinite, npy_float) +DEFINE_UNARY_FP_LE_FUNCTION(FLOAT, signbit , Signbit , npy_float) +DEFINE_UNARY_FP_LE_FUNCTION(DOUBLE, isnan , IsNaN , npy_double) +DEFINE_UNARY_FP_LE_FUNCTION(DOUBLE, isinf , IsInf , npy_double) +DEFINE_UNARY_FP_LE_FUNCTION(DOUBLE, isfinite, IsFinite, npy_double) +DEFINE_UNARY_FP_LE_FUNCTION(DOUBLE, signbit , Signbit , npy_double) +#undef DEFINE_UNARY_FP_LE_FUNCTION