diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 3bffed752474..4fd648541829 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -902,13 +902,14 @@ umath_gen_headers = [ foreach gen_mtargets : [ [ 'loops_arithm_fp.dispatch.h', - src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'), + src_file.process('src/umath/loops_arithm_fp.dispatch.cpp.src'), [ [AVX2, FMA3], SSE2, ASIMD, NEON, VSX3, VSX2, VXE, VX, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/common/simd/README.md b/numpy/_core/src/common/simd/README.md new file mode 100644 index 000000000000..9a68d1aa1bfc --- /dev/null +++ b/numpy/_core/src/common/simd/README.md @@ -0,0 +1,263 @@ +# NumPy SIMD Wrapper for Highway + +This directory contains a lightweight C++ wrapper over Google's [Highway](https://github.com/google/highway) SIMD library, designed specifically for NumPy's needs. + +> **Note**: This directory also contains the C interface of universal intrinsics (under `simd.h`) which is no longer supported. The Highway wrapper described in this document should be used instead for all new SIMD code. + +## Overview + +The wrapper simplifies Highway's SIMD interface by eliminating class tags and using lane types directly, which can be deduced from arguments in most cases. This design makes the SIMD code more intuitive and easier to maintain while still leveraging Highway generic intrinsics. + +## Architecture + +The wrapper consists of two main headers: + +1. `simd.hpp`: The main header that defines namespaces and includes configuration macros +2. `simd.inc.hpp`: Implementation details included by `simd.hpp` multiple times for different namespaces + +Additionally, this directory contains legacy C interface files for universal intrinsics (`simd.h` and related files) which are deprecated and should not be used for new code. All new SIMD code should use the Highway wrapper. + + +## Usage + +### Basic Usage + +```cpp +#include "simd/simd.hpp" + +// Use np::simd for maximum width SIMD operations +using namespace np::simd; +float *data = /* ... */; +Vec v = LoadU(data); +v = Add(v, v); +StoreU(v, data); + +// Use np::simd128 for fixed 128-bit SIMD operations +using namespace np::simd128; +Vec v128 = LoadU(data); +v128 = Add(v128, v128); +StoreU(v128, data); +``` + +### Checking for SIMD Support + +```cpp +#include "simd/simd.hpp" + +// Check if SIMD is enabled +#if NPY_SIMDX + // SIMD code +#else + // Scalar fallback code +#endif + +// Check for float64 support +#if NPY_SIMDX_F64 + // Use float64 SIMD operations +#endif + +// Check for FMA support +#if NPY_SIMDX_FMA + // Use FMA operations +#endif +``` + +## Type Support and Constraints + +The wrapper provides type constraints to help with SFINAE (Substitution Failure Is Not An Error) and compile-time type checking: + +- `kSupportLane`: Determines whether the specified lane type is supported by the SIMD extension. + ```cpp + // Base template - always defined, even when SIMD is not enabled (for SFINAE) + template + constexpr bool kSupportLane = NPY_SIMDX != 0; + template <> + constexpr bool kSupportLane = NPY_SIMDX_F64 != 0; + ``` + +- `kMaxLanes`: Maximum number of lanes supported by the SIMD extension for the specified lane type. 
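+
+  For instance, `kSupportLane` can gate a template through SFINAE so that it is
+  only instantiated for lane types the enabled SIMD extension supports. A minimal
+  sketch (not part of the wrapper itself; `apply_add` is a hypothetical kernel name):
+
+  ```cpp
+  #include <cstddef>
+  #include <type_traits>
+
+  #include "simd/simd.hpp"
+
+  // Only instantiated when the lane type is supported; when NPY_SIMDX == 0,
+  // kSupportLane is false for every type, so this SIMD kernel is never selected.
+  template <typename TLane,
+            typename = std::enable_if_t<np::simd::kSupportLane<TLane>>>
+  void apply_add(TLane *dst, const TLane *src, size_t size)
+  {
+      using namespace np::simd;
+      const size_t lanes = Lanes<TLane>();
+      size_t i = 0;
+      for (; i + lanes <= size; i += lanes) {
+          StoreU(Add(LoadU(dst + i), LoadU(src + i)), dst + i);
+      }
+      for (; i < size; ++i) {
+          dst[i] += src[i];  // scalar tail
+      }
+  }
+  ```
+
+  `kMaxLanes`, unlike the runtime `Lanes<TLane>()`, is a compile-time constant, and
+  is defined in terms of the wrapper's internal `_Tag` alias: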
+ ```cpp + template + constexpr size_t kMaxLanes = HWY_MAX_LANES_D(_Tag); + ``` + +```cpp +#include "simd/simd.hpp" + +// Check if float64 operations are supported +if constexpr (np::simd::kSupportLane) { + // Use float64 operations +} +``` + +These constraints allow for compile-time checking of which lane types are supported, which can be used in SFINAE contexts to enable or disable functions based on type support. + +## Available Operations + +The wrapper provides the following common operations that are used in NumPy: + +- Vector creation operations: + - `Zero`: Returns a vector with all lanes set to zero + - `Set`: Returns a vector with all lanes set to the given value + - `Undefined`: Returns an uninitialized vector + +- Memory operations: + - `LoadU`: Unaligned load of a vector from memory + - `StoreU`: Unaligned store of a vector to memory + +- Vector information: + - `Lanes`: Returns the number of vector lanes based on the lane type + +- Type conversion: + - `BitCast`: Reinterprets a vector to a different type without modifying the underlying data + - `VecFromMask`: Converts a mask to a vector + +- Comparison operations: + - `Eq`: Element-wise equality comparison + - `Le`: Element-wise less than or equal comparison + - `Lt`: Element-wise less than comparison + - `Gt`: Element-wise greater than comparison + - `Ge`: Element-wise greater than or equal comparison + +- Arithmetic operations: + - `Add`: Element-wise addition + - `Sub`: Element-wise subtraction + - `Mul`: Element-wise multiplication + - `Div`: Element-wise division + - `Min`: Element-wise minimum + - `Max`: Element-wise maximum + - `Abs`: Element-wise absolute value + - `Sqrt`: Element-wise square root + +- Logical operations: + - `And`: Bitwise AND + - `Or`: Bitwise OR + - `Xor`: Bitwise XOR + - `AndNot`: Bitwise AND NOT (a & ~b) + +Additional Highway operations can be accessed via the `hn` namespace alias inside the `simd` or `simd128` namespaces. + +## Extending + +To add more operations from Highway: + +1. Import them in the `simd.inc.hpp` file using the `using` directive if they don't require a tag: + ```cpp + // For operations that don't require a tag + using hn::FunctionName; + ``` + +2. Define wrapper functions for intrinsics that require a class tag: + ```cpp + // For operations that require a tag + template + HWY_API ReturnType FunctionName(Args... args) { + return hn::FunctionName(_Tag(), args...); + } + ``` + +3. Add appropriate documentation and SFINAE constraints if needed + + +## Build Configuration + +The SIMD wrapper automatically disables SIMD operations when optimizations are disabled: + +- When `NPY_DISABLE_OPTIMIZATION` is defined, SIMD operations are disabled +- SIMD is enabled only when the Highway target is not scalar (`HWY_TARGET != HWY_SCALAR`) + +## Design Notes + +1. **Why avoid Highway scalar operations?** + - NumPy already provides kernels for scalar operations + - Compilers can better optimize standard library implementations + - Not all Highway intrinsics are fully supported in scalar mode + +2. **Legacy Universal Intrinsics** + - The older universal intrinsics C interface (in `simd.h` and accessible via `NPY_SIMD` macros) is deprecated + - All new SIMD code should use this Highway-based wrapper (accessible via `NPY_SIMDX` macros) + - The legacy code is maintained for compatibility but will eventually be removed + +3. **Feature Detection Constants vs. 
Highway Constants** + - NumPy-specific constants (`NPY_SIMDX_F16`, `NPY_SIMDX_F64`, `NPY_SIMDX_FMA`) provide additional safety beyond raw Highway constants + - Highway constants (e.g., `HWY_HAVE_FLOAT16`) only check platform capabilities but don't consider NumPy's build configuration + - Our constants combine both checks: + ```cpp + #define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) + ``` + - This ensures SIMD features won't be used when: + - Platform supports it but NumPy optimization is disabled via meson option: + ``` + option('disable-optimization', type: 'boolean', value: false, + description: 'Disable CPU optimized code (dispatch,simd,unroll...)') + ``` + - Highway target is scalar (`HWY_TARGET == HWY_SCALAR`) + - Using these constants ensures consistent behavior across different compilation settings + - Without this additional layer, code might incorrectly try to use SIMD paths in scalar mode + +4. **Namespace Design** + - `np::simd`: Maximum width SIMD operations (scalable) + - `np::simd128`: Fixed 128-bit SIMD operations + - `hn`: Highway namespace alias (available within the SIMD namespaces) + +5. **Why Namespaces and Why Not Just Use Highway Directly?** + - Highway's design uses class tag types as template parameters (e.g., `Vec>`) when defining vector types + - Many Highway functions require explicitly passing a tag instance as the first parameter + - This class tag-based approach increases verbosity and complexity in user code + - Our wrapper eliminates this by internally managing tags through namespaces, letting users directly use types e.g. `Vec` + - Simple example with raw Highway: + ```cpp + // Highway's approach + float *data = /* ... */; + + namespace hn = hwy::HWY_NAMESPACE; + using namespace hn; + + // Full-width operations + ScalableTag df; // Create a tag instance + Vec v = LoadU(df, data); // LoadU requires a tag instance + StoreU(v, df, data); // StoreU requires a tag instance + + // 128-bit operations + Full128 df128; // Create a 128-bit tag instance + Vec v128 = LoadU(df128, data); // LoadU requires a tag instance + StoreU(v128, df128, data); // StoreU requires a tag instance + ``` + + - Simple example with our wrapper: + ```cpp + // Our wrapper approach + float *data = /* ... */; + + // Full-width operations + using namespace np::simd; + Vec v = LoadU(data); // Full-width vector load + StoreU(v, data); + + // 128-bit operations + using namespace np::simd128; + Vec v128 = LoadU(data); // 128-bit vector load + StoreU(v128, data); + ``` + + - The namespaced approach simplifies code, reduces errors, and provides a more intuitive interface + - It preserves all Highway operations benefits while reducing cognitive overhead + +5. **Why Namespaces Are Essential for This Design?** + - Namespaces allow us to define different internal tag types (`hn::ScalableTag` in `np::simd` vs `hn::Full128` in `np::simd128`) + - This provides a consistent type-based interface (`Vec`) without requiring users to manually create tags + - Enables using the same function names (like `LoadU`) with different implementations based on SIMD width + - Without namespaces, we'd have to either reintroduce tags (defeating the purpose of the wrapper) or create different function names for each variant (e.g., `LoadU` vs `LoadU128`) + +6. 
**Template Type Parameters** + - `TLane`: The scalar type for each vector lane (e.g., uint8_t, float, double) + + +## Requirements + +- C++17 or later +- Google Highway library + +## License + +Same as NumPy's license diff --git a/numpy/_core/src/common/simd/simd.hpp b/numpy/_core/src/common/simd/simd.hpp new file mode 100644 index 000000000000..698da4adf865 --- /dev/null +++ b/numpy/_core/src/common/simd/simd.hpp @@ -0,0 +1,80 @@ +#ifndef NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ +#define NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ + +/** + * This header provides a thin wrapper over Google's Highway SIMD library. + * + * The wrapper aims to simplify the SIMD interface of Google's Highway by + * get ride of its class tags and use lane types directly which can be deduced + * from the args in most cases. + */ +/** + * Since `NPY_SIMD` is only limited to NumPy C universal intrinsics, + * `NPY_SIMDX` is defined to indicate the SIMD availability for Google's Highway + * C++ code. + * + * Highway SIMD is only available when optimization is enabled. + * When NPY_DISABLE_OPTIMIZATION is defined, SIMD operations are disabled + * and the code falls back to scalar implementations. + */ +#ifndef NPY_DISABLE_OPTIMIZATION +#include + +/** + * We avoid using Highway scalar operations for the following reasons: + * 1. We already provide kernels for scalar operations, so falling back to + * the NumPy implementation is more appropriate. Compilers can often + * optimize these better since they rely on standard libraries. + * 2. Not all Highway intrinsics are fully supported in scalar mode. + * + * Therefore, we only enable SIMD when the Highway target is not scalar. + */ +#define NPY_SIMDX (HWY_TARGET != HWY_SCALAR) + +// Indicates if the SIMD operations are available for float16. +#define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) +// Note: Highway requires SIMD extentions with native float32 support, so we don't need +// to check for it. + +// Indicates if the SIMD operations are available for float64. +#define NPY_SIMDX_F64 (NPY_SIMDX && HWY_HAVE_FLOAT64) + +// Indicates if the SIMD floating operations are natively supports fma. +#define NPY_SIMDX_FMA (NPY_SIMDX && HWY_NATIVE_FMA) + +#else +#define NPY_SIMDX 0 +#define NPY_SIMDX_F16 0 +#define NPY_SIMDX_F64 0 +#define NPY_SIMDX_FMA 0 +#endif + +namespace np { + +/// Represents the max SIMD width supported by the platform. +namespace simd { +#if NPY_SIMDX +/// The highway namespace alias. +/// We can not import all the symbols from the HWY_NAMESPACE because it will +/// conflict with the existing symbols in the numpy namespace. +namespace hn = hwy::HWY_NAMESPACE; +// internaly used by the template header +template +using _Tag = hn::ScalableTag; +#endif +#include "simd.inc.hpp" +} // namespace simd + +/// Represents the 128-bit SIMD width. +namespace simd128 { +#if NPY_SIMDX +namespace hn = hwy::HWY_NAMESPACE; +template +using _Tag = hn::Full128; +#endif +#include "simd.inc.hpp" +} // namespace simd128 + +} // namespace np + +#endif // NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ diff --git a/numpy/_core/src/common/simd/simd.inc.hpp b/numpy/_core/src/common/simd/simd.inc.hpp new file mode 100644 index 000000000000..64d28bc47118 --- /dev/null +++ b/numpy/_core/src/common/simd/simd.inc.hpp @@ -0,0 +1,132 @@ +#ifndef NPY_SIMDX +#error "This is not a standalone header. Include simd.hpp instead." 
+#define NPY_SIMDX 1 // Prevent editors from graying out the happy branch
+#endif
+
+// Using anonymous namespace instead of inline to ensure each translation unit
+// gets its own copy of constants based on local compilation flags
+namespace {
+
+// NOTE: This file is included by simd.hpp multiple times with different namespaces
+// so avoid including any headers here
+
+/**
+ * Determines whether the specified lane type is supported by the SIMD extension.
+ * Always defined as false when SIMD is not enabled, so it can be used in SFINAE.
+ *
+ * @tparam TLane The lane type to check for support.
+ */
+template <typename TLane>
+constexpr bool kSupportLane = NPY_SIMDX != 0;
+
+#if NPY_SIMDX
+// Define lane type support based on Highway capabilities
+template <>
+constexpr bool kSupportLane<hwy::float16_t> = HWY_HAVE_FLOAT16 != 0;
+template <>
+constexpr bool kSupportLane<double> = HWY_HAVE_FLOAT64 != 0;
+template <>
+constexpr bool kSupportLane<long double> =
+        HWY_HAVE_FLOAT64 != 0 && sizeof(long double) == sizeof(double);
+
+/// Maximum number of lanes supported by the SIMD extension for the specified lane type.
+template <typename TLane>
+constexpr size_t kMaxLanes = HWY_MAX_LANES_D(_Tag<TLane>);
+
+/// Represents an N-lane vector based on the specified lane type.
+/// @tparam TLane The scalar type for each vector lane
+template <typename TLane>
+using Vec = hn::Vec<_Tag<TLane>>;
+
+/// Represents a mask vector with boolean values or as a bitmask.
+/// @tparam TLane The scalar type the mask corresponds to
+template <typename TLane>
+using Mask = hn::Mask<_Tag<TLane>>;
+
+/// Unaligned load of a vector from memory.
+template <typename TLane>
+HWY_API Vec<TLane>
+LoadU(const TLane *ptr)
+{
+    return hn::LoadU(_Tag<TLane>(), ptr);
+}
+
+/// Unaligned store of a vector to memory.
+template <typename TLane>
+HWY_API void
+StoreU(const Vec<TLane> &a, TLane *ptr)
+{
+    hn::StoreU(a, _Tag<TLane>(), ptr);
+}
+
+/// Returns the number of vector lanes based on the lane type.
+template <typename TLane>
+HWY_API HWY_LANES_CONSTEXPR size_t
+Lanes(TLane tag = 0)
+{
+    return hn::Lanes(_Tag<TLane>());
+}
+
+/// Returns an uninitialized N-lane vector.
+template <typename TLane>
+HWY_API Vec<TLane>
+Undefined(TLane tag = 0)
+{
+    return hn::Undefined(_Tag<TLane>());
+}
+
+/// Returns N-lane vector with all lanes equal to zero.
+template <typename TLane>
+HWY_API Vec<TLane>
+Zero(TLane tag = 0)
+{
+    return hn::Zero(_Tag<TLane>());
+}
+
+/// Returns N-lane vector with all lanes equal to the given value of type `TLane`.
+template <typename TLane>
+HWY_API Vec<TLane>
+Set(TLane val)
+{
+    return hn::Set(_Tag<TLane>(), val);
+}
+
+/// Converts a mask to a vector based on the specified lane type.
+template <typename TLane, typename TMask>
+HWY_API Vec<TLane>
+VecFromMask(const TMask &m)
+{
+    return hn::VecFromMask(_Tag<TLane>(), m);
+}
+
+/// Convert (Reinterpret) an N-lane vector to a different type without modifying the
+/// underlying data.
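+/// For example, a float32 vector can be bit-cast to unsigned 32-bit lanes in order
+/// to inspect or manipulate its IEEE-754 bit pattern directly.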
+template +HWY_API Vec +BitCast(const TVec &v) +{ + return hn::BitCast(_Tag(), v); +} + +// Import common Highway intrinsics +using hn::Abs; +using hn::Add; +using hn::And; +using hn::AndNot; +using hn::Div; +using hn::Eq; +using hn::Ge; +using hn::Gt; +using hn::Le; +using hn::Lt; +using hn::Max; +using hn::Min; +using hn::Mul; +using hn::Or; +using hn::Sqrt; +using hn::Sub; +using hn::Xor; + +#endif // NPY_SIMDX + +} // namespace anonymous diff --git a/numpy/_core/src/highway b/numpy/_core/src/highway index 0b696633f9ad..12b325bc1793 160000 --- a/numpy/_core/src/highway +++ b/numpy/_core/src/highway @@ -1 +1 @@ -Subproject commit 0b696633f9ad89497dd5532b55eaa01625ad71ca +Subproject commit 12b325bc1793dee68ab2157995a690db859fe9e0 diff --git a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src deleted file mode 100644 index 94bc24811e1d..000000000000 --- a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src +++ /dev/null @@ -1,652 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -/** - * TODO: - * - Improve the implementation of SIMD complex absolute, - * current one kinda slow and it can be optimized by - * at least avoiding the division and keep sqrt. - * - Vectorize reductions - * - Add support for ASIMD/VCMLA through universal intrinsics. - */ - -//############################################################################### -//## Real Single/Double precision -//############################################################################### -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #c = f, # - * #C = F, # - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # intrin = add, sub, mul, div# - * # OP = +, -, *, /# - * # PW = 1, 0, 0, 0# - * # is_div = 0*3, 1# - * # is_mul = 0*2, 1, 0# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - npy_intp len = dimensions[0]; - char *src0 = args[0], *src1 = args[1], *dst = args[2]; - npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; - // reduce - if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) { - #if @PW@ - *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1); - #else - @type@ acc = *((@type@*)src0); - if (ssrc1 == sizeof(@type@)) { - for (; len > 0; --len, src1 += sizeof(@type@)) { - acc @OP@= *(@type@ *)src1; - } - } else { - for (; len > 0; --len, src1 += ssrc1) { - acc @OP@= *(@type@ *)src1; - } - } - *((@type@*)src0) = acc; - #endif - return; - } -#if @is_div@ && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 - /** - * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD - * support for single-precision floating-point division. Only scalar division is - * supported natively, and without hardware for performance and accuracy comparison, - * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus - * native scalar division. 
- * - * The `npyv_div_f32` universal intrinsic emulates the division operation using an - * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced - * precision. However, this approach has limitations: - * - * - It can cause unexpected floating-point overflows in special cases, such as when - * the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097). - * - * - The precision may vary between the emulated SIMD and scalar division due to - * non-uniform branches (non-contiguous) in the code, leading to precision - * inconsistencies. - * - * - Considering the necessity of multiple Newton-Raphson iterations, the performance - * gain may not sufficiently offset these drawbacks. - */ -#elif @VECTOR@ - if (len > npyv_nlanes_@sfx@*2 && - !is_mem_overlap(src0, ssrc0, dst, sdst, len) && - !is_mem_overlap(src1, ssrc1, dst, sdst, len) - ) { - const int vstep = npyv_nlanes_u8; - const int wstep = vstep * 2; - const int hstep = npyv_nlanes_@sfx@; - const int lstep = hstep * 2; - // lots of specializations, to squeeze out max performance - if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) { - for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { - npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); - npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); - npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); - npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); - npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0); - npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1); - npyv_store_@sfx@((@type@*)dst, r0); - npyv_store_@sfx@((@type@*)(dst + vstep), r1); - } - for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { - #if @is_div@ - npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); - npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); - #else - npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); - npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); - #endif - npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); - npyv_store_till_@sfx@((@type@*)dst, len, r); - } - } - else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) { - npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0)); - for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) { - npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); - npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); - npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0); - npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1); - npyv_store_@sfx@((@type@*)dst, r0); - npyv_store_@sfx@((@type@*)(dst + vstep), r1); - } - for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { - #if @is_div@ || @is_mul@ - npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); - #else - npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); - #endif - npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); - npyv_store_till_@sfx@((@type@*)dst, len, r); - } - } - else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) { - npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1)); - for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) { - npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); - npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); - npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b); - npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b); - npyv_store_@sfx@((@type@*)dst, r0); - npyv_store_@sfx@((@type@*)(dst + vstep), r1); - } - for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { - #if 
@is_mul@ - npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); - #elif @is_div@ - npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, NPY_NAN@C@); - #else - npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); - #endif - npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); - npyv_store_till_@sfx@((@type@*)dst, len, r); - } - } else { - goto loop_scalar; - } - npyv_cleanup(); - return; - } -loop_scalar: -#endif - for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { - const @type@ a = *((@type@*)src0); - const @type@ b = *((@type@*)src1); - *((@type@*)dst) = a @OP@ b; - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - *indexed = *indexed @OP@ *(@type@ *)value; - } - return 0; -} - -/**end repeat1**/ -/**end repeat**/ - -//############################################################################### -//## Complex Single/Double precision -//############################################################################### - -/******************************************************************************** - ** op intrinsics - ********************************************************************************/ - -#if NPY_SIMD_F32 -NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a) -{ - npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a)); - npyv_f32x2 r; - r.val[0] = fill; - r.val[1] = fill; - return r; -} - -NPY_FINLINE npyv_f32 -simd_cconjugate_f32(npyv_f32 x) -{ -#if NPY_SIMD_BIGENDIAN - const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000)); -#else - const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL)); -#endif - return npyv_xor_f32(x, mask); -} - -NPY_FINLINE npyv_f32 -simd_cmul_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2); - npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2); - npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3); - // a_im * b_im, a_im * b_re - npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev); - return npyv_muladdsub_f32(a_re, b, ab_iiir); -} - -NPY_FINLINE npyv_f32 -simd_csquare_f32(npyv_f32 x) -{ return simd_cmul_f32(x, x); } -#endif - -#if NPY_SIMD_F64 - -NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a) -{ - npyv_f64 r = npyv_setall_f64(a[0]); - npyv_f64 i = npyv_setall_f64(a[1]); - return npyv_zip_f64(r, i); -} - -NPY_FINLINE npyv_f64 -simd_cconjugate_f64(npyv_f64 x) -{ - const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64( - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, - 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL - )); - return npyv_xor_f64(x, mask); -} - -NPY_FINLINE npyv_f64 -simd_cmul_f64(npyv_f64 a, 
npyv_f64 b) -{ - npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0); - npyv_f64 a_re = npyv_permi128_f64(a, 0, 0); - npyv_f64 a_im = npyv_permi128_f64(a, 1, 1); - // a_im * b_im, a_im * b_re - npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev); - return npyv_muladdsub_f64(a_re, b, ab_iiir); -} - -NPY_FINLINE npyv_f64 -simd_csquare_f64(npyv_f64 x) -{ return simd_cmul_f64(x, x); } -#endif - -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * complex types - * #TYPE = CFLOAT, CDOUBLE# - * #ftype = npy_float, npy_double# - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# - * #sfx = f32, f64# - * #c = f, # - * #C = F, # - */ -/**begin repeat1 - * arithmetic - * #kind = add, subtract, multiply# - * #vectorf = npyv_add, npyv_sub, simd_cmul# - * #OP = +, -, *# - * #PW = 1, 0, 0# - * #is_mul = 0*2, 1# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - npy_intp len = dimensions[0]; - char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2]; - npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2]; -#if @PW@ - // reduce - if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst && - b_ssrc1 % (sizeof(@ftype@)*2) == 0 - ) { - @ftype@ *rl_im = (@ftype@ *)b_src0; - @ftype@ rr, ri; - @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2); - rl_im[0] @OP@= rr; - rl_im[1] @OP@= ri; - return; - } -#endif -#if @VECTOR@ - // Certain versions of Apple clang (commonly used in CI images) produce - // non-deterministic output in the mul path with AVX2 enabled on x86_64. - // Work around by scalarising. - #if @is_mul@ \ - && defined(NPY_CPU_AMD64) && defined(__clang__) \ - && defined(__apple_build_version__) \ - && __apple_build_version__ >= 14000000 \ - && __apple_build_version__ < 14030000 - goto loop_scalar; - #endif // end affected Apple clang. 
- - if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) || - is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) || - !npyv_loadable_stride_@sfx@(b_ssrc0) || - !npyv_loadable_stride_@sfx@(b_ssrc1) || - !npyv_storable_stride_@sfx@(b_sdst) || - b_sdst == 0 - ) { - goto loop_scalar; - } - - const @ftype@ *src0 = (@ftype@*)b_src0; - const @ftype@ *src1 = (@ftype@*)b_src1; - @ftype@ *dst = (@ftype@*)b_dst; - - const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@); - const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@); - const npy_intp sdst = b_sdst / sizeof(@ftype@); - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep * 2; - const int hstep = vstep / 2; - - // lots**lots of specializations, to squeeze out max performance - // contig - if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) { - for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) { - npyv_@sfx@ a0 = npyv_load_@sfx@(src0); - npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); - npyv_@sfx@ b0 = npyv_load_@sfx@(src1); - npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); - npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); - npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); - npyv_store_@sfx@(dst, r0); - npyv_store_@sfx@(dst + vstep, r1); - } - for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); - npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); - npyv_@sfx@ r = @vectorf@_@sfx@(a, b); - npyv_store2_till_@sfx@(dst, len, r); - } - } - // scalar 0 - else if (ssrc0 == 0) { - npyv_@sfx@x2 a = simd_set2_@sfx@(src0); - // contig - if (ssrc1 == 2 && sdst == ssrc1) { - for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) { - npyv_@sfx@ b0 = npyv_load_@sfx@(src1); - npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); - npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); - npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); - npyv_store_@sfx@(dst, r0); - npyv_store_@sfx@(dst + vstep, r1); - } - for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { - #if @is_mul@ - npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@); - #else - npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); - #endif - npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); - npyv_store2_till_@sfx@(dst, len, r); - } - } - // non-contig - else { - for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) { - npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); - npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); - npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); - npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); - npyv_storen2_@sfx@(dst, sdst, r0); - npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); - } - for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) { - #if @is_mul@ - npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); - #else - npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); - #endif - npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); - npyv_storen2_till_@sfx@(dst, sdst, len, r); - } - } - } - // scalar 1 - else if (ssrc1 == 0) { - npyv_@sfx@x2 b = simd_set2_@sfx@(src1); - if (ssrc0 == 2 && sdst == ssrc0) { - for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) { - npyv_@sfx@ a0 = npyv_load_@sfx@(src0); - npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); - npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); - npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); - npyv_store_@sfx@(dst, r0); - npyv_store_@sfx@(dst + vstep, r1); - } - for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { - #if @is_mul@ - npyv_@sfx@ a = 
npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@); - #else - npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); - #endif - npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); - npyv_store2_till_@sfx@(dst, len, r); - } - } - // non-contig - else { - for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) { - npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); - npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); - npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); - npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); - npyv_storen2_@sfx@(dst, sdst, r0); - npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); - } - for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) { - #if @is_mul@ - npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); - #else - npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); - #endif - npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); - npyv_storen2_till_@sfx@(dst, sdst, len, r); - } - } - } - #if @is_mul@ - // non-contig - else { - for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, - src1 += ssrc1*vstep, dst += sdst*vstep - ) { - npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); - npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); - npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); - npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); - npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); - npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); - npyv_storen2_@sfx@(dst, sdst, r0); - npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); - } - for (; len > 0; len -= hstep, src0 += ssrc0*hstep, - src1 += ssrc1*hstep, dst += sdst*hstep - ) { - #if @is_mul@ - npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); - npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); - #else - npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); - npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); - #endif - npyv_@sfx@ r = @vectorf@_@sfx@(a, b); - npyv_storen2_till_@sfx@(dst, sdst, len, r); - } - } - #else /* @is_mul@ */ - else { - // Only multiply is vectorized for the generic non-contig case. 
- goto loop_scalar; - } - #endif /* @is_mul@ */ - - npyv_cleanup(); - return; - -loop_scalar: -#endif - for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) { - const @ftype@ a_r = ((@ftype@ *)b_src0)[0]; - const @ftype@ a_i = ((@ftype@ *)b_src0)[1]; - const @ftype@ b_r = ((@ftype@ *)b_src1)[0]; - const @ftype@ b_i = ((@ftype@ *)b_src1)[1]; - #if @is_mul@ - ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i; - ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r; - #else - ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r; - ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i; - #endif - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @ftype@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@ftype@ *)(ip1 + is1 * indx); - const @ftype@ b_r = ((@ftype@ *)value)[0]; - const @ftype@ b_i = ((@ftype@ *)value)[1]; - #if @is_mul@ - const @ftype@ a_r = indexed[0]; - const @ftype@ a_i = indexed[1]; - indexed[0] = a_r*b_r - a_i*b_i; - indexed[1] = a_r*b_i + a_i*b_r; - #else - indexed[0] @OP@= b_r; - indexed[1] @OP@= b_i; - #endif - } - return 0; -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = conjugate, square# - * #is_square = 0, 1# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - npy_intp len = dimensions[0]; - char *b_src = args[0], *b_dst = args[1]; - npy_intp b_ssrc = steps[0], b_sdst = steps[1]; -#if @VECTOR@ - if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) || - !npyv_loadable_stride_@sfx@(b_ssrc) || - !npyv_storable_stride_@sfx@(b_sdst) - ) { - goto loop_scalar; - } - const @ftype@ *src = (@ftype@*)b_src; - @ftype@ *dst = (@ftype@*)b_dst; - const npy_intp ssrc = b_ssrc / sizeof(@ftype@); - const npy_intp sdst = b_sdst / sizeof(@ftype@); - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep * 2; - const int hstep = vstep / 2; - - if (ssrc == 2 && ssrc == sdst) { - for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) { - npyv_@sfx@ a0 = npyv_load_@sfx@(src); - npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); - npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); - npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); - npyv_store_@sfx@(dst, r0); - npyv_store_@sfx@(dst + vstep, r1); - } - for (; len > 0; len -= hstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); - npyv_@sfx@ r = simd_c@kind@_@sfx@(a); - npyv_store2_till_@sfx@(dst, len, r); - } - } - else if (ssrc == 2) { - for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) { - npyv_@sfx@ a0 = npyv_load_@sfx@(src); - npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); - npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); - npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); - npyv_storen2_@sfx@(dst, sdst, r0); - npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); - } - for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) { - npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); - npyv_@sfx@ r = simd_c@kind@_@sfx@(a); - npyv_storen2_till_@sfx@(dst, sdst, len, r); - } - } - else if (sdst == 2) { - for (; len >= vstep; len -= vstep, src += ssrc*vstep, 
dst += wstep) { - npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc); - npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc); - npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); - npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); - npyv_store_@sfx@(dst, r0); - npyv_store_@sfx@(dst + vstep, r1); - } - for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) { - npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len); - npyv_@sfx@ r = simd_c@kind@_@sfx@(a); - npyv_store2_till_@sfx@(dst, len, r); - } - } - else { - goto loop_scalar; - } - npyv_cleanup(); - return; -loop_scalar: -#endif - for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) { - const @ftype@ rl = ((@ftype@ *)b_src)[0]; - const @ftype@ im = ((@ftype@ *)b_src)[1]; - #if @is_square@ - ((@ftype@ *)b_dst)[0] = rl*rl - im*im; - ((@ftype@ *)b_dst)[1] = rl*im + im*rl; - #else - ((@ftype@ *)b_dst)[0] = rl; - ((@ftype@ *)b_dst)[1] = -im; - #endif - } -} -/**end repeat1**/ -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_arithm_fp.dispatch.cpp.src b/numpy/_core/src/umath/loops_arithm_fp.dispatch.cpp.src new file mode 100644 index 000000000000..643816655e56 --- /dev/null +++ b/numpy/_core/src/umath/loops_arithm_fp.dispatch.cpp.src @@ -0,0 +1,615 @@ +#include "loops_utils.h" +#include "loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +#include +#include "simd/simd.hpp" +using namespace np::simd; + +/** + * TODO: + * - Improve the implementation of SIMD complex absolute, + * current one kinda slow and it can be optimized by + * at least avoiding the division and keep sqrt. + * - Vectorize reductions + * - Add support for ASIMD/VCMLA through universal intrinsics. + */ + +//############################################################################### +//## Real Single/Double precision +//############################################################################### +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * Float types + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #c = f, # + * #C = F, # + * #VECTOR = NPY_HWY, (NPY_HWY && HWY_HAVE_FLOAT64)# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # intrin = Add, Sub, Mul, Div# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + * # is_div = 0*3, 1# + * # is_mul = 0*2, 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + using T = @type@; + npy_intp len = dimensions[0]; + char *src0 = args[0], *src1 = args[1], *dst = args[2]; + npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; + // reduce + if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) { + #if @PW@ + *((T*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1); + #else + T acc = *((T*)src0); + if (ssrc1 == sizeof(T)) { + for (; len > 0; --len, src1 += sizeof(T)) { + acc @OP@= *(T *)src1; + } + } else { + for (; len > 0; --len, src1 += ssrc1) { + acc @OP@= *(T *)src1; + } + } + *((T*)src0) = acc; + #endif + return; + } +#if @is_div@ && HWY_HAVE_NEON_FP && !HWY_HAVE_FLOAT64 + /** + * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD + * support for single-precision floating-point division. 
Only scalar division is + * supported natively, and without hardware for performance and accuracy comparison, + * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus + * native scalar division. + * + * The `npyv_div_f32` universal intrinsic emulates the division operation using an + * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced + * precision. However, this approach has limitations: + * + * - It can cause unexpected floating-point overflows in special cases, such as when + * the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097). + * + * - The precision may vary between the emulated SIMD and scalar division due to + * non-uniform branches (non-contiguous) in the code, leading to precision + * inconsistencies. + * + * - Considering the necessity of multiple Newton-Raphson iterations, the performance + * gain may not sufficiently offset these drawbacks. + */ +#elif @VECTOR@ + if (static_cast(len) > Lanes()*2 && + !is_mem_overlap(src0, ssrc0, dst, sdst, len) && + !is_mem_overlap(src1, ssrc1, dst, sdst, len) + ) { + const int vstep = Lanes(); + const int wstep = vstep * 2; + const int hstep = Lanes(); + const int lstep = hstep * 2; + // lots of specializations, to squeeze out max performance + if (ssrc0 == sizeof(T) && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { + auto a0 = LoadU((const T*)src0); + auto a1 = LoadU((const T*)(src0 + vstep)); + auto b0 = LoadU((const T*)src1); + auto b1 = LoadU((const T*)(src1 + vstep)); + auto r0 = @intrin@(a0, b0); + auto r1 = @intrin@(a1, b1); + StoreU(r0, (T*)dst); + StoreU(r1, (T*)(dst + vstep)); + } + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + #if @is_div@ + auto a = hn::LoadNOr(Set(1.0@c@), _Tag(), (const T*)src0, len); + auto b = hn::LoadNOr(Set(1.0@c@), _Tag(), (const T*)src1, len); + #else + auto a = hn::LoadN(_Tag(), (const T*)src0, len); + auto b = hn::LoadN(_Tag(), (const T*)src1, len); + #endif + auto r = @intrin@(a, b); + hn::StoreN(r, _Tag(), (T*)dst, len); + } + } + else if (ssrc0 == 0 && ssrc1 == sizeof(T) && sdst == ssrc1) { + auto a = Set(*((T*)src0)); + for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) { + auto b0 = LoadU((const T*)src1); + auto b1 = LoadU((const T*)(src1 + vstep)); + auto r0 = @intrin@(a, b0); + auto r1 = @intrin@(a, b1); + StoreU(r0, (T*)dst); + StoreU(r1, (T*)(dst + vstep)); + } + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + auto b = hn::LoadNOr(Set(1.0@c@), _Tag(), (const T*)src1, len); + #else + auto b = hn::LoadN(_Tag(), (const T*)src1, len); + #endif + auto r = @intrin@(a, b); + hn::StoreN(r, _Tag(), (T*)dst, len); + } + } + else if (ssrc1 == 0 && ssrc0 == sizeof(T) && sdst == ssrc0) { + auto b = Set(*((T*)src1)); + for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) { + auto a0 = LoadU((const T*)src0); + auto a1 = LoadU((const T*)(src0 + vstep)); + auto r0 = @intrin@(a0, b); + auto r1 = @intrin@(a1, b); + StoreU(r0, (T*)dst); + StoreU(r1, (T*)(dst + vstep)); + } + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_mul@ + auto a = hn::LoadNOr(Set(1.0@c@), _Tag(), (const T*)src0, len); + #elif @is_div@ + auto a = hn::LoadNOr(Set(NPY_NAN@C@), _Tag(), (const T*)src0, len); + #else + auto a = hn::LoadN(_Tag(), (const T*)src0, len); + #endif + auto r = @intrin@(a, b); + hn::StoreN(r, _Tag(), (T*)dst, len); + } + } else { + goto loop_scalar; + } + 
return; + } +loop_scalar: +#endif + for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { + const T a = *((T*)src0); + const T b = *((T*)src1); + *((T*)dst) = a @OP@ b; + } +} + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + indexed = (@type@ *)(ip1 + is1 * indx); + *indexed = *indexed @OP@ *(@type@ *)value; + } + return 0; +} + +/**end repeat1**/ +/**end repeat**/ + +//############################################################################### +//## Complex Single/Double precision +//############################################################################### + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + * #ftype = npy_float, npy_double# + * #sfx = f32, f64# + * #c = f, # + * #C = F, # + * #VECTOR = NPY_HWY, (NPY_HWY && HWY_HAVE_FLOAT64)# + */ +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + * #intrin = Add, Sub, MulComplex# + * #OP = +, -, *# + * #PW = 1, 0, 0# + * #is_mul = 0*2, 1# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + using T = @ftype@; + using D = std::conditional_t; + npy_intp len = dimensions[0]; + char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2]; + npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2]; +#if @PW@ + // reduce + if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst && + b_ssrc1 % (sizeof(T)*2) == 0 + ) { + T *rl_im = (T *)b_src0; + T rr, ri; + @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2); + rl_im[0] @OP@= rr; + rl_im[1] @OP@= ri; + return; + } +#endif +#if @VECTOR@ + // Certain versions of Apple clang (commonly used in CI images) produce + // non-deterministic output in the mul path with AVX2 enabled on x86_64. + // Work around by scalarising. + #if @is_mul@ \ + && defined(NPY_CPU_AMD64) && defined(__clang__) \ + && defined(__apple_build_version__) \ + && __apple_build_version__ >= 14000000 \ + && __apple_build_version__ < 14030000 + goto loop_scalar; + #endif // end affected Apple clang. 
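+    // Vectorize only when neither input overlaps the output, T is not over-aligned
+    // (sizeof(T) == alignof(T)), every byte stride is a whole multiple of sizeof(T),
+    // and the output stride is non-zero; otherwise fall back to the scalar loop.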
+ + if (!is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) && + !is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) && + sizeof(T) == alignof(T) && b_ssrc0 % sizeof(T) == 0 && b_ssrc1 % sizeof(T) == 0 && + b_sdst % sizeof(T) == 0 && b_sdst != 0 + ) { + const T *src0 = (T*)b_src0; + const T *src1 = (T*)b_src1; + T *dst = (T*)b_dst; + + const npy_intp ssrc0 = b_ssrc0 / sizeof(T); + const npy_intp ssrc1 = b_ssrc1 / sizeof(T); + const npy_intp sdst = b_sdst / sizeof(T); + + const int vstep = Lanes(); + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + // lots**lots of specializations, to squeeze out max performance + // contig + if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) { + auto a0 = LoadU(src0); + auto a1 = LoadU(src0 + vstep); + auto b0 = LoadU(src1); + auto b1 = LoadU(src1 + vstep); + auto r0 = hn::@intrin@(a0, b0); + auto r1 = hn::@intrin@(a1, b1); + StoreU(r0, dst); + StoreU(r1, dst + vstep); + } + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + auto a = hn::LoadN(_Tag(), src0, len*2); + auto b = hn::LoadN(_Tag(), src1, len*2); + auto r = hn::@intrin@(a, b); + hn::StoreN(r, _Tag(), dst, len*2); + } + } + // scalar 0 + else if (ssrc0 == 0) { + auto a = hn::OddEven(Set(src0[1]), Set(src0[0])); + // contig + if (ssrc1 == 2 && sdst == ssrc1) { + for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) { + auto b0 = LoadU(src1); + auto b1 = LoadU(src1 + vstep); + auto r0 = hn::@intrin@(a, b0); + auto r1 = hn::@intrin@(a, b1); + StoreU(r0, dst); + StoreU(r1, dst + vstep); + } + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_mul@ + auto b = hn::LoadNOr(Set(1.0@c@), _Tag(), src1, len*2); + #else + auto b = hn::LoadN(_Tag(), src1, len*2); + #endif + auto r = hn::@intrin@(a, b); + hn::StoreN(r, _Tag(), dst, len*2); + } + } + // non-contig + else if (static_cast(ssrc1) >= 0 && static_cast(sdst) >= 0) { + auto i0 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(ssrc1)); + auto i1 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(sdst)); + i0 = hn::OddEven(Add(i0, Set(1)), i0); + i1 = hn::OddEven(Add(i1, Set(1)), i1); + for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) { + auto b0 = hn::GatherIndex(_Tag(), src1, i0); + auto b1 = hn::GatherIndex(_Tag(), src1 + ssrc1*hstep, i0); + auto r0 = hn::@intrin@(a, b0); + auto r1 = hn::@intrin@(a, b1); + hn::ScatterIndex(r0, _Tag(), dst, i1); + hn::ScatterIndex(r1, _Tag(), dst + sdst*hstep, i1); + } + for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) { + #if @is_mul@ + auto b = hn::MaskedGatherIndexOr(Set(1.0@c@), hn::FirstN(_Tag(), len*2), _Tag(), src1, i0); + #else + auto b = hn::GatherIndexN(_Tag(), src1, i0, len*2); + #endif + auto r = hn::@intrin@(a, b); + hn::ScatterIndexN(r, _Tag(), dst, i1, len*2); + } + } + else { + goto loop_scalar; + } + } + // scalar 1 + else if (ssrc1 == 0) { + auto b = hn::OddEven(Set(src1[1]), Set(src1[0])); + if (ssrc0 == 2 && sdst == ssrc0) { + for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) { + auto a0 = LoadU(src0); + auto a1 = LoadU(src0 + vstep); + auto r0 = hn::@intrin@(a0, b); + auto r1 = hn::@intrin@(a1, b); + StoreU(r0, dst); + StoreU(r1, dst + vstep); + } + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_mul@ + auto a = hn::LoadNOr(Set(1.0@c@), _Tag(), src0, len*2); + #else + auto a = hn::LoadN(_Tag(), src0, len*2); + #endif + auto r = hn::@intrin@(a, b); + 
hn::StoreN(r, _Tag(), dst, len*2); + } + } + // non-contig + else if (static_cast(ssrc0) >= 0 && static_cast(sdst) >= 0) { + auto i0 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(ssrc0)); + auto i1 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(sdst)); + i0 = hn::OddEven(Add(i0, Set(1)), i0); + i1 = hn::OddEven(Add(i1, Set(1)), i1); + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) { + auto a0 = hn::GatherIndex(_Tag(), src0, i0); + auto a1 = hn::GatherIndex(_Tag(), src0 + ssrc0*hstep, i0); + auto r0 = hn::@intrin@(a0, b); + auto r1 = hn::@intrin@(a1, b); + hn::ScatterIndex(r0, _Tag(), dst, i1); + hn::ScatterIndex(r1, _Tag(), dst + sdst*hstep, i1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) { + #if @is_mul@ + auto a = hn::MaskedGatherIndexOr(Set(1.0@c@), hn::FirstN(_Tag(), len*2), _Tag(), src0, i0); + #else + auto a = hn::GatherIndexN(_Tag(), src0, i0, len*2); + #endif + auto r = hn::@intrin@(a, b); + hn::ScatterIndexN(r, _Tag(), dst, i1, len*2); + } + } + else { + goto loop_scalar; + } + } + #if @is_mul@ + // non-contig + else if (static_cast(ssrc0) >= 0 && static_cast(ssrc1) >= 0 && static_cast(sdst) >= 0) { + auto i0 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(ssrc0)); + auto i1 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(ssrc1)); + auto i2 = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(sdst)); + i0 = hn::OddEven(Add(i0, Set(1)), i0); + i1 = hn::OddEven(Add(i1, Set(1)), i1); + i2 = hn::OddEven(Add(i2, Set(1)), i2); + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, + src1 += ssrc1*vstep, dst += sdst*vstep + ) { + auto a0 = hn::GatherIndex(_Tag(), src0, i0); + auto a1 = hn::GatherIndex(_Tag(), src0 + ssrc0*hstep, i0); + auto b0 = hn::GatherIndex(_Tag(), src1, i1); + auto b1 = hn::GatherIndex(_Tag(), src1 + ssrc1*hstep, i1); + auto r0 = hn::@intrin@(a0, b0); + auto r1 = hn::@intrin@(a1, b1); + hn::ScatterIndex(r0, _Tag(), dst, i2); + hn::ScatterIndex(r1, _Tag(), dst + sdst*hstep, i2); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, + src1 += ssrc1*hstep, dst += sdst*hstep + ) { + #if @is_mul@ + auto a = hn::MaskedGatherIndexOr(Set(1.0@c@), hn::FirstN(_Tag(), len*2), _Tag(), src0, i0); + auto b = hn::MaskedGatherIndexOr(Set(1.0@c@), hn::FirstN(_Tag(), len*2), _Tag(), src1, i1); + #else + auto a = hn::GatherIndexN(_Tag(), src0, i0, len*2); + auto b = hn::GatherIndexN(_Tag(), src1, i1, len*2); + #endif + auto r = hn::@intrin@(a, b); + hn::ScatterIndexN(r, _Tag(), dst, i2, len*2); + } + } + #else /* @is_mul@ */ + else { + // Only multiply is vectorized for the generic non-contig case. 
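+            // (as in the previous universal-intrinsics kernel, strided add/subtract
+            // with two non-contiguous inputs stays scalar)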
+ goto loop_scalar; + } + #endif /* @is_mul@ */ + + return; + } + +loop_scalar: +#endif + for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) { + const T a_r = ((T *)b_src0)[0]; + const T a_i = ((T *)b_src0)[1]; + const T b_r = ((T *)b_src1)[0]; + const T b_i = ((T *)b_src1)[1]; + #if @is_mul@ + ((T *)b_dst)[0] = a_r*b_r - a_i*b_i; + ((T *)b_dst)[1] = a_r*b_i + a_i*b_r; + #else + ((T *)b_dst)[0] = a_r @OP@ b_r; + ((T *)b_dst)[1] = a_i @OP@ b_i; + #endif + } +} + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + using T = @ftype@; + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + npy_intp i; + T *indexed; + for(i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + indexed = (T *)(ip1 + is1 * indx); + const T b_r = ((T *)value)[0]; + const T b_i = ((T *)value)[1]; + #if @is_mul@ + const T a_r = indexed[0]; + const T a_i = indexed[1]; + indexed[0] = a_r*b_r - a_i*b_i; + indexed[1] = a_r*b_i + a_i*b_r; + #else + indexed[0] @OP@= b_r; + indexed[1] @OP@= b_i; + #endif + } + return 0; +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = conjugate, square# + * #is_square = 0, 1# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + using T = @ftype@; + using D = std::conditional_t; + npy_intp len = dimensions[0]; + char *b_src = args[0], *b_dst = args[1]; + npy_intp b_ssrc = steps[0], b_sdst = steps[1]; +#if @VECTOR@ + if (!is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) && sizeof(T) == alignof(T) && + b_ssrc % sizeof(T) == 0 && b_sdst % sizeof(T) == 0) { + const T *src = (T*)b_src; + T *dst = (T*)b_dst; + const npy_intp ssrc = b_ssrc / sizeof(T); + const npy_intp sdst = b_sdst / sizeof(T); + + const int vstep = Lanes(); + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + if (ssrc == 2 && ssrc == sdst) { + for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) { + auto a0 = LoadU(src); + auto a1 = LoadU(src + vstep); + #if @is_square@ + auto r0 = hn::MulComplex(a0, a0); + auto r1 = hn::MulComplex(a1, a1); + #else + auto r0 = hn::ComplexConj(a0); + auto r1 = hn::ComplexConj(a1); + #endif + StoreU(r0, dst); + StoreU(r1, dst + vstep); + } + for (; len > 0; len -= hstep, src += vstep, dst += vstep) { + auto a = hn::LoadN(_Tag(), src, len*2); + #if @is_square@ + auto r = hn::MulComplex(a, a); + #else + auto r = hn::ComplexConj(a); + #endif + hn::StoreN(r, _Tag(), dst, len*2); + } + } + else if (ssrc == 2 && static_cast(sdst) >= 0) { + auto i = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(sdst)); + i = hn::OddEven(Add(i, Set(1)), i); + for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) { + auto a0 = LoadU(src); + auto a1 = LoadU(src + vstep); + #if @is_square@ + auto r0 = hn::MulComplex(a0, a0); + auto r1 = hn::MulComplex(a1, a1); + #else + auto r0 = hn::ComplexConj(a0); + auto r1 = hn::ComplexConj(a1); + #endif + hn::ScatterIndex(r0, _Tag(), dst, i); + hn::ScatterIndex(r1, _Tag(), dst + sdst*hstep, i); + } + for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) { + auto a = hn::LoadN(_Tag(), src, len*2); + #if @is_square@ + auto r = 
hn::MulComplex(a, a); + #else + auto r = hn::ComplexConj(a); + #endif + hn::ScatterIndexN(r, _Tag(), dst, i, len*2); + } + } + else if (sdst == 2 && static_cast(ssrc) >= 0) { + auto i = Mul(hn::ShiftRight<1>(hn::Iota(_Tag(), 0)), Set(ssrc)); + i = hn::OddEven(Add(i, Set(1)), i); + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) { + auto a0 = hn::GatherIndex(_Tag(), src, i); + auto a1 = hn::GatherIndex(_Tag(), src + ssrc*hstep, i); + #if @is_square@ + auto r0 = hn::MulComplex(a0, a0); + auto r1 = hn::MulComplex(a1, a1); + #else + auto r0 = hn::ComplexConj(a0); + auto r1 = hn::ComplexConj(a1); + #endif + StoreU(r0, dst); + StoreU(r1, dst + vstep); + } + for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) { + auto a = hn::GatherIndexN(_Tag(), src, i, len*2); + #if @is_square@ + auto r = hn::MulComplex(a, a); + #else + auto r = hn::ComplexConj(a); + #endif + hn::StoreN(r, _Tag(), dst, len*2); + } + } + else { + goto loop_scalar; + } + return; + } + +loop_scalar: +#endif + for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) { + const T rl = ((T *)b_src)[0]; + const T im = ((T *)b_src)[1]; + #if @is_square@ + ((T *)b_dst)[0] = rl*rl - im*im; + ((T *)b_dst)[1] = rl*im + im*rl; + #else + ((T *)b_dst)[0] = rl; + ((T *)b_dst)[1] = -im; + #endif + } +} +/**end repeat1**/ +/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src index 8c66229942ee..93d288fbdb2e 100755 --- a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src +++ b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src @@ -385,7 +385,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ vec_f64 b, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16; if constexpr(hn::MaxLanes(f64) == 2){ vec_f64 e0e1_0, e0e1_1; - uint64_t index[hn::Lanes(f64)]; + uint64_t index[hn::MaxLanes(f64)]; hn::StoreU(idx, u64, index); /**begin repeat diff --git a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp index ae696db4cd4a..d298a8596cc4 100644 --- a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp +++ b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp @@ -3,7 +3,9 @@ #include "loops_utils.h" #include "simd/simd.h" +#include "simd/simd.hpp" #include + namespace hn = hwy::HWY_NAMESPACE; /* @@ -184,7 +186,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, "larger than 256 bits."); simd_maski = ((uint8_t *)&simd_maski)[0]; #endif - float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[hn::Lanes(f32)]; + float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[hn::MaxLanes(f32)]; hn::Store(x_in, f32, ip_fback); // process elements using libc for large elements
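
The last two hunks replace `hn::Lanes(f32/f64)` with `hn::MaxLanes(...)` as array extents:
`hn::Lanes(d)` is the runtime lane count and is not a constant expression on scalable
targets such as SVE or RVV, so it cannot size a C array there, while `hn::MaxLanes(d)` is
a compile-time upper bound and can. A minimal standalone sketch of the pattern (static
dispatch only, hypothetical function name; not part of the patch):

```cpp
#include <cstddef>
#include <hwy/highway.h>

namespace hn = hwy::HWY_NAMESPACE;

void copy_one_vector(const float *src, float *dst)
{
    const hn::ScalableTag<float> f32;
    // Compile-time upper bound: valid as an array extent on every target.
    float buf[hn::MaxLanes(f32)];
    // Runtime lane count: use it for trip counts, not array bounds.
    const size_t n = hn::Lanes(f32);
    hn::StoreU(hn::LoadU(f32, src), f32, buf);
    for (size_t i = 0; i < n; ++i) {
        dst[i] = buf[i];
    }
}
```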