diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 3bffed752474..a84080c57296 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -1031,13 +1031,14 @@ foreach gen_mtargets : [ ], [ 'loops_unary_complex.dispatch.h', - src_file.process('src/umath/loops_unary_complex.dispatch.c.src'), + 'src/umath/loops_unary_complex.dispatch.cpp', [ AVX512F, [AVX2, FMA3], SSE2, ASIMD, NEON, VSX3, VSX2, VXE, VX, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/common/simd/README.md b/numpy/_core/src/common/simd/README.md new file mode 100644 index 000000000000..9a68d1aa1bfc --- /dev/null +++ b/numpy/_core/src/common/simd/README.md @@ -0,0 +1,263 @@ +# NumPy SIMD Wrapper for Highway + +This directory contains a lightweight C++ wrapper over Google's [Highway](https://github.com/google/highway) SIMD library, designed specifically for NumPy's needs. + +> **Note**: This directory also contains the C interface of universal intrinsics (under `simd.h`) which is no longer supported. The Highway wrapper described in this document should be used instead for all new SIMD code. + +## Overview + +The wrapper simplifies Highway's SIMD interface by eliminating class tags and using lane types directly, which can be deduced from arguments in most cases. This design makes the SIMD code more intuitive and easier to maintain while still leveraging Highway generic intrinsics. + +## Architecture + +The wrapper consists of two main headers: + +1. `simd.hpp`: The main header that defines namespaces and includes configuration macros +2. `simd.inc.hpp`: Implementation details included by `simd.hpp` multiple times for different namespaces + +Additionally, this directory contains legacy C interface files for universal intrinsics (`simd.h` and related files) which are deprecated and should not be used for new code. All new SIMD code should use the Highway wrapper. 
+ + +## Usage + +### Basic Usage + +```cpp +#include "simd/simd.hpp" + +// Use np::simd for maximum width SIMD operations +using namespace np::simd; +float *data = /* ... */; +Vec v = LoadU(data); +v = Add(v, v); +StoreU(v, data); + +// Use np::simd128 for fixed 128-bit SIMD operations +using namespace np::simd128; +Vec v128 = LoadU(data); +v128 = Add(v128, v128); +StoreU(v128, data); +``` + +### Checking for SIMD Support + +```cpp +#include "simd/simd.hpp" + +// Check if SIMD is enabled +#if NPY_SIMDX + // SIMD code +#else + // Scalar fallback code +#endif + +// Check for float64 support +#if NPY_SIMDX_F64 + // Use float64 SIMD operations +#endif + +// Check for FMA support +#if NPY_SIMDX_FMA + // Use FMA operations +#endif +``` + +## Type Support and Constraints + +The wrapper provides type constraints to help with SFINAE (Substitution Failure Is Not An Error) and compile-time type checking: + +- `kSupportLane`: Determines whether the specified lane type is supported by the SIMD extension. + ```cpp + // Base template - always defined, even when SIMD is not enabled (for SFINAE) + template + constexpr bool kSupportLane = NPY_SIMDX != 0; + template <> + constexpr bool kSupportLane = NPY_SIMDX_F64 != 0; + ``` + +- `kMaxLanes`: Maximum number of lanes supported by the SIMD extension for the specified lane type. + ```cpp + template + constexpr size_t kMaxLanes = HWY_MAX_LANES_D(_Tag); + ``` + +```cpp +#include "simd/simd.hpp" + +// Check if float64 operations are supported +if constexpr (np::simd::kSupportLane) { + // Use float64 operations +} +``` + +These constraints allow for compile-time checking of which lane types are supported, which can be used in SFINAE contexts to enable or disable functions based on type support. 
+ +## Available Operations + +The wrapper provides the following common operations that are used in NumPy: + +- Vector creation operations: + - `Zero`: Returns a vector with all lanes set to zero + - `Set`: Returns a vector with all lanes set to the given value + - `Undefined`: Returns an uninitialized vector + +- Memory operations: + - `LoadU`: Unaligned load of a vector from memory + - `StoreU`: Unaligned store of a vector to memory + +- Vector information: + - `Lanes`: Returns the number of vector lanes based on the lane type + +- Type conversion: + - `BitCast`: Reinterprets a vector to a different type without modifying the underlying data + - `VecFromMask`: Converts a mask to a vector + +- Comparison operations: + - `Eq`: Element-wise equality comparison + - `Le`: Element-wise less than or equal comparison + - `Lt`: Element-wise less than comparison + - `Gt`: Element-wise greater than comparison + - `Ge`: Element-wise greater than or equal comparison + +- Arithmetic operations: + - `Add`: Element-wise addition + - `Sub`: Element-wise subtraction + - `Mul`: Element-wise multiplication + - `Div`: Element-wise division + - `Min`: Element-wise minimum + - `Max`: Element-wise maximum + - `Abs`: Element-wise absolute value + - `Sqrt`: Element-wise square root + +- Logical operations: + - `And`: Bitwise AND + - `Or`: Bitwise OR + - `Xor`: Bitwise XOR + - `AndNot`: Bitwise AND NOT (a & ~b) + +Additional Highway operations can be accessed via the `hn` namespace alias inside the `simd` or `simd128` namespaces. + +## Extending + +To add more operations from Highway: + +1. Import them in the `simd.inc.hpp` file using the `using` directive if they don't require a tag: + ```cpp + // For operations that don't require a tag + using hn::FunctionName; + ``` + +2. Define wrapper functions for intrinsics that require a class tag: + ```cpp + // For operations that require a tag + template + HWY_API ReturnType FunctionName(Args... 
args) { + return hn::FunctionName(_Tag(), args...); + } + ``` + +3. Add appropriate documentation and SFINAE constraints if needed + + +## Build Configuration + +The SIMD wrapper automatically disables SIMD operations when optimizations are disabled: + +- When `NPY_DISABLE_OPTIMIZATION` is defined, SIMD operations are disabled +- SIMD is enabled only when the Highway target is not scalar (`HWY_TARGET != HWY_SCALAR`) + +## Design Notes + +1. **Why avoid Highway scalar operations?** + - NumPy already provides kernels for scalar operations + - Compilers can better optimize standard library implementations + - Not all Highway intrinsics are fully supported in scalar mode + +2. **Legacy Universal Intrinsics** + - The older universal intrinsics C interface (in `simd.h` and accessible via `NPY_SIMD` macros) is deprecated + - All new SIMD code should use this Highway-based wrapper (accessible via `NPY_SIMDX` macros) + - The legacy code is maintained for compatibility but will eventually be removed + +3. **Feature Detection Constants vs. Highway Constants** + - NumPy-specific constants (`NPY_SIMDX_F16`, `NPY_SIMDX_F64`, `NPY_SIMDX_FMA`) provide additional safety beyond raw Highway constants + - Highway constants (e.g., `HWY_HAVE_FLOAT16`) only check platform capabilities but don't consider NumPy's build configuration + - Our constants combine both checks: + ```cpp + #define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) + ``` + - This ensures SIMD features won't be used when: + - Platform supports it but NumPy optimization is disabled via meson option: + ``` + option('disable-optimization', type: 'boolean', value: false, + description: 'Disable CPU optimized code (dispatch,simd,unroll...)') + ``` + - Highway target is scalar (`HWY_TARGET == HWY_SCALAR`) + - Using these constants ensures consistent behavior across different compilation settings + - Without this additional layer, code might incorrectly try to use SIMD paths in scalar mode + +4. 
**Namespace Design** + - `np::simd`: Maximum width SIMD operations (scalable) + - `np::simd128`: Fixed 128-bit SIMD operations + - `hn`: Highway namespace alias (available within the SIMD namespaces) + +5. **Why Namespaces and Why Not Just Use Highway Directly?** + - Highway's design uses class tag types as template parameters (e.g., `Vec>`) when defining vector types + - Many Highway functions require explicitly passing a tag instance as the first parameter + - This class tag-based approach increases verbosity and complexity in user code + - Our wrapper eliminates this by internally managing tags through namespaces, letting users directly use types e.g. `Vec` + - Simple example with raw Highway: + ```cpp + // Highway's approach + float *data = /* ... */; + + namespace hn = hwy::HWY_NAMESPACE; + using namespace hn; + + // Full-width operations + ScalableTag df; // Create a tag instance + Vec v = LoadU(df, data); // LoadU requires a tag instance + StoreU(v, df, data); // StoreU requires a tag instance + + // 128-bit operations + Full128 df128; // Create a 128-bit tag instance + Vec v128 = LoadU(df128, data); // LoadU requires a tag instance + StoreU(v128, df128, data); // StoreU requires a tag instance + ``` + + - Simple example with our wrapper: + ```cpp + // Our wrapper approach + float *data = /* ... */; + + // Full-width operations + using namespace np::simd; + Vec v = LoadU(data); // Full-width vector load + StoreU(v, data); + + // 128-bit operations + using namespace np::simd128; + Vec v128 = LoadU(data); // 128-bit vector load + StoreU(v128, data); + ``` + + - The namespaced approach simplifies code, reduces errors, and provides a more intuitive interface + - It preserves all Highway operations benefits while reducing cognitive overhead + +5. 
**Why Namespaces Are Essential for This Design?** + - Namespaces allow us to define different internal tag types (`hn::ScalableTag` in `np::simd` vs `hn::Full128` in `np::simd128`) + - This provides a consistent type-based interface (`Vec`) without requiring users to manually create tags + - Enables using the same function names (like `LoadU`) with different implementations based on SIMD width + - Without namespaces, we'd have to either reintroduce tags (defeating the purpose of the wrapper) or create different function names for each variant (e.g., `LoadU` vs `LoadU128`) + +6. **Template Type Parameters** + - `TLane`: The scalar type for each vector lane (e.g., uint8_t, float, double) + + +## Requirements + +- C++17 or later +- Google Highway library + +## License + +Same as NumPy's license diff --git a/numpy/_core/src/common/simd/simd.hpp b/numpy/_core/src/common/simd/simd.hpp new file mode 100644 index 000000000000..698da4adf865 --- /dev/null +++ b/numpy/_core/src/common/simd/simd.hpp @@ -0,0 +1,80 @@ +#ifndef NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ +#define NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ + +/** + * This header provides a thin wrapper over Google's Highway SIMD library. + * + * The wrapper aims to simplify the SIMD interface of Google's Highway by + * getting rid of its class tags and using lane types directly, which can be deduced + * from the args in most cases. + */ +/** + * Since `NPY_SIMD` is only limited to NumPy C universal intrinsics, + * `NPY_SIMDX` is defined to indicate the SIMD availability for Google's Highway + * C++ code. + * + * Highway SIMD is only available when optimization is enabled. + * When NPY_DISABLE_OPTIMIZATION is defined, SIMD operations are disabled + * and the code falls back to scalar implementations. + */ +#ifndef NPY_DISABLE_OPTIMIZATION +#include <hwy/highway.h> + +/** + * We avoid using Highway scalar operations for the following reasons: + * 1.
We already provide kernels for scalar operations, so falling back to + * the NumPy implementation is more appropriate. Compilers can often + * optimize these better since they rely on standard libraries. + * 2. Not all Highway intrinsics are fully supported in scalar mode. + * + * Therefore, we only enable SIMD when the Highway target is not scalar. + */ +#define NPY_SIMDX (HWY_TARGET != HWY_SCALAR) + +// Indicates if the SIMD operations are available for float16. +#define NPY_SIMDX_F16 (NPY_SIMDX && HWY_HAVE_FLOAT16) +// Note: Highway requires SIMD extensions with native float32 support, so we don't need +// to check for it. + +// Indicates if the SIMD operations are available for float64. +#define NPY_SIMDX_F64 (NPY_SIMDX && HWY_HAVE_FLOAT64) + +// Indicates if the SIMD floating-point operations natively support FMA. +#define NPY_SIMDX_FMA (NPY_SIMDX && HWY_NATIVE_FMA) + +#else +#define NPY_SIMDX 0 +#define NPY_SIMDX_F16 0 +#define NPY_SIMDX_F64 0 +#define NPY_SIMDX_FMA 0 +#endif + +namespace np { + +/// Represents the max SIMD width supported by the platform. +namespace simd { +#if NPY_SIMDX +/// The highway namespace alias. +/// We cannot import all the symbols from the HWY_NAMESPACE because it will +/// conflict with the existing symbols in the numpy namespace. +namespace hn = hwy::HWY_NAMESPACE; +// internally used by the template header +template <typename TLane> +using _Tag = hn::ScalableTag<TLane>; +#endif +#include "simd.inc.hpp" +} // namespace simd + +/// Represents the 128-bit SIMD width.
+namespace simd128 { +#if NPY_SIMDX +namespace hn = hwy::HWY_NAMESPACE; +template +using _Tag = hn::Full128; +#endif +#include "simd.inc.hpp" +} // namespace simd128 + +} // namespace np + +#endif // NUMPY__CORE_SRC_COMMON_SIMD_SIMD_HPP_ diff --git a/numpy/_core/src/common/simd/simd.inc.hpp b/numpy/_core/src/common/simd/simd.inc.hpp new file mode 100644 index 000000000000..64d28bc47118 --- /dev/null +++ b/numpy/_core/src/common/simd/simd.inc.hpp @@ -0,0 +1,132 @@ +#ifndef NPY_SIMDX +#error "This is not a standalone header. Include simd.hpp instead." +#define NPY_SIMDX 1 // Prevent editors from graying out the happy branch +#endif + +// Using anonymous namespace instead of inline to ensure each translation unit +// gets its own copy of constants based on local compilation flags +namespace { + +// NOTE: This file is included by simd.hpp multiple times with different namespaces +// so avoid including any headers here + +/** + * Determines whether the specified lane type is supported by the SIMD extension. + * Always defined as false when SIMD is not enabled, so it can be used in SFINAE. + * + * @tparam TLane The lane type to check for support. + */ +template +constexpr bool kSupportLane = NPY_SIMDX != 0; + +#if NPY_SIMDX +// Define lane type support based on Highway capabilities +template <> +constexpr bool kSupportLane = HWY_HAVE_FLOAT16 != 0; +template <> +constexpr bool kSupportLane = HWY_HAVE_FLOAT64 != 0; +template <> +constexpr bool kSupportLane = + HWY_HAVE_FLOAT64 != 0 && sizeof(long double) == sizeof(double); + +/// Maximum number of lanes supported by the SIMD extension for the specified lane type. +template +constexpr size_t kMaxLanes = HWY_MAX_LANES_D(_Tag); + +/// Represents an N-lane vector based on the specified lane type. +/// @tparam TLane The scalar type for each vector lane +template +using Vec = hn::Vec<_Tag>; + +/// Represents a mask vector with boolean values or as a bitmask. 
+/// @tparam TLane The scalar type the mask corresponds to +template +using Mask = hn::Mask<_Tag>; + +/// Unaligned load of a vector from memory. +template +HWY_API Vec +LoadU(const TLane *ptr) +{ + return hn::LoadU(_Tag(), ptr); +} + +/// Unaligned store of a vector to memory. +template +HWY_API void +StoreU(const Vec &a, TLane *ptr) +{ + hn::StoreU(a, _Tag(), ptr); +} + +/// Returns the number of vector lanes based on the lane type. +template +HWY_API HWY_LANES_CONSTEXPR size_t +Lanes(TLane tag = 0) +{ + return hn::Lanes(_Tag()); +} + +/// Returns an uninitialized N-lane vector. +template +HWY_API Vec +Undefined(TLane tag = 0) +{ + return hn::Undefined(_Tag()); +} + +/// Returns N-lane vector with all lanes equal to zero. +template +HWY_API Vec +Zero(TLane tag = 0) +{ + return hn::Zero(_Tag()); +} + +/// Returns N-lane vector with all lanes equal to the given value of type `TLane`. +template +HWY_API Vec +Set(TLane val) +{ + return hn::Set(_Tag(), val); +} + +/// Converts a mask to a vector based on the specified lane type. +template +HWY_API Vec +VecFromMask(const TMask &m) +{ + return hn::VecFromMask(_Tag(), m); +} + +/// Convert (Reinterpret) an N-lane vector to a different type without modifying the +/// underlying data. 
+template +HWY_API Vec +BitCast(const TVec &v) +{ + return hn::BitCast(_Tag(), v); +} + +// Import common Highway intrinsics +using hn::Abs; +using hn::Add; +using hn::And; +using hn::AndNot; +using hn::Div; +using hn::Eq; +using hn::Ge; +using hn::Gt; +using hn::Le; +using hn::Lt; +using hn::Max; +using hn::Min; +using hn::Mul; +using hn::Or; +using hn::Sqrt; +using hn::Sub; +using hn::Xor; + +#endif // NPY_SIMDX + +} // namespace anonymous diff --git a/numpy/_core/src/highway b/numpy/_core/src/highway index 0b696633f9ad..12b325bc1793 160000 --- a/numpy/_core/src/highway +++ b/numpy/_core/src/highway @@ -1 +1 @@ -Subproject commit 0b696633f9ad89497dd5532b55eaa01625ad71ca +Subproject commit 12b325bc1793dee68ab2157995a690db859fe9e0 diff --git a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src index 8c66229942ee..93d288fbdb2e 100755 --- a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src +++ b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src @@ -385,7 +385,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ vec_f64 b, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16; if constexpr(hn::MaxLanes(f64) == 2){ vec_f64 e0e1_0, e0e1_1; - uint64_t index[hn::Lanes(f64)]; + uint64_t index[hn::MaxLanes(f64)]; hn::StoreU(idx, u64, index); /**begin repeat diff --git a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp index ae696db4cd4a..d298a8596cc4 100644 --- a/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp +++ b/numpy/_core/src/umath/loops_trigonometric.dispatch.cpp @@ -3,7 +3,9 @@ #include "loops_utils.h" #include "simd/simd.h" +#include "simd/simd.hpp" #include + namespace hn = hwy::HWY_NAMESPACE; /* @@ -184,7 +186,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, "larger than 256 bits."); simd_maski = ((uint8_t *)&simd_maski)[0]; #endif - 
float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[hn::Lanes(f32)]; + float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[hn::MaxLanes(f32)]; hn::Store(x_in, f32, ip_fback); // process elements using libc for large elements diff --git a/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src deleted file mode 100644 index 4b4457e6aada..000000000000 --- a/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src +++ /dev/null @@ -1,132 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -/**begin repeat - * #type = npy_float, npy_double# - * #sfx = f32, f64# - * #bsfx = b32, b64# - * #usfx = b32, u64# - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# - * #is_double = 0, 1# - * #c = f, # - * #INF = NPY_INFINITYF, NPY_INFINITY# - * #NAN = NPY_NANF, NPY_NAN# - */ -#if @VECTOR@ -NPY_FINLINE npyv_@sfx@ -simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) -{ - const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); - const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); - - re = npyv_abs_@sfx@(re); - im = npyv_abs_@sfx@(im); - /* - * If real or imag = INF, then convert it to inf + j*inf - * Handles: inf + j*nan, nan + j*inf - */ - npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); - npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); - im = npyv_select_@sfx@(re_infmask, inf, im); - re = npyv_select_@sfx@(im_infmask, inf, re); - /* - * If real or imag = NAN, then convert it to nan + j*nan - * Handles: x + j*nan, nan + j*x - */ - npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); - npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); - im = npyv_select_@sfx@(re_nnanmask, im, nan); - re = npyv_select_@sfx@(im_nnanmask, re, nan); - - npyv_@sfx@ larger = npyv_max_@sfx@(re, im); - npyv_@sfx@ smaller = npyv_min_@sfx@(im, 
re); - /* - * Calculate div_mask to prevent 0./0. and inf/inf operations in div - */ - npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); - npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); - npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); - - npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); - npyv_@sfx@ hypot = npyv_sqrt_@sfx@( - npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) - )); - return npyv_mul_@sfx@(hypot, larger); -} -#endif // VECTOR -/**end repeat**/ - -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * complex types - * #TYPE = CFLOAT, CDOUBLE# - * #ftype = npy_float, npy_double# - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# - * #sfx = f32, f64# - * #c = f, # - * #C = F, # - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if @VECTOR@ - npy_intp len = dimensions[0]; - - if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) && - npyv_loadable_stride_@sfx@(steps[0]) && - npyv_storable_stride_@sfx@(steps[1]) - ) { - npy_intp ssrc = steps[0] / sizeof(@ftype@); - npy_intp sdst = steps[1] / sizeof(@ftype@); - - const @ftype@ *src = (@ftype@*)args[0]; - @ftype@ *dst = (@ftype@*)args[1]; - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep * 2; - const int hstep = vstep / 2; - - if (ssrc == 2 && sdst == 1) { - for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) { - npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src); - npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]); - npyv_store_@sfx@(dst, r); - } - } - else { - for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { - npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc); - npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + 
ssrc*hstep, ssrc); - npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1); - npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]); - npyv_storen_@sfx@(dst, sdst, r); - } - } - for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { - npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len); - npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len); - npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im); - npyv_storen_till_@sfx@(dst, sdst, len, r); - } - npyv_cleanup(); - npy_clear_floatstatus_barrier((char*)&len); - return; - } -#endif - UNARY_LOOP { - const @ftype@ re = ((@ftype@ *)ip1)[0]; - const @ftype@ im = ((@ftype@ *)ip1)[1]; - *((@ftype@ *)op1) = npy_hypot@c@(re, im); - } -} -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_unary_complex.dispatch.cpp b/numpy/_core/src/umath/loops_unary_complex.dispatch.cpp new file mode 100644 index 000000000000..27b9a32c0b92 --- /dev/null +++ b/numpy/_core/src/umath/loops_unary_complex.dispatch.cpp @@ -0,0 +1,159 @@ +#include "loops_utils.h" +#include "loops.h" + +#include +#include "simd/simd.hpp" + +namespace { +using namespace np::simd; + +template struct OpCabs { +#if NPY_SIMDX + template >> + HWY_INLINE HWY_ATTR auto operator()(const V& a, const V& b) const { + V inf, nan; + if constexpr (std::is_same_v) { + inf = Set(NPY_INFINITYF); + nan = Set(NPY_NANF); + } + else { + inf = Set(NPY_INFINITY); + nan = Set(NPY_NAN); + } + auto re = hn::Abs(a), im = hn::Abs(b); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + auto re_infmask = hn::IsInf(re), im_infmask = hn::IsInf(im); + im = hn::IfThenElse(re_infmask, inf, im); + re = hn::IfThenElse(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + auto re_nanmask = hn::IsNaN(re), im_nanmask = hn::IsNaN(im); + im = hn::IfThenElse(re_nanmask, nan, im); + re = hn::IfThenElse(im_nanmask, nan, re); + + auto larger = hn::Max(re, im), 
smaller = hn::Min(im, re); + /* + * Calculate div_mask to prevent 0./0. and inf/inf operations in div + */ + auto zeromask = hn::Eq(larger, Set(static_cast(0))); + auto infmask = hn::IsInf(smaller); + auto div_mask = hn::ExclusiveNeither(zeromask, infmask); + + auto ratio = hn::MaskedDiv(div_mask, smaller, larger); + auto hypot = hn::Sqrt(hn::MulAdd(ratio, ratio, Set(static_cast(1)))); + return hn::Mul(hypot, larger); + } +#endif + + NPY_INLINE T operator()(T a, T b) const { + if constexpr (std::is_same_v) { + return npy_hypotf(a, b); + } else { + return npy_hypot(a, b); + } + } +}; + +#if NPY_SIMDX +template +HWY_INLINE HWY_ATTR auto LoadWithStride(const T* src, npy_intp ssrc, size_t n = Lanes(), T val = 0) { + HWY_LANES_CONSTEXPR size_t lanes = Lanes(); + std::vector temp(lanes, val); + for (size_t ii = 0; ii < lanes && ii < n; ++ii) { + temp[ii] = src[ii * ssrc]; + } + return LoadU(temp.data()); +} + +template +HWY_INLINE HWY_ATTR void StoreWithStride(Vec vec, T* dst, npy_intp sdst, size_t n = Lanes()) { + HWY_LANES_CONSTEXPR size_t lanes = Lanes(); + std::vector temp(lanes); + StoreU(vec, temp.data()); + for (size_t ii = 0; ii < lanes && ii < n; ++ii) { + dst[ii * sdst] = temp[ii]; + } +} +#endif // NPY_SIMDX + +template +HWY_INLINE HWY_ATTR void +unary_complex(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + const OpCabs op_func; + const char *src = args[0]; char *dst = args[1]; + const npy_intp src_step = steps[0]; + const npy_intp dst_step = steps[1]; + npy_intp len = dimensions[0]; + +#if NPY_SIMDX + if constexpr (kSupportLane) { + if (!is_mem_overlap(src, src_step, dst, dst_step, len) && alignof(T) == sizeof(T) && + src_step % sizeof(T) == 0 && dst_step % sizeof(T) == 0) { + const int lsize = sizeof(T); + const npy_intp ssrc = src_step / lsize; + const npy_intp sdst = dst_step / lsize; + + const int vstep = Lanes(); + const int wstep = vstep * 2; + + const T* src_T = reinterpret_cast(src); + T* dst_T = reinterpret_cast(dst); + + if 
(ssrc == 2 && sdst == 1) { + for (; len >= vstep; len -= vstep, src_T += wstep, dst_T += vstep) { + Vec re, im; + hn::LoadInterleaved2(_Tag(), src_T, re, im); + auto r = op_func(re, im); + StoreU(r, dst_T); + } + } + else { + for (; len >= vstep; len -= vstep, src_T += ssrc*vstep, dst_T += sdst*vstep) { + auto re = LoadWithStride(src_T, ssrc); + auto im = LoadWithStride(src_T + 1, ssrc); + auto r = op_func(re, im); + StoreWithStride(r, dst_T, sdst); + } + } + if (len > 0) { + auto re = LoadWithStride(src_T, ssrc, len); + auto im = LoadWithStride(src_T + 1, ssrc, len); + auto r = op_func(re, im); + StoreWithStride(r, dst_T, sdst, len); + } + // clear the float status flags + npy_clear_floatstatus_barrier((char*)&len); + return; + } + } +#endif + + // fallback to scalar implementation + for (; len > 0; --len, src += src_step, dst += dst_step) { + const T src0 = *reinterpret_cast(src); + const T src1 = *(reinterpret_cast(src) + 1); + *reinterpret_cast(dst) = op_func(src0, src1); + } +} + +} // anonymous namespace + +/******************************************************************************* + ** Defining ufunc inner functions + *******************************************************************************/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + unary_complex(args, dimensions, steps); +} +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + unary_complex(args, dimensions, steps); +}