Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/3.4' into merge-3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
alalek committed Nov 22, 2019
2 parents a4d16ac + 373160c commit ad0ab41
Show file tree
Hide file tree
Showing 19 changed files with 669 additions and 41 deletions.
123 changes: 123 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,50 @@ inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
return _mm256_packus_epi32(am, bm);
}

template<int i>
inline int _v256_extract_epi8(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi8(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
return _mm_extract_epi8(b, i & 15); // SSE4.1
#endif
}

template<int i>
inline int _v256_extract_epi16(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi16(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
return _mm_extract_epi16(b, i & 7); // SSE2
#endif
}

template<int i>
inline int _v256_extract_epi32(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi32(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
return _mm_extract_epi32(b, i & 3); // SSE4.1
#endif
}

/** Returns the i-th 64-bit element of a 256-bit integer register.
 *
 * The native 256-bit extract intrinsic is used when the preprocessor check
 * below says it is available; otherwise the 128-bit half holding the element
 * is pulled out first and the element is read from that half.
 *
 * @tparam i compile-time element index
 * @param a source register
 */
template<int i>
inline int64 _v256_extract_epi64(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
    return _mm256_extract_epi64(a, i);
#else
    // 2 qwords per 128-bit half: upper index bit selects the half,
    // the lowest bit selects the qword inside it.
    enum { half_idx = (i >> 1), lane_idx = (i & 1) };
    const __m128i half = _mm256_extractf128_si256(a, half_idx);
    return _mm_extract_epi64(half, lane_idx);  // SSE4.1
#endif
}

///////// Types ////////////

struct v_uint8x32
Expand Down Expand Up @@ -2195,6 +2239,85 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)

/** Returns lane i of an unsigned 8-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline uchar v_extract_n(v_uint8x32 a)
{
    // The helper hands the byte back as an int; narrow it to the lane type.
    const int lane = _v256_extract_epi8<i>(a.val);
    return static_cast<uchar>(lane);
}

/** Returns lane i of a signed 8-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline schar v_extract_n(v_int8x32 a)
{
    // Reuse the unsigned overload, then convert the result to the signed lane type.
    const uchar raw = v_extract_n<i>(v_reinterpret_as_u8(a));
    return static_cast<schar>(raw);
}

/** Returns lane i of an unsigned 16-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline ushort v_extract_n(v_uint16x16 a)
{
    // The helper hands the word back as an int; narrow it to the lane type.
    const int lane = _v256_extract_epi16<i>(a.val);
    return static_cast<ushort>(lane);
}

/** Returns lane i of a signed 16-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline short v_extract_n(v_int16x16 a)
{
    // Reuse the unsigned overload, then convert the result to the signed lane type.
    const ushort raw = v_extract_n<i>(v_reinterpret_as_u16(a));
    return static_cast<short>(raw);
}

/** Returns lane i of an unsigned 32-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline uint v_extract_n(v_uint32x8 a)
{
    // The helper returns a signed int; convert it to the unsigned lane type.
    const int lane = _v256_extract_epi32<i>(a.val);
    return static_cast<uint>(lane);
}

/** Returns lane i of a signed 32-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline int v_extract_n(v_int32x8 a)
{
    // Reuse the unsigned overload, then convert the result to the signed lane type.
    const uint raw = v_extract_n<i>(v_reinterpret_as_u32(a));
    return static_cast<int>(raw);
}

/** Returns lane i of an unsigned 64-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline uint64 v_extract_n(v_uint64x4 a)
{
    // The helper returns a signed int64; convert it to the unsigned lane type.
    const int64 lane = _v256_extract_epi64<i>(a.val);
    return static_cast<uint64>(lane);
}

/** Returns lane i of a signed 64-bit vector.
 * @tparam i compile-time lane index
 */
template<int i>
inline int64 v_extract_n(v_int64x4 v)
{
    // Reuse the unsigned overload, then convert the result to the signed lane type.
    const uint64 raw = v_extract_n<i>(v_reinterpret_as_u64(v));
    return static_cast<int64>(raw);
}

/** Returns lane i of a single-precision float vector.
 *
 * The lane is fetched through the unsigned 32-bit overload and its bit
 * pattern is then read back as a float through a union.
 *
 * @tparam i compile-time lane index
 */
template<int i>
inline float v_extract_n(v_float32x8 v)
{
    union { uint bits; float value; } pun;
    pun.bits = v_extract_n<i>(v_reinterpret_as_u32(v));
    return pun.value;
}

/** Returns lane i of a double-precision float vector.
 *
 * The lane is fetched through the unsigned 64-bit overload and its bit
 * pattern is then read back as a double through a union.
 *
 * @tparam i compile-time lane index
 */
template<int i>
inline double v_extract_n(v_float64x4 v)
{
    union { uint64 bits; double value; } pun;
    pun.bits = v_extract_n<i>(v_reinterpret_as_u64(v));
    return pun.value;
}

/** Broadcasts lane i of an unsigned 32-bit vector to every lane of the result.
 *
 * @tparam i compile-time lane index
 * @param a source vector
 * @return vector whose lanes all hold a[i]
 */
template<int i>
inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
{
    // Deliberately a plain local rather than a function-local `static`:
    // the initializer is not a constant expression, so a static would be
    // initialised lazily (typically a guard check on each call) and would
    // add one static object per instantiation. The index is a template
    // constant, so building the permutation vector in place is cheap.
    const __m256i perm = _mm256_set1_epi32(i);
    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}

/** Broadcasts lane i of a signed 32-bit vector to every lane of the result.
 *
 * Delegates to the unsigned 32-bit overload through reinterpret casts.
 *
 * @tparam i compile-time lane index
 */
template<int i>
inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
{
    const v_uint32x8 bits = v_reinterpret_as_u32(a);
    return v_reinterpret_as_s32(v_broadcast_element<i>(bits));
}

/** Broadcasts lane i of a single-precision float vector to every lane of the result.
 *
 * Delegates to the unsigned 32-bit overload through reinterpret casts.
 *
 * @tparam i compile-time lane index
 */
template<int i>
inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{
    const v_uint32x8 bits = v_reinterpret_as_u32(a);
    return v_reinterpret_as_f32(v_broadcast_element<i>(bits));
}


///////////////////// load deinterleave /////////////////////////////

Expand Down
29 changes: 29 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2228,6 +2228,35 @@ OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)

// Generates v_extract_n<i>() for one 512-bit vector type: the vector is
// rotated right by i lanes and the first lane of the rotated vector is returned.
//   _Tpvec - vector type, _Tp - scalar type returned to the caller
#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
template<int i> \
inline _Tp v_extract_n(_Tpvec v) \
{ \
    const _Tpvec rotated = v_rotate_right<i>(v); \
    return rotated.get0(); \
}

OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)

/** Broadcasts lane i of an unsigned 32-bit vector to every lane of the result.
 *
 * @tparam i compile-time lane index
 * @param a source vector
 * @return vector whose lanes all hold a[i]
 */
template<int i>
inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
{
    // Deliberately a plain local rather than a function-local `static`:
    // the initializer is not a constant expression, so a static would be
    // initialised lazily (typically a guard check on each call) and would
    // add one static object per instantiation. The index is a template
    // constant, so building the permutation vector in place is cheap.
    const __m512i perm = _mm512_set1_epi32(i);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

/** Broadcasts lane i of a signed 32-bit vector to every lane of the result.
 *
 * Delegates to the unsigned 32-bit overload through reinterpret casts.
 *
 * @tparam i compile-time lane index
 */
template<int i>
inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
{
    const v_uint32x16 bits = v_reinterpret_as_u32(a);
    return v_reinterpret_as_s32(v_broadcast_element<i>(bits));
}

/** Broadcasts lane i of a single-precision float vector to every lane of the result.
 *
 * Delegates to the unsigned 32-bit overload through reinterpret casts.
 *
 * @tparam i compile-time lane index
 */
template<int i>
inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
{
    const v_uint32x16 bits = v_reinterpret_as_u32(a);
    return v_reinterpret_as_f32(v_broadcast_element<i>(bits));
}


///////////////////// load deinterleave /////////////////////////////

Expand Down
41 changes: 41 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_cpp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ Regular integers:
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
|reverse | x | x | x | x | x | x |
|extract_n | x | x | x | x | x | x |
|broadcast_element | | | | | x | x |
Big integers:
Expand All @@ -230,6 +232,7 @@ Big integers:
|extract | x | x |
|rotate (lanes) | x | x |
|cvt_flt64 | | x |
|extract_n | x | x |
Floating point:
Expand All @@ -254,6 +257,8 @@ Floating point:
|extract | x | x |
|rotate (lanes) | x | x |
|reverse | x | x |
|extract_n | x | x |
|broadcast_element | x | |
@{ */

Expand Down Expand Up @@ -1784,6 +1789,42 @@ inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
return r;
}

/** @brief Extract a single element of a vector
Scheme:
@code
{ v[0] v[1] ... v[nlanes-1] } => v[s]
@endcode
Restriction: 0 <= s < nlanes
Usage:
@code
v_int32x4 a;
int r;
r = v_extract_n<2>(a);
@endcode
For all types. */
template<int s, typename _Tp, int n>
inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
{
    CV_DbgAssert(s >= 0 && s < n);
    const _Tp lane = v.s[s];
    return lane;
}

/** @brief Fill a whole vector with its i-th element
Scheme:
@code
{ v[0] v[1] v[2] ... v[SZ-1] } => { v[i], v[i], v[i] ... v[i] }
@endcode
Restriction: 0 <= i < nlanes
Supported types: 32-bit integers and floats (s32/u32/f32)
*/
template<int i, typename _Tp, int n>
inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
{
    CV_DbgAssert(i >= 0 && i < n);
    const _Tp lane = a.s[i];
    return v_reg<_Tp, n>::all(lane);
}

/** @brief Round
Rounds each value. Input type is float vector ==> output type is int vector.*/
Expand Down
22 changes: 22 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_msa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1783,6 +1783,28 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
}

/** Returns lane i of a vector: the vector is rotated right by i lanes and
 * the first lane of the rotated vector is returned.
 * @tparam i compile-time lane index
 * @tparam _Tp vector type (must expose lane_type)
 */
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
    const _Tp rotated = v_rotate_right<i>(a);
    return rotated.get0();
}

/** Broadcasts lane i of an unsigned 32-bit vector: the lane is extracted
 * as a scalar and a new vector is filled with it.
 * @tparam i compile-time lane index
 */
template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
    const v_uint32x4::lane_type lane = v_extract_n<i>(a);
    return v_setall_u32(lane);
}
/** Broadcasts lane i of a signed 32-bit vector: the lane is extracted
 * as a scalar and a new vector is filled with it.
 * @tparam i compile-time lane index
 */
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
    const v_int32x4::lane_type lane = v_extract_n<i>(a);
    return v_setall_s32(lane);
}
/** Broadcasts lane i of a single-precision float vector: the lane is
 * extracted as a scalar and a new vector is filled with it.
 * @tparam i compile-time lane index
 */
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
    const v_float32x4::lane_type lane = v_extract_n<i>(a);
    return v_setall_f32(lane);
}

////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
Expand Down
32 changes: 32 additions & 0 deletions modules/core/include/opencv2/core/hal/intrin_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1651,6 +1651,38 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif

// Generates v_extract_n<i>() for one 128-bit vector type by reading lane i
// with the matching vgetq_lane_<suffix> intrinsic.
//   _Tpvec - vector type, _Tp - scalar lane type, suffix - NEON type suffix
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> \
inline _Tp v_extract_n(_Tpvec v) \
{ \
    const _Tp lane = vgetq_lane_##suffix(v.val, i); \
    return lane; \
}

OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
#endif

// Generates v_broadcast_element<i>() for one 128-bit vector type: lane i is
// extracted as a scalar and a new vector is filled with it via v_setall_<suffix>.
//   _Tpvec - vector type, _Tp - scalar lane type, suffix - v_setall_ suffix
#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> \
inline _Tpvec v_broadcast_element(_Tpvec v) \
{ \
    return v_setall_##suffix(static_cast<_Tp>(v_extract_n<i>(v))); \
}

OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif

#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
Expand Down
Loading

0 comments on commit ad0ab41

Please sign in to comment.