forked from hxim/paq8px
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimd.hpp
120 lines (115 loc) · 3.07 KB
/
simd.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#ifndef PAQ8PX_SIMD_HPP
#define PAQ8PX_SIMD_HPP
///////////////////////// SIMD Vectorization detection //////////////////////////////////
#if (!defined(__ARM_FEATURE_SIMD32) && !defined(__ARM_NEON))
// Uncomment one or more of the following includes if you plan adding more SIMD dispatching
//#include <mmintrin.h> //MMX
//#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
//#include <pmmintrin.h> //SSE3
//#include <tmmintrin.h> //SSSE3
//#include <smmintrin.h> //SSE4.1
//#include <nmmintrin.h> //SSE4.2
//#include <ammintrin.h> //SSE4A
#include <immintrin.h> //AVX, AVX2
//#include <zmmintrin.h> //AVX512
#endif
//define CPUID
#if defined(__GNUC__) || defined(__clang__)
#if !defined(__ARM_FEATURE_SIMD32) && !defined(__ARM_NEON)
#include <cpuid.h>
#define cpuid(info, x) __cpuid_count(x, 0, (info)[0], (info)[1], (info)[2], (info)[3])
#endif
#elif defined(_MSC_VER)
#include <immintrin.h>
#define cpuid(info, x) __cpuidex(info, x, 0)
#else
#error Unknown compiler
#endif
// Define interface to xgetbv instruction
static inline auto xgetbv(unsigned long ctr) -> unsigned long long {
#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
return _xgetbv(ctr);
#elif defined(__GNUC__)
uint32_t a = 0;
uint32_t d;
__asm("xgetbv"
: "=a"(a), "=d"(d)
: "c"(ctr)
:);
return a | ((static_cast<uint64_t>(d)) << 32U);
#else
#error Unknown compiler
#endif
}
/* Returns system's highest supported SIMD instruction set as
0: none
1: MMX
2: SSE
3: SSE2
4: SSE3
5: SSSE3
6: SSE4.1
7: SSE4.2
: SSE4A //SSE4A is not supported on Intel, so we will exclude it
8: AVX
9: AVX2
: AVX512 //TODO
11: NEON
*/
static auto simdDetect() -> int {
#if defined(__ARM_FEATURE_SIMD32) || defined(__ARM_NEON)
return 11;
#else
int cpuidResult[4] = {0, 0, 0, 0};
cpuid(cpuidResult, 0); // call cpuid function 0 ("Get vendor ID and highest basic calling parameter")
if( cpuidResult[0] == 0 ) {
return 0; //cpuid is not supported
}
cpuid(cpuidResult, 1); // call cpuid function 1 ("Processor Info and Feature Bits")
if((cpuidResult[3] & (1U << 23U)) == 0 ) {
return 0; //no MMX
}
if((cpuidResult[3] & (1U << 25U)) == 0 ) {
return 1; //no SSE
}
//SSE: OK
if((cpuidResult[3] & (1U << 26U)) == 0 ) {
return 2; //no SSE2
}
//SSE2: OK
if((cpuidResult[2] & (1U << 0U)) == 0 ) {
return 3; //no SSE3
}
//SSE3: OK
if((cpuidResult[2] & (1U << 9U)) == 0 ) {
return 4; //no SSSE3
}
//SSSE3: OK
if((cpuidResult[2] & (1U << 19U)) == 0 ) {
return 5; //no SSE4.1
}
//SSE4.1: OK
if((cpuidResult[2] & (1U << 20U)) == 0 ) {
return 6; //no SSE4.2
}
//SSE4.2: OK
if((cpuidResult[2] & (1U << 27U)) == 0 ) {
return 7; //no OSXSAVE (no XGETBV)
}
if((xgetbv(0) & 6) != 6 ) {
return 7; //AVX is not enabled in OS
}
if((cpuidResult[2] & (1U << 28U)) == 0 ) {
return 7; //no AVX
}
//AVX: OK
cpuid(cpuidResult, 7); // call cpuid function 7 ("Extended Feature Bits")
if((cpuidResult[1] & (1U << 5U)) == 0 ) {
return 8; //no AVX2
}
//AVX2: OK
return 9;
#endif
}
#endif //PAQ8PX_SIMD_HPP