@@ -32,7 +32,7 @@ extern "C" {
32
32
///
33
33
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
34
34
#[ inline]
35
- #[ target_feature( enable = "avx512vpclmulqdq,avx512f" ) ]
35
+ #[ target_feature( enable = "avx512vpclmulqdq,avx512f" ) ]
36
36
// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
37
37
#[ cfg_attr( test, assert_instr( vpclmul, imm8 = 0 ) ) ]
38
38
#[ rustc_args_required_const( 2 ) ]
@@ -57,6 +57,26 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
57
57
#[ target_feature( enable = "avx512vpclmulqdq,avx512vl" ) ]
58
58
#[ cfg_attr( test, assert_instr( vpclmul, imm8 = 0 ) ) ]
59
59
#[ rustc_args_required_const( 2 ) ]
60
+ pub unsafe fn _mm256_clmulepi64_epi128 ( a : __m256i , b : __m256i , imm8 : i32 ) -> __m256i {
61
+ macro_rules! call {
62
+ ( $imm8: expr) => {
63
+ pclmulqdq_256( a, b, $imm8)
64
+ } ;
65
+ }
66
+ constify_imm8 ! ( imm8, call)
67
+ }
68
+
69
+ #[ cfg( test) ]
70
+ mod tests {
71
+ // The constants in the tests below are just bit patterns. They should not
72
+ // be interpreted as integers; signedness does not make sense for them, but
73
+ // __mXXXi happens to be defined in terms of signed integers.
74
+ #![ allow( overflowing_literals) ]
75
+
76
+ use stdarch_test:: simd_test;
77
+
78
+ use crate :: core_arch:: x86:: * ;
79
+
60
80
macro_rules! verify_kat_pclmul {
61
81
( $broadcast: ident, $clmul: ident, $assert: ident) => {
62
82
// Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
@@ -72,14 +92,14 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
72
92
let r10 = $broadcast( r10) ;
73
93
let r11 = _mm_set_epi64x( 0x1d1e1f2c592e7c45 , 0xd66ee03e410fd4ed ) ;
74
94
let r11 = $broadcast( r11) ;
75
-
95
+
76
96
$assert( $clmul( a, b, 0x00 ) , r00) ;
77
97
$assert( $clmul( a, b, 0x10 ) , r01) ;
78
98
$assert( $clmul( a, b, 0x01 ) , r10) ;
79
99
$assert( $clmul( a, b, 0x11 ) , r11) ;
80
-
100
+
81
101
let a0 = _mm_set_epi64x( 0x0000000000000000 , 0x8000000000000000 ) ;
82
- let a0 = $broadcast( a0) ;
102
+ let a0 = $broadcast( a0) ;
83
103
let r = _mm_set_epi64x( 0x4000000000000000 , 0x0000000000000000 ) ;
84
104
let r = $broadcast( r) ;
85
105
$assert( $clmul( a0, a0, 0x00 ) , r) ;
@@ -88,94 +108,159 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
88
108
89
109
macro_rules! unroll {
90
110
( $target: ident[ 4 ] = $op: ident( $source: ident, 4 ) ; ) => {
91
- $target[ 3 ] = $op( $source, 3 ) ;
92
- $target[ 2 ] = $op( $source, 2 ) ;
93
- unroll!{ $target[ 2 ] = $op( $source, 2 ) ; }
111
+ $target[ 3 ] = $op( $source, 3 ) ;
112
+ $target[ 2 ] = $op( $source, 2 ) ;
113
+ unroll! { $target[ 2 ] = $op( $source, 2 ) ; }
94
114
} ;
95
115
( $target: ident[ 2 ] = $op: ident( $source: ident, 2 ) ; ) => {
96
- $target[ 1 ] = $op( $source, 1 ) ;
97
- $target[ 0 ] = $op( $source, 0 ) ;
116
+ $target[ 1 ] = $op( $source, 1 ) ;
117
+ $target[ 0 ] = $op( $source, 0 ) ;
98
118
} ;
99
119
( assert_eq_m128i( $op: ident( $vec_res: ident, 4 ) , $lin_res: ident[ 4 ] ) ; ) => {
100
- assert_eq_m128i( $op( $vec_res, 3 ) , $lin_res[ 3 ] ) ;
101
- assert_eq_m128i( $op( $vec_res, 2 ) , $lin_res[ 2 ] ) ;
102
- unroll!{ assert_eq_m128i( $op( $vec_res, 2 ) , $lin_res[ 2 ] ) ; }
120
+ assert_eq_m128i( $op( $vec_res, 3 ) , $lin_res[ 3 ] ) ;
121
+ assert_eq_m128i( $op( $vec_res, 2 ) , $lin_res[ 2 ] ) ;
122
+ unroll! { assert_eq_m128i( $op( $vec_res, 2 ) , $lin_res[ 2 ] ) ; }
103
123
} ;
104
124
( assert_eq_m128i( $op: ident( $vec_res: ident, 2 ) , $lin_res: ident[ 2 ] ) ; ) => {
105
- assert_eq_m128i( $op( $vec_res, 1 ) , $lin_res[ 1 ] ) ;
106
- assert_eq_m128i( $op( $vec_res, 0 ) , $lin_res[ 0 ] ) ;
107
- }
125
+ assert_eq_m128i( $op( $vec_res, 1 ) , $lin_res[ 1 ] ) ;
126
+ assert_eq_m128i( $op( $vec_res, 0 ) , $lin_res[ 0 ] ) ;
127
+ } ;
108
128
}
109
129
110
130
// this function tests one of the possible 4 instances
111
131
// with different inputs across lanes
112
132
#[ target_feature( enable = "avx512vpclmulqdq,avx512f" ) ]
113
- unsafe fn verify_512_helper ( linear : unsafe fn ( __m128i , __m128i ) ->__m128i , vectorized : unsafe fn ( __m512i , __m512i ) ->__m512i ) {
133
+ unsafe fn verify_512_helper (
134
+ linear : unsafe fn ( __m128i , __m128i ) -> __m128i ,
135
+ vectorized : unsafe fn ( __m512i , __m512i ) -> __m512i ,
136
+ ) {
114
137
let a = _mm512_set_epi64 (
115
- 0xDCB4DB3657BF0B7D , 0x18DB0601068EDD9F , 0xB76B908233200DC5 , 0xE478235FA8E22D5E ,
116
- 0xAB05CFFA2621154C , 0x1171B47A186174C9 , 0x8C6B6C0E7595CEC9 , 0xBE3E7D4934E961BD
138
+ 0xDCB4DB3657BF0B7D ,
139
+ 0x18DB0601068EDD9F ,
140
+ 0xB76B908233200DC5 ,
141
+ 0xE478235FA8E22D5E ,
142
+ 0xAB05CFFA2621154C ,
143
+ 0x1171B47A186174C9 ,
144
+ 0x8C6B6C0E7595CEC9 ,
145
+ 0xBE3E7D4934E961BD ,
117
146
) ;
118
147
let b = _mm512_set_epi64 (
119
- 0x672F6F105A94CEA7 , 0x8298B8FFCA5F829C , 0xA3927047B3FB61D8 , 0x978093862CDE7187 ,
120
- 0xB1927AB22F31D0EC , 0xA9A5DA619BE4D7AF , 0xCA2590F56884FDC6 , 0x19BE9F660038BDB5
148
+ 0x672F6F105A94CEA7 ,
149
+ 0x8298B8FFCA5F829C ,
150
+ 0xA3927047B3FB61D8 ,
151
+ 0x978093862CDE7187 ,
152
+ 0xB1927AB22F31D0EC ,
153
+ 0xA9A5DA619BE4D7AF ,
154
+ 0xCA2590F56884FDC6 ,
155
+ 0x19BE9F660038BDB5 ,
121
156
) ;
122
157
123
- let mut a_decomp = [ _mm_setzero_si128 ( ) ; 4 ] ;
158
+ let mut a_decomp = [ _mm_setzero_si128 ( ) ; 4 ] ;
124
159
unroll ! { a_decomp[ 4 ] = _mm512_extracti32x4_epi32( a, 4 ) ; }
125
- let mut b_decomp = [ _mm_setzero_si128 ( ) ; 4 ] ;
160
+ let mut b_decomp = [ _mm_setzero_si128 ( ) ; 4 ] ;
126
161
unroll ! { b_decomp[ 4 ] = _mm512_extracti32x4_epi32( b, 4 ) ; }
127
162
128
163
let r = vectorized ( a, b) ;
129
- let mut e_decomp = [ _mm_setzero_si128 ( ) ; 4 ] ;
164
+ let mut e_decomp = [ _mm_setzero_si128 ( ) ; 4 ] ;
130
165
for i in 0 ..4 {
131
- e_decomp[ i] = linear ( a_decomp[ i] , b_decomp[ i] ) ;
166
+ e_decomp[ i] = linear ( a_decomp[ i] , b_decomp[ i] ) ;
132
167
}
133
- unroll ! { assert_eq_m128i( _mm512_extracti32x4_epi32( r, 4 ) , e_decomp[ 4 ] ) ; }
168
+ unroll ! { assert_eq_m128i( _mm512_extracti32x4_epi32( r, 4 ) , e_decomp[ 4 ] ) ; }
134
169
}
135
170
136
171
// this function tests one of the possible 4 instances
137
172
// with different inputs across lanes for the VL version
138
173
#[ target_feature( enable = "avx512vpclmulqdq,avx512vl" ) ]
139
- unsafe fn verify_256_helper ( linear : unsafe fn ( __m128i , __m128i ) ->__m128i , vectorized : unsafe fn ( __m256i , __m256i ) ->__m256i ) {
174
+ unsafe fn verify_256_helper (
175
+ linear : unsafe fn ( __m128i , __m128i ) -> __m128i ,
176
+ vectorized : unsafe fn ( __m256i , __m256i ) -> __m256i ,
177
+ ) {
140
178
let a = _mm512_set_epi64 (
141
- 0xDCB4DB3657BF0B7D , 0x18DB0601068EDD9F , 0xB76B908233200DC5 , 0xE478235FA8E22D5E ,
142
- 0xAB05CFFA2621154C , 0x1171B47A186174C9 , 0x8C6B6C0E7595CEC9 , 0xBE3E7D4934E961BD
179
+ 0xDCB4DB3657BF0B7D ,
180
+ 0x18DB0601068EDD9F ,
181
+ 0xB76B908233200DC5 ,
182
+ 0xE478235FA8E22D5E ,
183
+ 0xAB05CFFA2621154C ,
184
+ 0x1171B47A186174C9 ,
185
+ 0x8C6B6C0E7595CEC9 ,
186
+ 0xBE3E7D4934E961BD ,
143
187
) ;
144
188
let b = _mm512_set_epi64 (
145
- 0x672F6F105A94CEA7 , 0x8298B8FFCA5F829C , 0xA3927047B3FB61D8 , 0x978093862CDE7187 ,
146
- 0xB1927AB22F31D0EC , 0xA9A5DA619BE4D7AF , 0xCA2590F56884FDC6 , 0x19BE9F660038BDB5
189
+ 0x672F6F105A94CEA7 ,
190
+ 0x8298B8FFCA5F829C ,
191
+ 0xA3927047B3FB61D8 ,
192
+ 0x978093862CDE7187 ,
193
+ 0xB1927AB22F31D0EC ,
194
+ 0xA9A5DA619BE4D7AF ,
195
+ 0xCA2590F56884FDC6 ,
196
+ 0x19BE9F660038BDB5 ,
147
197
) ;
148
198
149
- let mut a_decomp = [ _mm_setzero_si128 ( ) ; 2 ] ;
199
+ let mut a_decomp = [ _mm_setzero_si128 ( ) ; 2 ] ;
150
200
unroll ! { a_decomp[ 2 ] = _mm512_extracti32x4_epi32( a, 2 ) ; }
151
- let mut b_decomp = [ _mm_setzero_si128 ( ) ; 2 ] ;
201
+ let mut b_decomp = [ _mm_setzero_si128 ( ) ; 2 ] ;
152
202
unroll ! { b_decomp[ 2 ] = _mm512_extracti32x4_epi32( b, 2 ) ; }
153
203
154
- let r = vectorized ( _mm512_extracti64x4_epi64 ( a, 0 ) , _mm512_extracti64x4_epi64 ( b, 0 ) ) ;
155
- let mut e_decomp = [ _mm_setzero_si128 ( ) ; 2 ] ;
204
+ let r = vectorized (
205
+ _mm512_extracti64x4_epi64 ( a, 0 ) ,
206
+ _mm512_extracti64x4_epi64 ( b, 0 ) ,
207
+ ) ;
208
+ let mut e_decomp = [ _mm_setzero_si128 ( ) ; 2 ] ;
156
209
for i in 0 ..2 {
157
- e_decomp[ i] = linear ( a_decomp[ i] , b_decomp[ i] ) ;
210
+ e_decomp[ i] = linear ( a_decomp[ i] , b_decomp[ i] ) ;
158
211
}
159
- unroll ! { assert_eq_m128i( _mm256_extracti128_si256( r, 2 ) , e_decomp[ 2 ] ) ; }
212
+ unroll ! { assert_eq_m128i( _mm256_extracti128_si256( r, 2 ) , e_decomp[ 2 ] ) ; }
160
213
}
161
214
162
215
#[ simd_test( enable = "avx512vpclmulqdq,avx512f" ) ]
163
216
unsafe fn test_mm512_clmulepi64_epi128 ( ) {
164
- verify_kat_pclmul ! ( _mm512_broadcast_i32x4, _mm512_clmulepi64_epi128, assert_eq_m512i) ;
217
+ verify_kat_pclmul ! (
218
+ _mm512_broadcast_i32x4,
219
+ _mm512_clmulepi64_epi128,
220
+ assert_eq_m512i
221
+ ) ;
165
222
166
- verify_512_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x00 ) , |a, b|_mm512_clmulepi64_epi128 ( a, b, 0x00 ) ) ;
167
- verify_512_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x01 ) , |a, b|_mm512_clmulepi64_epi128 ( a, b, 0x01 ) ) ;
168
- verify_512_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x10 ) , |a, b|_mm512_clmulepi64_epi128 ( a, b, 0x10 ) ) ;
169
- verify_512_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x11 ) , |a, b|_mm512_clmulepi64_epi128 ( a, b, 0x11 ) ) ;
223
+ verify_512_helper (
224
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x00 ) ,
225
+ |a, b| _mm512_clmulepi64_epi128 ( a, b, 0x00 ) ,
226
+ ) ;
227
+ verify_512_helper (
228
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x01 ) ,
229
+ |a, b| _mm512_clmulepi64_epi128 ( a, b, 0x01 ) ,
230
+ ) ;
231
+ verify_512_helper (
232
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x10 ) ,
233
+ |a, b| _mm512_clmulepi64_epi128 ( a, b, 0x10 ) ,
234
+ ) ;
235
+ verify_512_helper (
236
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x11 ) ,
237
+ |a, b| _mm512_clmulepi64_epi128 ( a, b, 0x11 ) ,
238
+ ) ;
170
239
}
171
240
172
241
#[ simd_test( enable = "avx512vpclmulqdq,avx512vl" ) ]
173
242
unsafe fn test_mm256_clmulepi64_epi128 ( ) {
174
- verify_kat_pclmul ! ( _mm256_broadcastsi128_si256, _mm256_clmulepi64_epi128, assert_eq_m256i) ;
243
+ verify_kat_pclmul ! (
244
+ _mm256_broadcastsi128_si256,
245
+ _mm256_clmulepi64_epi128,
246
+ assert_eq_m256i
247
+ ) ;
175
248
176
- verify_256_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x00 ) , |a, b|_mm256_clmulepi64_epi128 ( a, b, 0x00 ) ) ;
177
- verify_256_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x01 ) , |a, b|_mm256_clmulepi64_epi128 ( a, b, 0x01 ) ) ;
178
- verify_256_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x10 ) , |a, b|_mm256_clmulepi64_epi128 ( a, b, 0x10 ) ) ;
179
- verify_256_helper ( |a, b|_mm_clmulepi64_si128 ( a, b, 0x11 ) , |a, b|_mm256_clmulepi64_epi128 ( a, b, 0x11 ) ) ;
249
+ verify_256_helper (
250
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x00 ) ,
251
+ |a, b| _mm256_clmulepi64_epi128 ( a, b, 0x00 ) ,
252
+ ) ;
253
+ verify_256_helper (
254
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x01 ) ,
255
+ |a, b| _mm256_clmulepi64_epi128 ( a, b, 0x01 ) ,
256
+ ) ;
257
+ verify_256_helper (
258
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x10 ) ,
259
+ |a, b| _mm256_clmulepi64_epi128 ( a, b, 0x10 ) ,
260
+ ) ;
261
+ verify_256_helper (
262
+ |a, b| _mm_clmulepi64_si128 ( a, b, 0x11 ) ,
263
+ |a, b| _mm256_clmulepi64_epi128 ( a, b, 0x11 ) ,
264
+ ) ;
180
265
}
181
266
}
0 commit comments